diff --git a/Cargo.lock b/Cargo.lock index 5e85e384869..c1251832983 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2706,9 +2706,9 @@ dependencies = [ [[package]] name = "opentelemetry" -version = "0.23.0" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b69a91d4893e713e06f724597ad630f1fa76057a5e1026c0ca67054a9032a76" +checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96" dependencies = [ "futures-core", "futures-sink", @@ -2819,19 +2819,17 @@ dependencies = [ [[package]] name = "opentelemetry_sdk" -version = "0.23.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae312d58eaa90a82d2e627fd86e075cf5230b3f11794e2ed74199ebbe572d4fd" +checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df" dependencies = [ "async-trait", "futures-channel", "futures-executor", "futures-util", "glob", - "lazy_static", "once_cell", - "opentelemetry 0.23.0", - "ordered-float 4.3.0", + "opentelemetry 0.24.0", "percent-encoding", "rand", "thiserror", @@ -4185,16 +4183,17 @@ dependencies = [ "cmake", "cxx", "cxx-build", + "hashbrown 0.14.5", + "hf-hub", "log", - "parking_lot", "pkg-config", "text-generation-router", "thiserror", - "tokenizers 0.19.1", + "tokenizers", "tokio", "tokio-stream", "tracing", - "tracing-opentelemetry 0.24.0", + "tracing-opentelemetry 0.25.0", "tracing-subscriber", ] @@ -4212,7 +4211,7 @@ dependencies = [ "tabled", "text-generation-client", "thiserror", - "tokenizers 0.20.0", + "tokenizers", "tokio", "tracing", "tracing-subscriber", @@ -4292,7 +4291,7 @@ dependencies = [ "serde_json", "sysinfo", "thiserror", - "tokenizers 0.20.0", + "tokenizers", "tokio", "tokio-stream", "tower-http", @@ -4341,7 +4340,7 @@ dependencies = [ "slotmap", "text-generation-router", "thiserror", - "tokenizers 0.20.0", + "tokenizers", "tokio", "tokio-stream", "tonic 0.10.2", @@ -4392,7 +4391,7 @@ dependencies = [ "slotmap", "text-generation-router", "thiserror", - "tokenizers 0.20.0", + "tokenizers", "tokio", "tokio-stream", "tonic 0.10.2", @@ -4514,39 +4513,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tokenizers" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd" -dependencies = [ - "aho-corasick", - "derive_builder", - "esaxx-rs", - "getrandom", - "hf-hub", - "indicatif", - "itertools 0.12.1", - "lazy_static", - "log", - "macro_rules_attribute", - "monostate", - "onig", - "paste", - "rand", - "rayon", - "rayon-cond", - "regex", - "regex-syntax 0.8.5", - "serde", - "serde_json", - "spm_precompiled", - "thiserror", - "unicode-normalization-alignments", - "unicode-segmentation", - "unicode_categories", -] - [[package]] name = "tokenizers" version = "0.20.0" @@ -4933,14 +4899,14 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.24.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f68803492bf28ab40aeccaecc7021096bd256baf7ca77c3d425d89b35a7be4e4" +checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b" dependencies = [ "js-sys", "once_cell", - "opentelemetry 0.23.0", - "opentelemetry_sdk 0.23.0", + "opentelemetry 0.24.0", + "opentelemetry_sdk 0.24.1", "smallvec", "tracing", "tracing-core", diff --git a/Dockerfile.trtllm 
b/Dockerfile.trtllm deleted file mode 100644 index 4543ae804ee..00000000000 --- a/Dockerfile.trtllm +++ /dev/null @@ -1,23 +0,0 @@ -# All the tooling for CUDA -FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS cuda-builder - -WORKDIR /usr/src/tgi/backends/trtllm -RUN apt update && apt install -y cmake git git-lfs gcc g++ ninja-build libopenmpi-dev python3-dev python3-pip wget - -COPY . /usr/src/tgi -RUN chmod +x scripts/install_tensorrt.sh && scripts/install_tensorrt.sh -RUN cmake -G Ninja -B build -DTRT_LIB_DIR=/usr/local/tensorrt/lib -DTRT_INCLUDE_DIR=/usr/local/tensorrt/include . -RUN cmake --build build --parallel -t tgi_trtllm_backend_impl - -# All the tooling for Rust -FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef -WORKDIR /usr/src - -# Include CUDA related libraries and tools to the Rust based image -COPY --from=cuda-builder /usr/local/cuda /usr/local/cuda -COPY --from=cuda-builder /usr/local/tensorrt /usr/local/tensorrt -COPY --from=cuda-builder /usr/src/tgi/backends/trtllm/build /usr/local/tgi/trtllm/build -ENV PATH=/usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:$LD_LIBRARY_PATH - -RUN apt update && apt install -y cmake git gcc g++ ninja-build libopenmpi3 diff --git a/Dockerfile_amd b/Dockerfile_amd index 4bb6407a31b..b84d4edd802 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -327,7 +327,8 @@ ENV ROCM_USE_CUSTOM_PAGED_ATTN=1 ENV PYTORCH_TUNABLEOP_TUNING_AFTER_WARMUP=0 ENV VLLM_MOE_PADDING=0 ENV ATTENTION=paged -ENV USE_PREFIX_CACHING=0 +ENV PREFIX_CACHING=0 +ENV PREFILL_CHUNKING=0 ENV ROCM_USE_SKINNY_GEMM=1 COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh diff --git a/Dockerfile_intel b/Dockerfile_intel index 9b5dd20a79b..96f242489ab 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -218,7 +218,8 @@ COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/lo FROM ${PLATFORM} AS final ENV ATTENTION=paged -ENV USE_PREFIX_CACHING=0 +ENV PREFIX_CACHING=0 +ENV PREFILL_CHUNKING=0 ENV CUDA_GRAPHS=0 ENTRYPOINT ["text-generation-launcher"] CMD ["--json-output"] diff --git a/backends/trtllm/Dockerfile b/Dockerfile_trtllm similarity index 85% rename from backends/trtllm/Dockerfile rename to Dockerfile_trtllm index 5fd2f89f25f..3ccb0310bea 100644 --- a/backends/trtllm/Dockerfile +++ b/Dockerfile_trtllm @@ -10,7 +10,7 @@ COPY . . 
RUN cargo chef prepare --recipe-path recipe.json # CUDA dependent dependencies resolver stage -FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 AS cuda-builder +FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ @@ -26,6 +26,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ ninja-build \ pkg-config \ python3 \ + python3-dev \ python3-setuptools \ tar \ wget @@ -42,7 +43,7 @@ RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILE mkdir /usr/src/mpi && \ tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \ cd /usr/src/mpi && \ - ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda && \ + ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \ make -j all && \ make install && \ rm -rf "/opt/src/$OMPI_TARBALL_FILENAME" @@ -82,10 +83,16 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$ cd backends/trtllm && \ CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release -FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime +FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime +RUN apt update && apt install -y python3-minimal python3-dev python3-pip && \ + rm -rf /var/lib/{apt,dpkg,cache,log}/ && \ + python3 -m pip install transformers tokenizers + WORKDIR /usr/local/tgi/bin -ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" +ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" +ENV TOKENIZERS_PARALLELISM=false +ENV OMPI_MCA_plm_rsh_agent="" COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt diff --git a/README.md b/README.md index 25dbbd437c6..fb475b097dd 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ curl 127.0.0.1:8080/generate_stream \ You can also use [TGI's Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) to obtain Open AI Chat Completion API compatible responses. 
```bash -curl localhost:3000/v1/chat/completions \ +curl localhost:8080/v1/chat/completions \ -X POST \ -d '{ "model": "tgi", diff --git a/backends/client/src/v3/client.rs b/backends/client/src/v3/client.rs index 479d31bf290..d43f789e7ca 100644 --- a/backends/client/src/v3/client.rs +++ b/backends/client/src/v3/client.rs @@ -158,7 +158,8 @@ impl Client { // Blocks and slots will be set on the server side if we use paged attention blocks: vec![], slots: vec![], - prefix_len: 0, + cache_len: 0, + chunk_len: None, // Set sampling parameters to also take these ops into account in the max memory parameters: Some(NextTokenChooserParameters { temperature: 0.9, @@ -217,8 +218,13 @@ impl Client { pub async fn prefill( &mut self, batch: Batch, + cached_batch: Option, ) -> Result<(Vec, Option, PrefillTimings)> { - let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context(); + let request = tonic::Request::new(PrefillRequest { + batch: Some(batch), + cached_batch, + }) + .inject_context(); let response = self.stub.prefill(request).await?.into_inner(); Ok(( response.generations, diff --git a/backends/client/src/v3/sharded_client.rs b/backends/client/src/v3/sharded_client.rs index 645c076a26b..854a5895eba 100644 --- a/backends/client/src/v3/sharded_client.rs +++ b/backends/client/src/v3/sharded_client.rs @@ -134,11 +134,12 @@ impl ShardedClient { pub async fn prefill( &mut self, batch: Batch, + cached_batch: Option, ) -> Result<(Vec, Option, PrefillTimings)> { let futures: Vec<_> = self .clients .iter_mut() - .map(|client| Box::pin(client.prefill(batch.clone()))) + .map(|client| Box::pin(client.prefill(batch.clone(), cached_batch.clone()))) .collect(); #[allow(clippy::type_complexity)] let results: Result, Option, PrefillTimings)>> = @@ -245,7 +246,8 @@ impl Health for ShardedClient { // Block 0 is reserved for health checks blocks: vec![0], slots: (0..16).collect(), - prefix_len: 0, + cache_len: 0, + chunk_len: None, adapter_id: None, }; let batch = Batch { @@ -255,7 +257,7 @@ impl Health for ShardedClient { max_tokens: 2, max_blocks: 1, }; - self.clone().prefill(batch).await?; + self.clone().prefill(batch, None).await?; Ok(()) } } diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 425b2d7b9a5..831372cdf99 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -1,5 +1,17 @@ cmake_minimum_required(VERSION 3.20) +if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug") + find_program(CCACHE_EXECUTABLE "ccache") + if (CCACHE_EXECUTABLE) + message(STATUS "Using ccache") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE) + endif () +endif () + +if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") + cmake_policy(SET CMP0135 NEW) +endif () + project(tgi-trtllm-backend VERSION 1.0.0) set(CMAKE_CXX_STANDARD 20) @@ -14,7 +26,7 @@ set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located") # We are using nvidia-ml to query at runtime device information to enable some architecture-specific features -find_package(CUDAToolkit 12.5 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml) +find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml) #### External dependencies #### include(cmake/fmt.cmake) diff --git a/backends/trtllm/Cargo.toml b/backends/trtllm/Cargo.toml index 43a114ba4b0..97ef1a76891 100644 --- 
a/backends/trtllm/Cargo.toml +++ b/backends/trtllm/Cargo.toml @@ -10,16 +10,17 @@ async-trait = "0.1" async-stream = "0.3" clap = { version = "4.5", features = ["derive"] } cxx = "1.0" +hashbrown = "0.14" +hf-hub = { workspace = true } log = { version = "0.4", features = [] } text-generation-router = { path = "../../router" } -tokenizers = { version = "0.19", features = ["hf-hub"] } -tokio = { version = "1.38", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] } +tokenizers = { workspace = true } +tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] } tokio-stream = "0.1.15" -thiserror = "1.0.62" +thiserror = "1.0.63" tracing = "0.1" -tracing-opentelemetry = "0.24" +tracing-opentelemetry = "0.25" tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] } -parking_lot = "0.12" [build-dependencies] cmake = "0.1" diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index 08638262438..985019260b8 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -6,7 +6,7 @@ use std::path::{absolute, PathBuf}; const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST"); -const CUDA_REQUIRED_VERSION: &str = "12.5"; +const CUDA_REQUIRED_VERSION: &str = "12.6"; const MPI_REQUIRED_VERSION: &str = "4.1"; const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX"); const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR"); @@ -36,7 +36,7 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf // Build the backend implementation through CMake let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi"); let tensorrt_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt"); - let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("90-real"); // Hopper by default + let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("75-real;80-real;86-real;89-real;90-real"); let mut install_path = PathBuf::from(install_path); if !install_path.is_absolute() { @@ -81,7 +81,12 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf (PathBuf::from(install_path), deps_folder) } -fn build_ffi_layer(deps_folder: &PathBuf) { +fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) { + let ndebug = match is_debug { + true => "1", + false => "0", + }; + CFG.include_prefix = "backends/trtllm"; cxx_build::bridge("src/lib.rs") .static_flag(true) @@ -93,9 +98,14 @@ fn build_ffi_layer(deps_folder: &PathBuf) { .include("/usr/local/tensorrt/include") .file("src/ffi.cpp") .std("c++20") + .define("NDEBUG", ndebug) .compile("tgi_trtllm_backend"); println!("cargo:rerun-if-changed=CMakeLists.txt"); + println!("cargo:rerun-if-changed=cmake/trtllm.cmake"); + println!("cargo:rerun-if-changed=cmake/json.cmake"); + println!("cargo:rerun-if-changed=cmake/fmt.cmake"); + println!("cargo:rerun-if-changed=cmake/spdlog.cmake"); println!("cargo:rerun-if-changed=include/backend.h"); println!("cargo:rerun-if-changed=lib/backend.cpp"); println!("cargo:rerun-if-changed=include/ffi.h"); @@ -115,7 +125,7 @@ fn main() { let (_backend_path, deps_folder) = build_backend(is_debug, opt_level, &out_dir); // Build the FFI layer calling the backend above - build_ffi_layer(&deps_folder); + build_ffi_layer(&deps_folder, is_debug); // Emit linkage search path probe!("ompi", MPI_REQUIRED_VERSION); diff --git a/backends/trtllm/cmake/fmt.cmake b/backends/trtllm/cmake/fmt.cmake index f94a9c5668f..afd6ea5f090 100644 --- 
a/backends/trtllm/cmake/fmt.cmake +++ b/backends/trtllm/cmake/fmt.cmake @@ -1,6 +1,6 @@ FetchContent_Declare( fmt - GIT_REPOSITORY https://github.com/fmtlib/fmt - GIT_TAG 11.0.1 + DOWNLOAD_EXTRACT_TIMESTAMP + URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz ) FetchContent_MakeAvailable(fmt) diff --git a/backends/trtllm/cmake/json.cmake b/backends/trtllm/cmake/json.cmake index 29e5753b341..67eff2fe606 100644 --- a/backends/trtllm/cmake/json.cmake +++ b/backends/trtllm/cmake/json.cmake @@ -1,5 +1,6 @@ fetchcontent_declare( json + DOWNLOAD_EXTRACT_TIMESTAMP URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz ) fetchcontent_makeavailable(json) diff --git a/backends/trtllm/cmake/spdlog.cmake b/backends/trtllm/cmake/spdlog.cmake index c4ee5c97a58..7f529a7d29e 100644 --- a/backends/trtllm/cmake/spdlog.cmake +++ b/backends/trtllm/cmake/spdlog.cmake @@ -11,7 +11,7 @@ endif () fetchcontent_declare( spdlog - GIT_REPOSITORY https://github.com/gabime/spdlog.git - GIT_TAG v1.14.1 + DOWNLOAD_EXTRACT_TIMESTAMP + URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz ) fetchcontent_makeavailable(spdlog) diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake index e59ad4cf3a6..5f1b6c19c01 100644 --- a/backends/trtllm/cmake/trtllm.cmake +++ b/backends/trtllm/cmake/trtllm.cmake @@ -23,8 +23,9 @@ endif () fetchcontent_declare( trtllm GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git - GIT_TAG a681853d3803ee5893307e812530b5e7004bb6e1 + GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc GIT_SHALLOW FALSE + DOWNLOAD_EXTRACT_TIMESTAMP ) fetchcontent_makeavailable(trtllm) diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h index 7990e76b90d..d23f6288964 100644 --- a/backends/trtllm/include/backend.h +++ b/backends/trtllm/include/backend.h @@ -5,6 +5,7 @@ #ifndef TGI_TRTLLM_BACKEND_H #define TGI_TRTLLM_BACKEND_H +#include #include #include #include @@ -19,16 +20,33 @@ using json = nlohmann::json; namespace tle = tensorrt_llm::executor; + +#define CAST_SIZETYPE(x) static_cast(x) + namespace huggingface::tgi::backends { using RequestId = tle::IdType; using TokenId = tle::TokenIdType; + const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false); + constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING( + "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})"); + constexpr auto FMT_EXECUTOR_STATS = FMT_STRING( + "Submitting inference [{}] to the executor ({:d} already in-flight)"); + constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING( + "Sampling: topK={:d}, topP={:.1f}, temperature={:.1f}, repetition_penalty={:.1f}, frequency_penalty={:.1f}, seed={:d}"); + /** * Initialize all the components required by TRTLLM. 
* It is required to call this function before attempting to load any engine */ void InitializeBackend(); + /** + * Initialize logging mechanism + */ + void InitializeLogging(); + + /** * * @param config TensorRT-LLM configuration object @@ -37,6 +55,14 @@ namespace huggingface::tgi::backends { */ tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath); + /** + * + * @param worldSize + * @param workerPath + * @return + */ + tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept; + /** * Get the sampling configuration from the parameters provided by TGI * @param topK @@ -54,7 +80,15 @@ namespace huggingface::tgi::backends { float_t repetition_penalty, float_t frequency_penalty, uint64_t seed - ); + ) noexcept; + + /** + * Attempt to retrieve the + * @param generationConfigPath + * @return + */ + std::optional>> + GetStopWordsFromConfig(const std::filesystem::path &generationConfigPath) noexcept; /** * @@ -64,18 +98,16 @@ namespace huggingface::tgi::backends { const json config; tle::Executor executor; + /** Frequently accessed variables cached here **/ + uint32_t maxNumTokens; + std::list> stopWords; + public: explicit TensorRtLlmBackend( const std::filesystem::path &engineFolder, const std::filesystem::path &executorWorker ); - /** - * Indicate if the backend is ready to accept incoming request - * @return true if ready, false otherwise - */ - [[nodiscard]] bool IsReady() const; - /** * Query the executor for the number of token available for pulling * @return @@ -88,32 +120,23 @@ namespace huggingface::tgi::backends { * @param topK * @param topP * @param temperature - * @param repetition_penalty - * @param frequency_penalty + * @param repetitionPenalty + * @param frequencyPenalty * @param seed * @return Request id related to this generation for reference */ [[nodiscard]] RequestId Submit( const std::vector &tokens, + uint32_t maxNewTokens, int32_t topK, float_t topP, float_t temperature, - float_t repetition_penalty, - float_t frequency_penalty, + float_t repetitionPenalty, + float_t frequencyPenalty, uint64_t seed ); - /** - * - * @param requestId The request id to poll the generation results - * @return - */ - std::vector Poll(RequestId requestId); - - /** - * Stop the underlying executor - */ - void Shutdown(); + [[nodiscard]] std::vector PullNewTokens(); }; } diff --git a/backends/trtllm/include/ffi.h b/backends/trtllm/include/ffi.h index fe0be9fc820..449bcd4d739 100644 --- a/backends/trtllm/include/ffi.h +++ b/backends/trtllm/include/ffi.h @@ -5,19 +5,30 @@ #ifndef TGI_TRTLLM_BACKEND_FFI_H #define TGI_TRTLLM_BACKEND_FFI_H +#include #include +#include #include "backend.h" namespace huggingface::tgi::backends { class TensorRtLlmBackendImpl; } -#include "backends/trtllm/src/lib.rs.h" +// Template to support returning error from TllmException back to Rust in a Result<> +#include +namespace rust::behavior { + template + static void trycatch(Try &&func, Fail &&fail) noexcept try { + func(); + } catch (tensorrt_llm::common::TllmException &e) { + fail(e.what()); + } +} -namespace huggingface::tgi::backends { +#include "backends/trtllm/src/lib.rs.h" -// struct GenerationContext; +namespace huggingface::tgi::backends { class TensorRtLlmBackendImpl : public TensorRtLlmBackend { public: @@ -28,15 +39,10 @@ namespace huggingface::tgi::backends { */ TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker); - /*** - * - * @return - */ - bool IsReady() const; - /*** * * @param tokens + * 
@param maxNewTokens * @param topK * @param topP * @param temperature @@ -47,21 +53,15 @@ namespace huggingface::tgi::backends { */ [[nodiscard("returned request id should be used to refer to the request's generation result later on")]] uint64_t - Submit(rust::Slice tokens, int32_t topK, float_t topP, float_t temperature, + Submit(rust::Slice tokens, uint32_t maxNewTokens, + int32_t topK, float_t topP, float_t temperature, float_t repetition_penalty, float_t frequency_penalty, uint64_t seed); /*** * - * @param requestId - * @param ctx - * @param callback * @return */ - size_t StreamTokens( - const RequestId requestId, - huggingface::tgi::backends::GenerationContext *ctx, - rust::Fn callback); + std::unique_ptr> PullTokens(); }; /*** diff --git a/backends/trtllm/include/hardware.h b/backends/trtllm/include/hardware.h index da0bf4f3c0d..9633495f4fd 100644 --- a/backends/trtllm/include/hardware.h +++ b/backends/trtllm/include/hardware.h @@ -14,7 +14,7 @@ namespace huggingface::hardware::cuda { #define AMPERE_SM_MAJOR 8 -#define HOPPER_SM_MAJOR 8 +#define HOPPER_SM_MAJOR 9 /** * Store information about the version of the CUDA Compute Capabilities detected on the device @@ -23,9 +23,9 @@ namespace huggingface::hardware::cuda { int32_t major; int32_t minor; - [[nodiscard]] constexpr bool isPostAmpere() const { return major >= AMPERE_SM_MAJOR; } + [[nodiscard]] constexpr bool IsPostAmpere() const { return major >= AMPERE_SM_MAJOR; } - [[nodiscard]] constexpr bool isPostHopper() const { return major >= HOPPER_SM_MAJOR; } + [[nodiscard]] constexpr bool IsPostHopper() const { return major >= HOPPER_SM_MAJOR; } }; CudaComputeCapabilities GetCudaComputeCapabilities() { diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp index c066a6d6eab..4dd41de0072 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -7,11 +8,33 @@ #include "backend.h" #include "hardware.h" + +void huggingface::tgi::backends::InitializeLogging() { +#ifdef NDEBUG + if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) { + std::string log_level(TRTLLM_LOG_LEVEL_CSTR); + std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { + return std::tolower(c); + }); + + if (log_level == "debug") + spdlog::set_level(spdlog::level::debug); + else + spdlog::set_level(spdlog::level::info); + } +#else + spdlog::set_level(spdlog::level::debug); +#endif +} + void huggingface::tgi::backends::InitializeBackend() { SPDLOG_INFO("Initializing Backend..."); nvmlInit_v2(); initTrtLlmPlugins(); + InitializeLogging(); + + SPDLOG_INFO("Backend Executor Version: {}", tle::version()); const auto numGpus = huggingface::hardware::cuda::GetNumDevices(); if (numGpus.has_value()) { SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value()); @@ -20,47 +43,49 @@ void huggingface::tgi::backends::InitializeBackend() { } } +[[nodiscard]] +tle::ParallelConfig +huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept { + auto mode = tle::CommunicationMode::kLEADER; + std::optional orchestratorConfig = std::nullopt; + + if (worldSize > 1) { + SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode"); + mode = tle::CommunicationMode::kORCHESTRATOR; + orchestratorConfig = std::make_optional(true, workerPath, nullptr, true); + } else { + SPDLOG_INFO("Detected single engine deployment, using leader mode"); + } + + return 
tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig); +} + [[nodiscard]] tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) { - tle::ExecutorConfig execConfig(1); + tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1); // Retrieve the compute capabilities to enable some options at runtime const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities(); // Single engine (TP = PP = 1) -> using leader mode (no MPI involved) - if (config["/pretrained_config/mapping/world_size"_json_pointer].get() == 1) { - SPDLOG_INFO("Detected single engine deployment, using leader mode"); - execConfig.setParallelConfig(tle::ParallelConfig( - tle::CommunicationType::kMPI, - tle::CommunicationMode::kLEADER, - std::nullopt, - std::nullopt, - std::nullopt - )); - } else { // Multiple engines -> using orchestrator mode (MPI involved) - SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode"); - execConfig.setParallelConfig(tle::ParallelConfig( - tle::CommunicationType::kMPI, - tle::CommunicationMode::kORCHESTRATOR, - std::nullopt, - std::nullopt, - tle::OrchestratorConfig(true, workerPath, nullptr, true) - )); - } + const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get(); + execConfig.setParallelConfig(GetParallelConfig(worldSize, workerPath)); // Define some configuration variables execConfig.setKvCacheConfig(tle::KvCacheConfig(true)); - execConfig.setEnableChunkedContext(computeCapabilities.isPostAmpere()); + execConfig.setEnableChunkedContext(computeCapabilities.IsPostAmpere()); + execConfig.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION)); return execConfig; } tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig( - uint32_t topK, - float_t topP, - float_t temperature, - float_t repetition_penalty, - float_t frequency_penalty, - uint64_t seed) { + const uint32_t topK, + const float_t topP, + const float_t temperature, + const float_t repetition_penalty, + const float_t frequency_penalty, + const uint64_t seed) noexcept { + return tle::SamplingConfig( 1, // TGI only use a single beam topK, @@ -78,69 +103,101 @@ tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig( ); } +std::optional>> +huggingface::tgi::backends::GetStopWordsFromConfig( + const std::filesystem::path &generationConfigPath) noexcept { + if (exists(generationConfigPath)) { + const auto generationConfig = json::parse(std::ifstream(generationConfigPath)); + if (const auto eosTokenIds = generationConfig["/eos_token_id"_json_pointer]; eosTokenIds.is_array()) { + SPDLOG_INFO(FMT_STRING("Found {:d} EOS tokens"), eosTokenIds.size()); + std::list> stopWords(eosTokenIds.size()); + + const auto to_single_token = [](const auto tokenIdObj) -> decltype(stopWords)::value_type { + return {tokenIdObj.template get()}; + }; + + std::transform(eosTokenIds.cbegin(), eosTokenIds.cend(), stopWords.begin(), to_single_token); + return stopWords; + } else { + SPDLOG_INFO("Invalid EOS tokens entry found (not an array)"); + } + } else { + SPDLOG_INFO("No EOS tokens found, generation_config.json doesn't exist"); + } + + return std::nullopt; +} + huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend( const std::filesystem::path &enginesFolder, const std::filesystem::path &executorWorker ) : config(json::parse(std::ifstream(enginesFolder / "config.json"))), - executor( - enginesFolder, - 
tensorrt_llm::executor::ModelType::kDECODER_ONLY, - GetExecutorConfig(config, executorWorker.string() - )) { - SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref()); -} + executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, + GetExecutorConfig(config, executorWorker.string())) { + + SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get()); + + // Ensure we have enough GPUs on the system + const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get(); + const auto numGpus = huggingface::hardware::cuda::GetNumDevices().value_or(0); + if (numGpus < worldSize) { + SPDLOG_CRITICAL(FMT_NOT_ENOUGH_GPUS, numGpus, worldSize); + // todo : raise exception to catch on rust side + } + + // Cache variables + maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get(); -bool huggingface::tgi::backends::TensorRtLlmBackend::IsReady() const { - return executor.canEnqueueRequests(); + // Attempt to discover stopWords from the generation_config.json + const auto generationConfigPath = enginesFolder / "generation_config.json"; + stopWords = GetStopWordsFromConfig(generationConfigPath).value_or(std::list>()); } [[nodiscard("Returned number of requests needs to be consumed")]] size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const { +#ifdef NDEBUG return executor.getNumResponsesReady(); +#else + const auto numResponses = executor.getNumResponsesReady(); + if (numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses); + return numResponses; +#endif } [[nodiscard("Returned request id needs to be provided back to gather generated tokens")]] tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( const std::vector &tokens, + const uint32_t maxNewTokens, const int32_t topK, const float_t topP, const float_t temperature, - const float_t repetition_penalty, - const float_t frequency_penalty, + const float_t repetitionPenalty, + const float_t frequencyPenalty, const uint64_t seed ) { -#ifdef NDEBUG - SPDLOG_DEBUG( - FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"), - tokens.size(), - executor.getLatestIterationStats().back().numActiveRequests - ); -#else - SPDLOG_DEBUG( - FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"), - fmt::join(tokens, ", "), - executor.getLatestIterationStats().front().numActiveRequests - ); + const auto maxNewTokensChecked = std::min(maxNewTokens, static_cast(maxNumTokens - tokens.size())); +#ifndef NDEBUG + { + const auto &iterations = executor.getLatestIterationStats(); + const auto &lastIteration = iterations.front(); + + SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests); + SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed); + SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked); + } #endif - const auto maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get(); - const auto maxNewTokens = static_cast(std::max(1ul, maxNumTokens - tokens.size())); + const auto sampling = GetSamplingConfig(topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed); - const auto sampling = GetSamplingConfig(topK, topP, temperature, repetition_penalty, frequency_penalty, seed); - const auto output = tle::OutputConfig(true, false, false, true, false); - return executor.enqueueRequest( - tle::Request{tokens, 
maxNewTokens, true, sampling, output}); -} + // Build the request + auto request = tle::Request{tokens, CAST_SIZETYPE(maxNewTokensChecked), true, sampling, OUTPUT_CONFIG}; + request.setStopWords(stopWords); -[[nodiscard("Generated tokens result must be used")]] -std::vector huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) { - SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId); - return executor.awaitResponses(requestId); + // Submit to the executor for batching + return executor.enqueueRequest(request); } - -void huggingface::tgi::backends::TensorRtLlmBackend::Shutdown() { - SPDLOG_INFO("Shutting down executor"); - executor.shutdown(); +std::vector huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() { + return executor.awaitResponses(); } diff --git a/backends/trtllm/scripts/install_tensorrt.sh b/backends/trtllm/scripts/install_tensorrt.sh index e0e2dd17b45..4c2dc26b6bf 100755 --- a/backends/trtllm/scripts/install_tensorrt.sh +++ b/backends/trtllm/scripts/install_tensorrt.sh @@ -2,12 +2,13 @@ set -ex -TRT_VER="10.2.0.19" -CUDA_VER="12.5" -CUDNN_VER="9.2.1.18-1" -NCCL_VER="2.22.3-1+cuda12.5" -CUBLAS_VER="12.5.3.2-1" -NVRTC_VER="12.5.82-1" +TRT_VER_BASE="10.4.0" +TRT_VER_FULL="${TRT_VER_BASE}.26" +CUDA_VER="12.6" +CUDNN_VER="9.5.0.50-1" +NCCL_VER="2.22.3-1+cuda12.6" +CUBLAS_VER="12.6.3.3-1" +NVRTC_VER="12.6.77-1" for i in "$@"; do case $i in @@ -32,8 +33,9 @@ install_ubuntu_requirements() { ARCH=$(uname -m) if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi if [ "$ARCH" = "aarch64" ];then ARCH="sbsa";fi - curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-keyring_1.0-1_all.deb - dpkg -i cuda-keyring_1.0-1_all.deb + curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH}/cuda-keyring_1.1-1_all.deb + dpkg -i cuda-keyring_1.1-1_all.deb + rm /etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list apt-get update if [[ $(apt list --installed | grep libcudnn9) ]]; then @@ -71,7 +73,7 @@ install_centos_requirements() { install_tensorrt() { #PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))') #PARSED_PY_VERSION=$(echo "${PY_VERSION//./}") - TRT_CUDA_VERSION="12.5" + TRT_CUDA_VERSION="12.6" if [ -z "$RELEASE_URL_TRT" ];then ARCH=${TRT_TARGETARCH} @@ -79,12 +81,12 @@ install_tensorrt() { if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi - if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04" && OS="ubuntu-22.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi - RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz + if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-24.04" && OS="ubuntu-24.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi + RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/${TRT_VER_BASE}/tars/TensorRT-${TRT_VER_FULL}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz fi wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar tar -xf /tmp/TensorRT.tar -C /usr/local/ - mv /usr/local/TensorRT-${TRT_VER} /usr/local/tensorrt + mv /usr/local/TensorRT-${TRT_VER_FULL} /usr/local/tensorrt # pip3 install /usr/local/tensorrt/python/tensorrt-*-cp${PARSED_PY_VERSION}-*.whl rm -rf /tmp/TensorRT.tar } diff --git 
a/backends/trtllm/src/backend.rs b/backends/trtllm/src/backend.rs deleted file mode 100644 index b23aa6c01fe..00000000000 --- a/backends/trtllm/src/backend.rs +++ /dev/null @@ -1,330 +0,0 @@ -use std::future::Future; -use std::path::Path; -use std::pin::{pin, Pin}; -use std::str::FromStr; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, OnceLock}; -use std::task::{Context, Poll}; -use std::time::Duration; - -use async_trait::async_trait; -use cxx::UniquePtr; -use log::{error, warn}; -use tokenizers::Tokenizer; -use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; -use tokio::time::{sleep, Instant}; -use tokio_stream::wrappers::UnboundedReceiverStream; -use tokio_stream::{Stream, StreamExt}; -use tracing::{instrument, span, Level}; - -// use tokio::sync::RwLock; -use parking_lot::RwLock; -use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; -use text_generation_router::validation::ValidationError::UnsupportedModality; -use text_generation_router::validation::{Chunk, ValidGenerateRequest, ValidationError}; -use text_generation_router::{FinishReason, Token}; - -use crate::errors::TensorRtLlmBackendError; -use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl}; - -// Value used to poll the state of the generation stream -static POLLING_INTERVAL_US: OnceLock = OnceLock::new(); - -type InferResult = Result; - -pub(crate) struct Generation { - executor: Arc>>, - done: Arc, -} - -/// Holds the user provided input to be executed along with a channel allowing -/// to bubble up all the generated tokens for that tokens the to end stream. -pub struct GenerationContext { - sender: UnboundedSender>, - tokenizer: Arc, - tokens: Vec, - done: Arc, - queued: Instant, - start: Option, -} - -impl Stream for Generation { - type Item = usize; - - fn poll_next(self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { - let interval = POLLING_INTERVAL_US.get_or_init(|| { - u64::from_str(option_env!("TRTLLM_BACKEND_POLLING_INTERVAL_US").unwrap_or("100")) - .expect("Invalid value provided for envvar POLLING_INTERVAL_US") - }); - - if !self.done.load(Ordering::Relaxed) { - let backend = pin!(self.executor.read()); - let status = match backend.poll(ctx) { - Poll::Ready(executor_r) => { - let ready = executor_r.num_responses_ready(); - if ready == 0 { - Poll::Pending - } else { - Poll::Ready(Some(ready)) - } - } - Poll::Pending => Poll::Pending, - }; - - let waker = ctx.waker().clone(); - tokio::spawn(async { - sleep(Duration::from_micros(*interval)).await; - waker.wake(); - }); - - status - } else { - Poll::Ready(None) // end of stream - } - } - - fn size_hint(&self) -> (usize, Option) { - (1, None) - } -} - -unsafe impl Send for TensorRtLlmBackendImpl {} -unsafe impl Sync for TensorRtLlmBackendImpl {} - -/// Implements the logic to execute generation with TensorRT-LLM executor API in background -pub struct TensorRtLlmBackend { - tokenizer: Arc, - - // Backing the backend behind a RwLock to allow concurrent read access to retrieve - // the number of available tokens (read only) in the Generation stream - backend: Arc>>, -} - -impl TensorRtLlmBackend { - pub fn new + Send + 'static, PP: AsRef + Send + 'static>( - tokenizer: Tokenizer, - engine_folder: P, - executor_worker_path: PP, - ) -> Result { - Ok(TensorRtLlmBackend { - tokenizer: Arc::new(tokenizer), - backend: Arc::new(RwLock::new(create_tensorrt_llm_backend( - engine_folder.as_ref().to_str().unwrap(), - executor_worker_path.as_ref().to_str().unwrap(), - ))), - }) 
- } - - fn validate(request: &ValidGenerateRequest) -> InferResult<&String> { - if request.top_n_tokens > 1 { - return Err(InferError::ValidationError( - ValidationError::TopNTokensDisabled, - )); - } - - // TODO: Is it really needed? How can it be validated before? - if request.parameters.grammar.is_some() { - return Err(InferError::ValidationError(ValidationError::Grammar)); - } - - match request.inputs.len() { - 0 => Err(InferError::ValidationError(ValidationError::EmptyInput)), - 2.. => Err(InferError::GenerationError( - "TensorRT-LLM backend don't support multi-chunk".into(), - )), - 1 => match request.inputs.first().expect("Single item-chunk") { - Chunk::Text(text) => Ok(text), - Chunk::Image(_) => Err(InferError::ValidationError(UnsupportedModality("image"))), - }, - } - } - - fn generate( - &self, - sender: UnboundedSender>, - tokens: Vec, - top_k: u32, - top_p: f32, - temperature: f32, - repetition_penalty: f32, - frequency_penalty: f32, - seed: u64, - ) { - let tokenizer = Arc::clone(&self.tokenizer); - let executor = Arc::clone(&self.backend); - - // Let's push this in async context - tokio::spawn(async move { - // Define the generation state - let mut generation = Generation { - executor: executor.clone(), - done: Arc::new(AtomicBool::new(false)), - }; - - // Define the context over the generation - // TODO(asap): Do we really need so many shared-ownership? - let ctx = Box::new(GenerationContext { - sender: sender.clone(), - tokenizer, - tokens: vec![], - done: Arc::clone(&generation.done), - start: None, - queued: Instant::now(), - }); - - // We are leaking the context on-purpose to avoid the box being dropped while there are - // still computation ongoing - // TODO(asap): Can we achieve the same with an Arc> without the need to go unsafe? 
- let ctx_ = Box::leak(ctx); - - // Submit the request to the batcher - let request_id = span!(Level::DEBUG, "submit") - .in_scope(|| async { - let mut handle = executor.write().await; - let request_id = handle.pin_mut().submit( - &tokens, - top_k as i32, - top_p, - temperature, - repetition_penalty, - frequency_penalty, - seed, - ); - - request_id - }) - .await; - - while let Some(_) = generation.next().await { - let mut executor_w = executor.write().await; - let executor = executor_w.pin_mut(); - - span!(Level::DEBUG, "decode") - .in_scope(|| async { - unsafe { - executor.stream_tokens( - request_id, - ctx_, - |ctx: *mut GenerationContext, step: GenerationStep| { - let inner_ctx = &mut *ctx; - - // Update the timestamp at which the request started effectively - // Can be a bit off, would need to be before the callback, let's see - inner_ctx.start.get_or_insert(Instant::now()); - inner_ctx.done.store(step.is_final, Ordering::Relaxed); - - // Ensure we are not running into errors - let parcel = if !step.has_error { - // Insert the latest generated token to the tracker - inner_ctx.tokens.push(step.token_id); - - // Decode the token - let text = inner_ctx - .tokenizer - .decode(&[step.token_id], true) - .expect("Failed to decode token"); - - let special = inner_ctx - .tokenizer - .get_added_vocabulary() - .is_special_token(&text); - - // Create the structure holding the token - let token = Token { - id: step.token_id, - text, - logprob: step.log_prob, - special, - }; - - if step.is_final { - let generated_text = inner_ctx - .tokenizer - .decode(&inner_ctx.tokens, true) - .expect("Failed to decode generated_tokens"); - - Ok(InferStreamResponse::End { - token, - top_tokens: vec![], - generated_text: GeneratedText { - text: generated_text, - generated_tokens: inner_ctx.tokens.len() as u32, - finish_reason: FinishReason::EndOfSequenceToken, - seed: None, - }, - start: inner_ctx.start.unwrap_or(Instant::now()), - queued: inner_ctx.queued, - }) - } else { - Ok(InferStreamResponse::Intermediate { - token, - top_tokens: vec![], - }) - } - } else { - error!("Error caught while decoding: {}", &step.error_msg); - Err(InferError::GenerationError(step.error_msg)) - }; - - // Send the parcel to the client - inner_ctx - .sender - .send(parcel) - .expect("Failed to sent msg through the channel"); - }, - ); - } - }) - .await; - } - - // "Properly" free the shared context... 
- // TODO: clean that piece of sh** asap - unsafe { - let _ = Box::from_raw(ctx_); - } - }); - } -} - -#[async_trait] -impl Backend for TensorRtLlmBackend { - #[instrument(skip_all)] - fn schedule( - &self, - request: ValidGenerateRequest, - ) -> InferResult>> { - // Let's add a few more validation - let input = TensorRtLlmBackend::validate(&request)?; - - // Channel to stream the generated token as they come from the worker thread back to the transport layer - let (sender, receiver) = unbounded_channel(); - - // Unpack parameters - let params = &request.parameters; - - // Preprocess the inputs to send to TRTLLM backend - let encoding = self - .tokenizer - .encode(input.as_str(), true) - .map_err(|e| InferError::GenerationError(e.to_string()))?; - - // Generate the response - self.generate( - sender, - Vec::from(encoding.get_ids()), - params.top_k, - params.top_p, - params.temperature, - params.repetition_penalty, - params.frequency_penalty, - params.seed, - ); - - Ok(UnboundedReceiverStream::new(receiver)) - } - - async fn health(&self, _current_health: bool) -> bool { - true - } -} diff --git a/backends/trtllm/src/errors.rs b/backends/trtllm/src/errors.rs index a672d2a406b..812fd6e30d8 100644 --- a/backends/trtllm/src/errors.rs +++ b/backends/trtllm/src/errors.rs @@ -1,9 +1,16 @@ +use std::path::PathBuf; use thiserror::Error; use text_generation_router::server; #[derive(Debug, Error)] pub enum TensorRtLlmBackendError { + #[error("Provided engine folder {0} doesn't exist")] + EngineFolderDoesntExists(PathBuf), + #[error("Provided executorWorker binary path {0} doesn't exist")] + ExecutorWorkerNotFound(PathBuf), + #[error("TensorRT-LLM Runtime error: {0}")] + Runtime(String), #[error("Tokenizer error: {0}")] Tokenizer(String), #[error("Argument validation error: {0}")] diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index d6317a68c89..0a92c050f65 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -3,11 +3,13 @@ // #pragma once -#include +#include #include #include +#include #include #include +#include #include #include @@ -20,61 +22,64 @@ huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl( ) : TensorRtLlmBackend(engineFolder, executorWorker) {} -bool huggingface::tgi::backends::TensorRtLlmBackendImpl::IsReady() const { - return TensorRtLlmBackend::IsReady(); -} - uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit( - rust::Slice tokens, int32_t topK, float_t topP, float_t temperature, float_t repetition_penalty, - float_t frequency_penalty, uint64_t seed) { + rust::Slice tokens, + uint32_t maxNewTokens, + int32_t topK, + float_t topP, + float_t temperature, + float_t repetition_penalty, + float_t frequency_penalty, + uint64_t seed) { // This will copy all the items from the initial slice - std::vector tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end())); + std::vector tokens_(tokens.begin(), tokens.end()); return TensorRtLlmBackend::Submit( - std::move(tokens_), topK, topP, temperature, repetition_penalty, frequency_penalty, seed); + std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed); } -size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens( - const uint64_t requestId, - huggingface::tgi::backends::GenerationContext *ctx, - rust::Fn callback) { - - size_t numTokens = 0; - for (const auto &item: Poll(requestId)) { - GenerationStep step; - if (!item.hasError()) { - SPDLOG_DEBUG("\tStreamTokens -> 
Decoding token..."); - const auto decoded = item.getResult(); - - const auto token = decoded.outputTokenIds[0][0]; - const auto isFinal = decoded.isFinal; - const auto logProb = decoded.logProbs.value()[0][0]; - - ++numTokens; - - SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal); - step = huggingface::tgi::backends::GenerationStep{ - static_cast(token), logProb, isFinal, false, std::move(std::string()) +std::unique_ptr> +huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() { + const auto responses = TensorRtLlmBackend::PullNewTokens(); + + auto steps = std::make_unique>(); + steps->reserve(responses.size()); + +#ifndef NDEBUG + SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size()); +#endif + + // Transform tle::Response to GenerationStep + std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) { + const auto reqId = r.getRequestId(); + if (!r.hasError()) { + const auto result = r.getResult(); + return GenerationStep{ + reqId, + static_cast(result.outputTokenIds[0][0]), + result.logProbs.value()[0][0], + result.isFinal, + false, + std::string() }; - SPDLOG_DEBUG("\tStreamTokens -> Post callback"); } else { - // TODO : Return rest::Result with error - const auto what = item.getErrorMsg(); - SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", what); - step = huggingface::tgi::backends::GenerationStep{ - std::numeric_limits::max(), 0.0, true, true, std::move(what) + return GenerationStep{ + reqId, + 0, + 0.0, + true, + true, + std::move(r.getErrorMsg()) }; } + }); - callback(std::move(ctx), std::move(step)); - } - - return numTokens; + return steps; } std::unique_ptr huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) { + SPDLOG_INFO("Creating TensorRT-LLM Backend"); // Unconditionally call this to initialize and discover TRTLLM plugins InitializeBackend(); diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index 1a804f88973..edd8caff154 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -1,14 +1,16 @@ -pub use backend::{GenerationContext, TensorRtLlmBackend}; +pub use looper::TensorRtLlmBackendV2; -mod backend; pub mod errors; +mod looper; +mod utils; #[cxx::bridge(namespace = "huggingface::tgi::backends")] mod ffi { - /// Struct used as shared type between rust and C++ to represent the result /// of a single decoding iteration + #[derive(Debug, Clone)] pub struct GenerationStep { + request_id: u64, token_id: u32, log_prob: f32, is_final: bool, @@ -16,10 +18,6 @@ mod ffi { error_msg: String, } - extern "Rust" { - type GenerationContext; - } - unsafe extern "C++" { include!("backends/trtllm/src/ffi.cpp"); @@ -44,10 +42,7 @@ mod ffi { fn CreateTensorRtLlmBackend( engine_folder: &str, executor_worker: &str, - ) -> UniquePtr; - - // #[rust_name = "is_ready"] - // fn IsReady(self: &TensorRtLlmBackendImpl) -> bool; + ) -> Result>; #[rust_name = "num_responses_ready"] fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize; @@ -56,23 +51,18 @@ mod ffi { fn Submit( self: Pin<&mut TensorRtLlmBackendImpl>, tokens: &[u32], + max_new_tokens: u32, top_k: i32, top_p: f32, temperature: f32, repetition_penalty: f32, frequency_penalty: f32, seed: u64, - ) -> u64; + ) -> Result; - #[rust_name = "stream_tokens"] - unsafe fn StreamTokens( + #[rust_name = "pull_tokens"] + fn PullTokens( self: Pin<&mut TensorRtLlmBackendImpl>, - request_id: u64, - ctx: *mut 
GenerationContext,
-            cb: unsafe fn(*mut GenerationContext, GenerationStep),
-        ) -> usize;
-
-        // #[rust_name = "shutdown"]
-        // fn Shutdown(self: Pin<&mut TensorRtLlmBackendImpl>);
+        ) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
     }
 }
diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs
new file mode 100644
index 00000000000..e26155c163c
--- /dev/null
+++ b/backends/trtllm/src/looper.rs
@@ -0,0 +1,382 @@
+use std::hint;
+use std::ops::Deref;
+use std::path::Path;
+
+use async_trait::async_trait;
+use cxx::UniquePtr;
+use hashbrown::HashMap;
+use tokenizers::Tokenizer;
+use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
+use tokio::sync::TryAcquireError;
+use tokio::task::{spawn_blocking, JoinHandle};
+use tokio::time::Instant;
+use tokio_stream::wrappers::UnboundedReceiverStream;
+use tracing::{debug, error, warn};
+
+use text_generation_router::infer::InferError::{GenerationError, ValidationError};
+use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
+use text_generation_router::validation::ValidationError::{
+    EmptyInput, Grammar, TopNTokensDisabled, UnsupportedModality,
+};
+use text_generation_router::validation::{Chunk, ValidGenerateRequest};
+use text_generation_router::{FinishReason, Token};
+
+use crate::errors::TensorRtLlmBackendError;
+use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
+use crate::utils::first_line;
+
+type InferResult<T> = Result<T, InferError>;
+
+/// Wraps the request along with the channel used to stream the decoded tokens back to the client
+struct GenerationContext {
+    request: ValidGenerateRequest,
+    start: Option<Instant>,
+    queued: Instant,
+    streamer: UnboundedSender<InferResult<InferStreamResponse>>,
+}
+
+#[derive(Debug, Copy, Clone)]
+struct DecodedToken {
+    id: u32,
+    log_prob: f32,
+    is_final: bool,
+}
+
+impl<'step> TryFrom<&'step GenerationStep> for DecodedToken {
+    type Error = InferError;
+
+    fn try_from(step: &'step GenerationStep) -> Result<Self, Self::Error> {
+        if !step.has_error {
+            Ok(Self {
+                id: step.token_id,
+                log_prob: step.log_prob,
+                is_final: step.is_final,
+            })
+        } else {
+            Err(GenerationError(step.error_msg.clone()))
+        }
+    }
+}
+
+/// Wraps the decoded token with the channel used to stream the decoded tokens back to the client
+struct DecodedTokenContext {
+    token: DecodedToken,
+    start: Option<Instant>,
+    queued: Instant,
+    channel: UnboundedSender<InferResult<InferStreamResponse>>,
+}
+
+fn executor_status_looper(
+    mut backend: UniquePtr<TensorRtLlmBackendImpl>,
+    max_inflight_requests: usize,
+    mut waiting_requests: UnboundedReceiver<GenerationContext>,
+    post_processor_sender: UnboundedSender<(u64, InferResult<DecodedTokenContext>)>,
+) {
+    // Track the tuple (request_id, stream) for each request
+    let mut in_flights =
+        HashMap::<u64, GenerationContext>::with_capacity(max_inflight_requests * 2);
+
+    // TODO: Does it need a spin-loop?
+    'scheduler: loop {
+        // Is there any request pending to be scheduled?
+        let awaiting_requests = waiting_requests.len();
+        for _ in 0..awaiting_requests {
+            // Retrieve all the requests
+            if let Some(mut ctx) = waiting_requests.blocking_recv() {
+                // Submit all the requests to the executor and move the context to the in-flight tracker
+                let request = &ctx.request;
+                let generation_params = &request.parameters;
+                let stopping_params = &request.stopping_parameters;
+                let input_ids = request.input_ids.as_deref();
+
+                // Submit to the TensorRT-LLM executor for scheduling
+                match backend.pin_mut().submit(
+                    &input_ids.unwrap(), // This is checked beforehand in validate()
+                    stopping_params.max_new_tokens,
+                    generation_params.top_k as i32,
+                    generation_params.top_p,
+                    generation_params.temperature,
+                    generation_params.repetition_penalty,
+                    generation_params.frequency_penalty,
+                    generation_params.seed,
+                ) {
+                    Ok(request_id) => {
+                        // Insert the context linked to the generated request id in the tracker
+                        debug!("[in-flight] Added {}", request_id);
+                        ctx.start = Some(Instant::now());
+                        in_flights.insert(request_id, ctx);
+                    }
+                    Err(e) => {
+                        // Return to the caller
+                        let what = e.to_string();
+                        error!(error = what.as_str(), "Failed to schedule request");
+
+                        let err = Err(InferError::Overloaded(TryAcquireError::NoPermits));
+                        if let Err(_) = ctx.streamer.send(err) {
+                            error!("Failed to send back error to the client");
+                        }
+                    }
+                };
+            }
+        }
+
+        if backend.num_responses_ready() > 0 {
+            match backend.pin_mut().pull_tokens() {
+                Ok(responses) => {
+                    // Iterate through all the decoded tokens
+                    for step in responses.deref() {
+                        if let Some(ctx) = in_flights.get(&step.request_id) {
+                            // Remove from tracked requests
+                            let parcel =
+                                DecodedToken::try_from(step).map(|dt| DecodedTokenContext {
+                                    token: dt,
+                                    start: ctx.start,
+                                    queued: ctx.queued,
+                                    channel: ctx.streamer.clone(),
+                                });
+
+                            // Submit the work to the post_processor
+                            let posted = post_processor_sender.send((step.request_id, parcel));
+
+                            if posted.is_err() || step.is_final {
+                                debug!("Removing {}", step.request_id);
+                                let _ = in_flights.remove(&step.request_id);
+                            }
+                        } else {
+                            warn!("Untracked request {}", step.request_id);
+                        }
+                    }
+                }
+                Err(ref err) => {
+                    error!("Failed to get responses from the executor: {}.", err.what());
+                    break 'scheduler;
+                }
+            }
+        }
+
+        // Hint the CPU we are spin-locking
+        hint::spin_loop();
+    }
+}
+
+fn post_processor_looper<const MAX_NUM_TOKENS: usize>(
+    tokenizer: Tokenizer,
+    max_inflight_requests: usize,
+    mut decoded_tokens: UnboundedReceiver<(u64, InferResult<DecodedTokenContext>)>,
+) {
+    let mut states: HashMap<u64, Vec<u32>> = HashMap::with_capacity(max_inflight_requests * 2);
+
+    'post_processor: loop {
+        if decoded_tokens.is_closed() {
+            warn!("Post processor IPC is closed, loop will exit now.");
+            break 'post_processor;
+        }
+
+        if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() {
+            match decoded {
+                Ok(ctx) => {
+                    states
+                        .entry(request_id)
+                        .and_modify(|s| s.push(*&ctx.token.id))
+                        .or_insert_with(|| {
+                            let mut state = Vec::with_capacity(MAX_NUM_TOKENS);
+                            state.push(*&ctx.token.id);
+                            state
+                        });
+
+                    let out = match tokenizer.decode(&[ctx.token.id], false) {
+                        Ok(text) => {
+                            let is_special =
+                                tokenizer.get_added_vocabulary().is_special_token(&text);
+                            let token = Token {
+                                id: ctx.token.id,
+                                text,
+                                logprob: ctx.token.log_prob,
+                                special: is_special,
+                            };
+
+                            let out = if !ctx.token.is_final {
+                                InferStreamResponse::Intermediate {
+                                    token,
+                                    top_tokens: vec![],
+                                }
+                            } else {
+                                let tokens = states.remove(&request_id).unwrap();
+                                let text = tokenizer.decode(&tokens, true);
+                                let generated_text = GeneratedText {
+                                    text: text.unwrap(),
+                                    generated_tokens: tokens.len() as u32,
+                                    finish_reason: FinishReason::EndOfSequenceToken,
+                                    seed: None,
+                                };
+
+                                InferStreamResponse::End {
+                                    token,
+                                    top_tokens: vec![],
+                                    generated_text,
+                                    start: ctx.start.unwrap(),
+                                    queued: ctx.queued,
+                                }
+                            };
+
+                            Ok(out)
+                        }
+                        Err(err) => Err(GenerationError(err.to_string())),
+                    };
+
+                    if let Err(_) = ctx.channel.send(out) {
+                        warn!("Failed to send decoded token back to the user")
+                    }
+                }
+                Err(_err) => {
+                    todo!("what do we do?")
+                }
+            }
+        }
+    }
+}
+
+fn ensure_paths_exist<P: AsRef<Path>, PP: AsRef<Path>>(
+    engine_folder: P,
+    executor_worker_path: PP,
+) -> Result<(String, String), TensorRtLlmBackendError> {
+    // Retrieve paths as &str for the backend creation
+    let engine_folder = engine_folder.as_ref();
+    let executor_worker_path = executor_worker_path.as_ref();
+
+    // Ensure the engine folder exists
+    if !engine_folder.exists() {
+        let err = TensorRtLlmBackendError::EngineFolderDoesntExists(engine_folder.to_path_buf());
+
+        error!("Path validation failed: {}", err,);
+        return Err(err);
+    }
+
+    // Ensure executor worker binary exists
+    if !executor_worker_path.exists() {
+        let err =
+            TensorRtLlmBackendError::ExecutorWorkerNotFound(executor_worker_path.to_path_buf());
+
+        error!("Path validation failed: {}", err,);
+        return Err(err);
+    }
+
+    let engine_folder = String::from(
+        engine_folder
+            .to_str()
+            .expect("Failed to convert engine_folder to valid UTF-8"),
+    );
+
+    let executor_worker_path = String::from(
+        executor_worker_path
+            .to_str()
+            .expect("Failed to convert executor_worker_path to valid UTF-8"),
+    );
+
+    Ok((engine_folder, executor_worker_path))
+}
+
+unsafe impl Send for TensorRtLlmBackendImpl {}
+
+pub struct TensorRtLlmBackendV2 {
+    executor_looper: JoinHandle<()>,
+    post_processor_looper: JoinHandle<()>,
+    executor: UnboundedSender<GenerationContext>,
+}
+
+impl TensorRtLlmBackendV2 {
+    pub fn new<P: AsRef<Path> + Send, PP: AsRef<Path> + Send>(
+        tokenizer: Tokenizer,
+        engine_folder: P,
+        executor_worker_path: PP,
+        max_inflight_requests: usize,
+    ) -> Result<Self, TensorRtLlmBackendError> {
+        let (engine_folder, executor_worker_path) =
+            ensure_paths_exist(engine_folder, executor_worker_path)?;
+
+        // Allocate the IPC layer to communicate with the backend
+        let (executor_sender, executor_receiver) = unbounded_channel();
+        let (post_processor_sender, post_processor_receiver) = unbounded_channel();
+
+        // Create the FFI backend
+        let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
+            .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
+
+        // Executor looper is responsible for scheduling and pulling request states at regular intervals
+        let executor_looper = spawn_blocking(move || {
+            executor_status_looper(
+                backend,
+                max_inflight_requests,
+                executor_receiver,
+                post_processor_sender,
+            )
+        });
+
+        // Post processor looper is responsible for receiving a bunch of tokens, decoding them and sending them back to the user
+        let post_processor_looper = spawn_blocking(move || {
+            post_processor_looper::<256>(tokenizer, max_inflight_requests, post_processor_receiver)
+        });
+
+        Ok(TensorRtLlmBackendV2 {
+            executor_looper,
+            post_processor_looper,
+            executor: executor_sender,
+        })
+    }
+
+    fn validate(request: &ValidGenerateRequest) -> InferResult<()> {
+        if request.input_ids.is_none() {
+            return Err(ValidationError(UnsupportedModality("No token provided")));
+        }
+
+        if request.top_n_tokens > 1 {
+            return Err(ValidationError(TopNTokensDisabled));
+        }
+
+        // TODO: Is it really needed? How can it be validated before?
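+        // NOTE: grammar-constrained generation is not implemented in this backend yet,
+        // so such requests are rejected at validation time instead of failing inside the executor.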
+        if request.parameters.grammar.is_some() {
+            return Err(ValidationError(Grammar));
+        }
+
+        match request.inputs.len() {
+            0 => Err(ValidationError(EmptyInput)),
+            2.. => Err(GenerationError(
+                "TensorRT-LLM backend doesn't support multi-chunk".into(),
+            )),
+            1 => match request.inputs.first().expect("Single item-chunk") {
+                Chunk::Text(_) => Ok(()),
+                Chunk::Image(_) => Err(ValidationError(UnsupportedModality("image"))),
+            },
+        }
+    }
+}
+
+#[async_trait]
+impl Backend for TensorRtLlmBackendV2 {
+    fn schedule(
+        &self,
+        inner: ValidGenerateRequest,
+    ) -> Result<UnboundedReceiverStream<InferResult<InferStreamResponse>>, InferError> {
+        Self::validate(&inner)?;
+
+        // Open up the stream to send tokens
+        let (streamer, receiver) = unbounded_channel::<InferResult<InferStreamResponse>>();
+
+        // Send the context to the executor for scheduling
+        let queued = Instant::now();
+        match self.executor.send(GenerationContext {
+            request: inner,
+            start: None,
+            queued,
+            streamer,
+        }) {
+            Ok(_) => Ok(UnboundedReceiverStream::new(receiver)),
+            Err(_) => Err(GenerationError(
+                "Failed to submit request to the backend".into(),
+            )),
+        }
+    }
+
+    async fn health(&self, _: bool) -> bool {
+        !self.executor_looper.is_finished() & !self.post_processor_looper.is_finished()
+    }
+}
diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs
index e0ba46c7955..6a247fc1d52 100644
--- a/backends/trtllm/src/main.rs
+++ b/backends/trtllm/src/main.rs
@@ -1,10 +1,16 @@
+use std::path::{Path, PathBuf};
+
 use clap::Parser;
-use std::collections::HashMap;
-use std::path::PathBuf;
+use hf_hub::api::tokio::{Api, ApiBuilder};
+use hf_hub::{Cache, Repo, RepoType};
+use tokenizers::Tokenizer;
+use tracing::info;
+
 use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
-use text_generation_backends_trtllm::TensorRtLlmBackend;
-use text_generation_router::server;
-use tokenizers::{FromPretrainedParameters, Tokenizer};
+use text_generation_backends_trtllm::TensorRtLlmBackendV2;
+use text_generation_router::server::get_base_tokenizer;
+use text_generation_router::usage_stats::UsageStatsLevel;
+use text_generation_router::{server, HubTokenizerConfig};
 
 /// App Configuration
 #[derive(Parser, Debug)]
@@ -48,14 +54,138 @@ struct Args {
     otlp_service_name: String,
     #[clap(long, env)]
     cors_allow_origin: Option<Vec<String>>,
-    #[clap(long, env, default_value_t = false)]
-    messages_api_enabled: bool,
     #[clap(default_value = "4", long, env)]
     max_client_batch_size: usize,
     #[clap(long, env)]
     auth_token: Option<String>,
     #[clap(long, env, help = "Path to the TensorRT-LLM Orchestrator worker")]
     executor_worker: PathBuf,
+    #[clap(default_value = "on", long, env)]
+    usage_stats: UsageStatsLevel,
+}
+
+async fn get_tokenizer(
+    tokenizer_name: &str,
+    tokenizer_config_path: Option<&str>,
+    revision: Option<&str>,
+) -> Option<Tokenizer> {
+    // Parse Huggingface hub token
+    let authorization_token = std::env::var("HF_TOKEN")
+        .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
+        .ok();
+
+    // Tokenizer instance
+    let local_path = Path::new(tokenizer_name);
+
+    // Shared API builder initialization
+    let api_builder = || {
+        let mut builder = ApiBuilder::new()
+            .with_progress(false)
+            .with_token(authorization_token);
+
+        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
+            builder = builder.with_cache_dir(cache_dir.into());
+        }
+
+        builder
+    };
+
+    // Decide if we need to use the API based on the revision and local path
+    let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir();
+
+    // Initialize API if needed
+    #[derive(Clone)]
+    enum Type {
+        Api(Api),
+        Cache(Cache),
+        None,
+    }
+    let api = 
if use_api { + if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) { + let cache = std::env::var("HUGGINGFACE_HUB_CACHE") + .map_err(|_| ()) + .map(|cache_dir| Cache::new(cache_dir.into())) + .unwrap_or_else(|_| Cache::default()); + tracing::warn!("Offline mode active using cache defaults"); + Type::Cache(cache) + } else { + tracing::info!("Using the Hugging Face API"); + match api_builder().build() { + Ok(api) => Type::Api(api), + Err(_) => { + tracing::warn!("Unable to build the Hugging Face API"); + Type::None + } + } + } + } else { + Type::None + }; + + // Load tokenizer and model info + let ( + tokenizer_filename, + _config_filename, + tokenizer_config_filename, + _preprocessor_config_filename, + _processor_config_filename, + ) = match api { + Type::None => ( + Some(local_path.join("tokenizer.json")), + Some(local_path.join("config.json")), + Some(local_path.join("tokenizer_config.json")), + Some(local_path.join("preprocessor_config.json")), + Some(local_path.join("processor_config.json")), + ), + Type::Api(api) => { + let api_repo = api.repo(Repo::with_revision( + tokenizer_name.to_string(), + RepoType::Model, + revision.unwrap_or_else(|| "main").to_string(), + )); + + let tokenizer_filename = match api_repo.get("tokenizer.json").await { + Ok(tokenizer_filename) => Some(tokenizer_filename), + Err(_) => get_base_tokenizer(&api, &api_repo).await, + }; + let config_filename = api_repo.get("config.json").await.ok(); + let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok(); + let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok(); + let processor_config_filename = api_repo.get("processor_config.json").await.ok(); + + ( + tokenizer_filename, + config_filename, + tokenizer_config_filename, + preprocessor_config_filename, + processor_config_filename, + ) + } + Type::Cache(cache) => { + let repo = cache.repo(Repo::with_revision( + tokenizer_name.to_string(), + RepoType::Model, + revision.clone().unwrap_or_else(|| "main").to_string(), + )); + ( + repo.get("tokenizer.json"), + repo.get("config.json"), + repo.get("tokenizer_config.json"), + repo.get("preprocessor_config.json"), + repo.get("processor_config.json"), + ) + } + }; + + // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'. 
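+    // Note: the tokenizer config is parsed below but not returned; only the tokenizer itself is
+    // used by this backend for now.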
+ let tokenizer_config: Option = if let Some(filename) = tokenizer_config_path + { + HubTokenizerConfig::from_file(filename) + } else { + tokenizer_config_filename.and_then(HubTokenizerConfig::from_file) + }; + + tokenizer_filename.and_then(|filename| Tokenizer::from_file(filename).ok()) } #[tokio::main] @@ -83,10 +213,10 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { otlp_endpoint, otlp_service_name, cors_allow_origin, - messages_api_enabled, max_client_batch_size, auth_token, executor_worker, + usage_stats, } = args; // Launch Tokio runtime @@ -124,18 +254,26 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { ))); } - // Run server - let tokenizer = Tokenizer::from_pretrained( - tokenizer_name.clone(), - Some(FromPretrainedParameters { - revision: revision.clone().unwrap_or(String::from("main")), - user_agent: HashMap::new(), - auth_token, - }), + // Create the backend + let tokenizer = get_tokenizer( + &tokenizer_name, + tokenizer_config_path.as_deref(), + revision.as_deref(), ) - .map_err(|e| TensorRtLlmBackendError::Tokenizer(e.to_string()))?; + .await + .expect("Failed to retrieve tokenizer implementation"); + + info!("Successfully retrieved tokenizer {}", &tokenizer_name); + let backend = TensorRtLlmBackendV2::new( + tokenizer, + model_id, + executor_worker, + max_concurrent_requests, + )?; - let backend = TensorRtLlmBackend::new(tokenizer, model_id, executor_worker)?; + info!("Successfully created backend"); + + // Run server server::run( backend, max_concurrent_requests, @@ -145,7 +283,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { max_input_tokens, max_total_tokens, validation_workers, - None, + auth_token, tokenizer_name, tokenizer_config_path, revision, @@ -155,11 +293,9 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { false, None, None, - messages_api_enabled, true, max_client_batch_size, - false, - false, + usage_stats, ) .await?; Ok(()) diff --git a/backends/trtllm/src/utils.rs b/backends/trtllm/src/utils.rs new file mode 100644 index 00000000000..4dedb007863 --- /dev/null +++ b/backends/trtllm/src/utils.rs @@ -0,0 +1,22 @@ +/// +/// Extract the first line of the provided string reference. 
+/// If there is no lines in the buffer, it returns a string +/// which content is defined by the content of `fail` +/// # Arguments +/// +/// * `s`: The string buffer to extract the first-line from +/// * `fail`: A string content which is returned if no lines are +/// present in `s` +/// +/// returns: String +/// +/// # Examples +/// +/// ``` +/// let s = "My name is Morgan.\n I'm working at Hugging Face."; +/// first_line(s, "No line in string"); +/// ``` +#[inline] +pub(crate) fn first_line(s: &str, fail: &str) -> String { + s.lines().next().unwrap_or(fail).to_string() +} diff --git a/backends/v2/src/backend.rs b/backends/v2/src/backend.rs index 086fc6dc4a6..bc264138d28 100644 --- a/backends/v2/src/backend.rs +++ b/backends/v2/src/backend.rs @@ -6,7 +6,7 @@ use nohash_hasher::IntMap; use std::sync::Arc; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::validation::ValidGenerateRequest; -use text_generation_router::{Attention, FinishReason, PrefillToken, Token}; +use text_generation_router::{FinishReason, PrefillToken, Token}; use tokio::sync::mpsc::error::SendError; use tokio::sync::{mpsc, Notify}; use tokio::time::Instant; @@ -36,18 +36,14 @@ impl BackendV2 { speculate: u32, ) -> Self { // Infer shared state - let attention = if let Ok(attention) = std::env::var("ATTENTION") { - attention - .parse() - .unwrap_or_else(|_| panic!("Invalid attention was specified :`{attention}`")) - } else { - Attention::Paged - }; - let block_size = if attention == Attention::FlashDecoding { - 256 - } else { - 16 + let attention = std::env::var("ATTENTION").unwrap_or("paged".to_string()); + let block_size = match attention.as_str() { + "flashinfer" => 1, + "flashdecoding" => 256, + "paged" => 16, + _ => unreachable!(), }; + let queue = Queue::new(requires_padding, block_size, window_size, speculate); let batching_task_notifier = Arc::new(Notify::new()); diff --git a/backends/v2/src/main.rs b/backends/v2/src/main.rs index f53d898e659..ab4b7ce1d97 100644 --- a/backends/v2/src/main.rs +++ b/backends/v2/src/main.rs @@ -44,6 +44,8 @@ struct Args { tokenizer_config_path: Option, #[clap(long, env)] revision: Option, + #[clap(long, env, value_enum)] + trust_remote_code: bool, #[clap(default_value = "2", long, env)] validation_workers: usize, #[clap(long, env)] @@ -63,8 +65,6 @@ struct Args { #[clap(long, env)] ngrok_edge: Option, #[clap(long, env, default_value_t = false)] - messages_api_enabled: bool, - #[clap(long, env, default_value_t = false)] disable_grammar_support: bool, #[clap(default_value = "4", long, env)] max_client_batch_size: usize, @@ -101,6 +101,7 @@ async fn main() -> Result<(), RouterError> { tokenizer_name, tokenizer_config_path, revision, + trust_remote_code, validation_workers, api_key, json_output, @@ -110,7 +111,6 @@ async fn main() -> Result<(), RouterError> { ngrok, ngrok_authtoken, ngrok_edge, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats, @@ -184,13 +184,13 @@ async fn main() -> Result<(), RouterError> { tokenizer_name, tokenizer_config_path, revision, + trust_remote_code, hostname, port, cors_allow_origin, ngrok, ngrok_authtoken, ngrok_edge, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats, diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs index f8a10ca2606..a5c0f5125b2 100644 --- a/backends/v3/src/backend.rs +++ b/backends/v3/src/backend.rs @@ -1,12 +1,14 @@ -use crate::client::{Batch, CachedBatch, ClientError, Generation, 
Health, ShardedClient}; /// Batching and inference logic +use crate::client::{ + Batch, CachedBatch, ClientError, Generation, Health, InfoResponse, ShardedClient, +}; use crate::queue::{Entry, Queue}; use async_trait::async_trait; use nohash_hasher::IntMap; use std::sync::Arc; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::validation::ValidGenerateRequest; -use text_generation_router::{Attention, FinishReason, PrefillToken, Token}; +use text_generation_router::{FinishReason, PrefillToken, Token}; use tokio::sync::mpsc::error::SendError; use tokio::sync::{mpsc, Notify}; use tokio::time::Instant; @@ -31,27 +33,22 @@ impl BackendV3 { max_batch_total_tokens: u32, max_waiting_tokens: usize, max_batch_size: Option, - requires_padding: bool, - window_size: Option, - speculate: u32, + shard_info: InfoResponse, ) -> Self { - let prefix_caching = - std::env::var("USE_PREFIX_CACHING").expect("Expect prefix caching env var"); - let prefix_caching = matches!(prefix_caching.as_str(), "true" | "1"); - let attention: String = std::env::var("ATTENTION").expect("attention env var"); + if shard_info.support_chunking { + tracing::warn!("Model supports prefill chunking. `waiting_served_ratio` and `max_waiting_tokens` will be ignored."); + } - let attention: Attention = attention - .parse() - .unwrap_or_else(|_| panic!("Invalid attention was specified :`{attention}`")); - let block_size = attention.block_size(); + let block_size = shard_info.block_size; let queue = Queue::new( - requires_padding, + shard_info.requires_padding, block_size, - prefix_caching, - window_size, - speculate, + shard_info.use_prefix_caching, + shard_info.window_size, + shard_info.speculate, max_batch_total_tokens, + shard_info.support_chunking, ); let batching_task_notifier = Arc::new(Notify::new()); @@ -63,6 +60,7 @@ impl BackendV3 { max_batch_total_tokens, max_waiting_tokens, max_batch_size, + shard_info.support_chunking, queue.clone(), batching_task_notifier.clone(), )); @@ -127,6 +125,7 @@ pub(crate) async fn batching_task( max_batch_total_tokens: u32, max_waiting_tokens: usize, max_batch_size: Option, + support_chunking: bool, queue: Queue, notifier: Arc, ) { @@ -147,7 +146,7 @@ pub(crate) async fn batching_task( ) .await { - let mut cached_batch = prefill(&mut client, batch, &mut entries) + let mut cached_batch = prefill(&mut client, batch, None, &mut entries) .instrument(span) .await; let mut waiting_tokens = 1; @@ -158,28 +157,44 @@ pub(crate) async fn batching_task( // Get current batch info let batch_size = batch.size; let batch_max_tokens = batch.max_tokens; + let current_tokens = batch.current_tokens; let mut batches = vec![batch]; metrics::gauge!("tgi_batch_current_size").set(batch_size as f64); metrics::gauge!("tgi_batch_current_max_tokens").set(batch_max_tokens as f64); - let min_size = if waiting_tokens >= max_waiting_tokens { - // If we didn't onboard any new requests since >= max_waiting_tokens, we try - // to add a new batch even though its size might be small - None + let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens); + + let (min_size, max_size, prefill_token_budget) = if support_chunking { + // Since the next batch will be concatenated with the current batch, + // the current batch tokens must be subtracted to the prefill budget + let prefill_token_budget = + max_batch_prefill_tokens.saturating_sub(current_tokens); + // We can ignore min_size and max_size + // Models than rely on max_size cannot support chunking + // 
Regarding min_size, chunking allow us to consistently run at the compute + // bound, making min_size useless. + (None, None, prefill_token_budget) } else { - // Minimum batch size - // TODO: temporarily disable to avoid incorrect deallocation + - // reallocation when using prefix caching. - Some((batch_size as f32 * waiting_served_ratio).floor() as usize) - }; + let min_size = if waiting_tokens >= max_waiting_tokens { + // If we didn't onboard any new requests since >= max_waiting_tokens, we try + // to add a new batch even though its size might be small + None + } else { + // Minimum batch size + // TODO: temporarily disable to avoid incorrect deallocation + + // reallocation when using prefix caching. + Some((batch_size as f32 * waiting_served_ratio).floor() as usize) + }; - let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens); - let max_size = - max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize)); + let max_size = + max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize)); + + (min_size, max_size, max_batch_prefill_tokens) + }; // Try to get a new batch - if let Some((mut new_entries, new_batch, span)) = queue - .next_batch(min_size, max_size, max_batch_prefill_tokens, token_budget) + if let Some((new_entries, new_batch, span)) = queue + .next_batch(min_size, max_size, prefill_token_budget, token_budget) .await { // Tracking metrics @@ -187,31 +202,45 @@ pub(crate) async fn batching_task( metrics::counter!("tgi_batch_concat", "reason" => "backpressure") .increment(1); } else { - metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded") - .increment(1); + let counter = if support_chunking { + metrics::counter!("tgi_batch_concat", "reason" => "chunking") + } else { + metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded") + }; + counter.increment(1); } - - entries.iter_mut().for_each(|(_, entry)| { - // Create a new span to add the info that this entry is waiting - // because a new batch is being computed - let entry_waiting_span = info_span!(parent: &entry.span, "waiting"); - // Add relationships - span.follows_from(&entry_waiting_span); - entry_waiting_span.follows_from(&span); - // Update entry - entry.temp_span = Some(entry_waiting_span); - }); + let cached_batch = if support_chunking { + // Concat current batch to the new one + batches.pop() + } else { + // Request are waiting only if we don't support chunking + entries.iter_mut().for_each(|(_, entry)| { + // Create a new span to add the info that this entry is waiting + // because a new batch is being computed + let entry_waiting_span = info_span!(parent: &entry.span, "waiting"); + // Add relationships + span.follows_from(&entry_waiting_span); + entry_waiting_span.follows_from(&span); + // Update entry + entry.temp_span = Some(entry_waiting_span); + }); + None + }; + entries.extend(new_entries); // Generate one token for this new batch to have the attention past in cache - let new_cached_batch = prefill(&mut client, new_batch, &mut new_entries) - .instrument(span) - .await; + let new_cached_batch = + prefill(&mut client, new_batch, cached_batch, &mut entries) + .instrument(span) + .await; // Reset waiting counter waiting_tokens = 1; // Extend current batch with the new batch if let Some(new_cached_batch) = new_cached_batch { - entries.extend(new_entries); batches.push(new_cached_batch); + } else if support_chunking { + // New cached batch is empty, no work left + break; } } @@ -244,13 +273,14 @@ pub(crate) async fn batching_task( async fn prefill( 
client: &mut ShardedClient, batch: Batch, + cached_batch: Option, entries: &mut IntMap, ) -> Option { let start_time = Instant::now(); let batch_id = batch.id; metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1); - match client.prefill(batch).await { + match client.prefill(batch, cached_batch).await { Ok((generations, next_batch, timings)) => { let start_filtering_time = Instant::now(); // Send generated tokens and filter stopped entries @@ -259,6 +289,10 @@ async fn prefill( // Filter next batch and remove requests that were stopped let next_batch = filter_batch(client, next_batch, entries).await; + if let Some(concat_duration) = timings.concat { + metrics::histogram!("tgi_batch_concat_duration", "method" => "decode") + .record(concat_duration.as_secs_f64()); + } metrics::histogram!("tgi_batch_forward_duration", "method" => "prefill") .record(timings.forward.as_secs_f64()); metrics::histogram!("tgi_batch_decode_duration", "method" => "prefill") diff --git a/backends/v3/src/client/grpc_client.rs b/backends/v3/src/client/grpc_client.rs index 648662db39b..fe810f24742 100644 --- a/backends/v3/src/client/grpc_client.rs +++ b/backends/v3/src/client/grpc_client.rs @@ -158,7 +158,8 @@ impl Client { // Blocks and slots will be set on the server side if we use paged attention blocks: vec![], slots: vec![], - prefix_len: 0, + cache_len: 0, + chunk_len: None, // Set sampling parameters to also take these ops into account in the max memory parameters: Some(NextTokenChooserParameters { temperature: 0.9, @@ -217,13 +218,23 @@ impl Client { pub async fn prefill( &mut self, batch: Batch, + cached_batch: Option, ) -> Result<(Vec, Option, PrefillTimings)> { - let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context(); + let request = tonic::Request::new(PrefillRequest { + batch: Some(batch), + cached_batch, + }) + .inject_context(); let response = self.stub.prefill(request).await?.into_inner(); Ok(( response.generations, response.batch, - PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns), + PrefillTimings::new( + response.concat_ns, + response.forward_ns, + response.decode_ns, + response.total_ns, + ), )) } @@ -252,14 +263,16 @@ impl Client { } pub struct PrefillTimings { + pub concat: Option, pub forward: Duration, pub decode: Duration, pub total: Duration, } impl PrefillTimings { - fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self { + fn new(concat_ns: Option, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self { Self { + concat: concat_ns.map(Duration::from_nanos), forward: Duration::from_nanos(forward_ns), decode: Duration::from_nanos(decode_ns), total: Duration::from_nanos(total_ns), diff --git a/backends/v3/src/client/mod.rs b/backends/v3/src/client/mod.rs index 755431f4633..d4ac50c9c46 100644 --- a/backends/v3/src/client/mod.rs +++ b/backends/v3/src/client/mod.rs @@ -29,15 +29,6 @@ pub trait Health { async fn model_health(&self) -> Result<()>; } -#[derive(Debug)] -pub struct ShardInfo { - pub requires_padding: bool, - pub dtype: String, - pub device_type: String, - pub window_size: Option, - pub speculate: u32, -} - #[derive(Error, Debug, Clone)] pub enum ClientError { #[error("Could not connect to Text Generation server: {0}")] diff --git a/backends/v3/src/client/sharded_client.rs b/backends/v3/src/client/sharded_client.rs index ea77a696648..e181cd28d2f 100644 --- a/backends/v3/src/client/sharded_client.rs +++ b/backends/v3/src/client/sharded_client.rs @@ -1,6 +1,6 @@ -use 
crate::client::{ClientError, Result}; +use crate::client::Health; /// Multi shard Client -use crate::client::{Health, ShardInfo}; +use crate::client::{ClientError, Result}; use crate::client::grpc_client::{DecodeTimings, PrefillTimings}; use crate::client::{ @@ -49,13 +49,13 @@ impl ShardedClient { /// Get the model info #[instrument(skip(self))] - pub async fn info(&mut self) -> Result { + pub async fn info(&mut self) -> Result { let futures: Vec<_> = self .clients .iter_mut() .map(|client| client.info()) .collect(); - join_all(futures).await.pop().unwrap().map(ShardInfo::from) + join_all(futures).await.pop().unwrap() } /// GRPC health check @@ -135,11 +135,12 @@ impl ShardedClient { pub async fn prefill( &mut self, batch: Batch, + cached_batch: Option, ) -> Result<(Vec, Option, PrefillTimings)> { let futures: Vec<_> = self .clients .iter_mut() - .map(|client| Box::pin(client.prefill(batch.clone()))) + .map(|client| Box::pin(client.prefill(batch.clone(), cached_batch.clone()))) .collect(); #[allow(clippy::type_complexity)] let results: Result, Option, PrefillTimings)>> = @@ -194,18 +195,6 @@ impl ShardedClient { } } -impl From for ShardInfo { - fn from(value: InfoResponse) -> Self { - Self { - requires_padding: value.requires_padding, - dtype: value.dtype, - device_type: value.device_type, - window_size: value.window_size, - speculate: value.speculate, - } - } -} - #[async_trait] impl Health for ShardedClient { async fn device_health(&self) -> Result<()> { @@ -246,8 +235,9 @@ impl Health for ShardedClient { // Block 0 is reserved for health checks blocks: vec![0], slots: (0..16).collect(), - prefix_len: 0, + cache_len: 0, adapter_id: None, + chunk_len: None, }; let batch = Batch { id: u64::MAX, @@ -256,7 +246,7 @@ impl Health for ShardedClient { max_tokens: 2, max_blocks: 1, }; - self.clone().prefill(batch).await?; + self.clone().prefill(batch, None).await?; Ok(()) } } diff --git a/backends/v3/src/lib.rs b/backends/v3/src/lib.rs index af66b21eb22..7daf9eaeca7 100644 --- a/backends/v3/src/lib.rs +++ b/backends/v3/src/lib.rs @@ -29,6 +29,14 @@ pub struct BackendInfo { pub max_waiting_tokens: usize, #[schema(nullable = true, example = "null")] pub max_batch_size: Option, + #[schema(example = "false")] + pub support_chunking: bool, + #[schema(example = "false")] + pub prefix_caching: bool, + #[schema(example = "flashinfer")] + pub attention_impl: String, + #[schema(example = "1")] + pub block_size: u32, } #[allow(clippy::too_many_arguments)] @@ -110,6 +118,10 @@ pub async fn connect_backend( model_device_type: shard_info.device_type.clone(), model_dtype: shard_info.dtype.clone(), speculate: shard_info.speculate as usize, + support_chunking: shard_info.support_chunking, + prefix_caching: shard_info.use_prefix_caching, + attention_impl: shard_info.attention_impl.clone(), + block_size: shard_info.block_size, }; let backend = BackendV3::new( @@ -119,9 +131,7 @@ pub async fn connect_backend( max_batch_total_tokens, max_waiting_tokens, max_batch_size, - shard_info.requires_padding, - shard_info.window_size, - shard_info.speculate, + shard_info, ); tracing::info!("Using backend V3"); diff --git a/backends/v3/src/main.rs b/backends/v3/src/main.rs index 471ddb5a70a..bc4bdb934eb 100644 --- a/backends/v3/src/main.rs +++ b/backends/v3/src/main.rs @@ -44,6 +44,8 @@ struct Args { tokenizer_config_path: Option, #[clap(long, env)] revision: Option, + #[clap(long, env, value_enum)] + trust_remote_code: bool, #[clap(default_value = "2", long, env)] validation_workers: usize, #[clap(long, env)] @@ -63,8 +65,6 
@@ struct Args { #[clap(long, env)] ngrok_edge: Option, #[clap(long, env, default_value_t = false)] - messages_api_enabled: bool, - #[clap(long, env, default_value_t = false)] disable_grammar_support: bool, #[clap(default_value = "4", long, env)] max_client_batch_size: usize, @@ -101,6 +101,7 @@ async fn main() -> Result<(), RouterError> { tokenizer_name, tokenizer_config_path, revision, + trust_remote_code, validation_workers, api_key, json_output, @@ -110,7 +111,6 @@ async fn main() -> Result<(), RouterError> { ngrok, ngrok_authtoken, ngrok_edge, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats, @@ -131,25 +131,12 @@ async fn main() -> Result<(), RouterError> { "`max_input_tokens` must be < `max_total_tokens`".to_string(), )); } - if max_input_tokens as u32 > max_batch_prefill_tokens { - return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}"))); - } if validation_workers == 0 { return Err(RouterError::ArgumentValidation( "`validation_workers` must be > 0".to_string(), )); } - - if let Some(ref max_batch_total_tokens) = max_batch_total_tokens { - if max_batch_prefill_tokens > *max_batch_total_tokens { - return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}"))); - } - if max_total_tokens as u32 > *max_batch_total_tokens { - return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}"))); - } - } - if let Some(max_batch_size) = max_batch_size { if max_batch_size == 0 { return Err(RouterError::ArgumentValidation( @@ -158,7 +145,7 @@ async fn main() -> Result<(), RouterError> { } } - let (backend, _backend_info) = connect_backend( + let (backend, backend_info) = connect_backend( max_input_tokens, max_total_tokens, master_shard_uds_path, @@ -170,6 +157,19 @@ async fn main() -> Result<(), RouterError> { ) .await?; + // Validate remaining args now that the backend is known + let support_chunking = backend_info.support_chunking; + let max_batch_total_tokens = backend_info.max_batch_total_tokens; + if max_input_tokens as u32 > max_batch_prefill_tokens && !support_chunking { + return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}"))); + } + if max_batch_prefill_tokens > max_batch_total_tokens { + return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}"))); + } + if max_total_tokens as u32 > max_batch_total_tokens { + return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. 
Given: {max_total_tokens} and {max_batch_total_tokens}"))); + } + // Run server server::run( backend, @@ -184,13 +184,13 @@ async fn main() -> Result<(), RouterError> { tokenizer_name, tokenizer_config_path, revision, + trust_remote_code, hostname, port, cors_allow_origin, ngrok, ngrok_authtoken, ngrok_edge, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats, diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs index f8123b57aa2..6662b8de1f9 100644 --- a/backends/v3/src/queue.rs +++ b/backends/v3/src/queue.rs @@ -4,7 +4,7 @@ use crate::client::{ Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters, }; use nohash_hasher::{BuildNoHashHasher, IntMap}; -use std::cmp::{max, min}; +use std::cmp::max; use std::collections::VecDeque; use text_generation_router::infer::InferError; use text_generation_router::infer::InferStreamResponse; @@ -50,6 +50,7 @@ impl Queue { window_size: Option, speculate: u32, max_batch_total_tokens: u32, + support_chunking: bool, ) -> Self { // Create channel let (queue_sender, queue_receiver) = mpsc::unbounded_channel(); @@ -62,6 +63,7 @@ impl Queue { window_size, speculate, max_batch_total_tokens, + support_chunking, queue_receiver, )); @@ -87,6 +89,10 @@ impl Queue { prefill_token_budget: u32, token_budget: u32, ) -> Option { + if prefill_token_budget == 0 || token_budget == 0 { + return None; + }; + // Create response channel let (response_sender, response_receiver) = oneshot::channel(); // Send next batch command to the background task managing the state @@ -108,6 +114,7 @@ impl Queue { } // Background task responsible of the queue state +#[allow(clippy::too_many_arguments)] async fn queue_task( requires_padding: bool, block_size: u32, @@ -115,6 +122,7 @@ async fn queue_task( window_size: Option, speculate: u32, max_batch_total_tokens: u32, + support_chunking: bool, mut receiver: mpsc::UnboundedReceiver, ) { let mut state = State::new( @@ -124,6 +132,7 @@ async fn queue_task( window_size, speculate, max_batch_total_tokens, + support_chunking, ); while let Some(cmd) = receiver.recv().await { @@ -166,12 +175,14 @@ struct State { /// Paged Attention block size block_size: u32, - /// Sliding window - window_size: Option, - /// Speculation amount speculate: u32, + /// Whether the model allow the prefill chunking + /// If it does, the last request in the batch will be split to exactly match the prefill + /// token budget + support_chunking: bool, + /// Paged Attention Block Allocation block_allocator: Option, } @@ -184,6 +195,7 @@ impl State { window_size: Option, speculate: u32, max_batch_total_tokens: u32, + support_chunking: bool, ) -> Self { let block_allocator = (!requires_padding).then(|| { BlockAllocator::new( @@ -199,8 +211,8 @@ impl State { next_id: 0, next_batch_id: 0, block_size, - window_size, speculate, + support_chunking, block_allocator, } } @@ -287,32 +299,7 @@ impl State { } None } - Some(_block_allocator) => { - prefill_tokens += entry.request.input_length; - let max_new_tokens = match self.window_size { - None => entry.request.stopping_parameters.max_new_tokens, - Some(window_size) => min( - window_size.saturating_sub(entry.request.input_length), - entry.request.stopping_parameters.max_new_tokens, - ), - }; - decode_tokens += max_new_tokens; - - if prefill_tokens > prefill_token_budget - || (prefill_tokens + decode_tokens + self.speculate) > token_budget - { - // Entry is over budget - // Add it back to the front - tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > 
{prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate); - self.entries.push_front((id, entry)); - break; - } - - let tokens = entry.request.input_length - + entry.request.stopping_parameters.max_new_tokens - + self.speculate - - 1; - + Some(block_allocator) => { // If users wants the prefill logprobs, we cannot reuse the cache. // So no input_ids for the radix tree. let input_ids = if entry.request.decoder_input_details { @@ -321,10 +308,73 @@ impl State { entry.request.input_ids.clone() }; - Some((tokens, input_ids)) + let tokens = entry.request.input_length + + entry.request.stopping_parameters.max_new_tokens + + self.speculate + - 1; + tracing::debug!("Allocating {tokens} with {input_ids:?}"); + + let block_allocation = match block_allocator.allocate(tokens, input_ids).await { + None => { + // Entry is over budget + // Add it back to the front + tracing::debug!("Over budget: not enough free blocks"); + self.entries.push_front((id, entry)); + break 'entry_loop; + } + Some(mut block_allocation) => { + tracing::debug!("Allocation: {block_allocation:?}"); + max_blocks = max(max_blocks, block_allocation.blocks.len() as u32); + + if block_allocation.prefix_len == entry.request.input_length { + // The whole request was found in the radix trie + // However, for the transformer forward to work, we need to + // have at least one token of postfix. + block_allocation.prefix_len -= 1; + } + + block_allocation + } + }; + + let postfix_len = entry.request.input_length - block_allocation.prefix_len; + + if prefill_tokens + postfix_len > prefill_token_budget { + // Entry is over budget + if self.support_chunking { + // We support chunking, just set postfix_len to exactly match prefill_token_budget + let chunk_len = prefill_token_budget.saturating_sub(prefill_tokens); + if chunk_len > 0 { + // Push this entry inside the batch + batch.push((id, entry, Some(block_allocation), Some(chunk_len))); + } else { + // We cannot prefill even one token for this entry + // Add it back to the queue + self.entries.push_front((id, entry)); + } + tracing::debug!( + "Matched budget: prefill_tokens={} == {prefill_token_budget}", + prefill_tokens + postfix_len + ); + break 'entry_loop; + } else { + // We don't support chunking, this entry needs to go back to the buffer + // Add it back to the front + tracing::debug!( + "Over budget: prefill_tokens={} > {prefill_token_budget}", + prefill_tokens + postfix_len + ); + self.entries.push_front((id, entry)); + break 'entry_loop; + } + } + + prefill_tokens += postfix_len; + + Some(block_allocation) } }; - batch.push((id, entry, block_allocation)); + batch.push((id, entry, block_allocation, None)); if Some(batch.len()) == max_size { break; } @@ -342,7 +392,7 @@ impl State { // Batch is too small if batch.len() < min_size { // Add back entries to the queue in the correct order - for (id, entry, _) in batch.into_iter().rev() { + for (id, entry, _, _) in batch.into_iter().rev() { self.entries.push_front((id, entry)); } return None; @@ -353,29 +403,7 @@ impl State { let mut batch_entries = IntMap::with_capacity_and_hasher(self.entries.len(), BuildNoHashHasher::default()); - for (id, mut entry, block_allocation) in batch { - let block_allocation = if let (Some((tokens, input_ids)), Some(block_allocator)) = - (block_allocation, &self.block_allocator) - { - tracing::debug!("Allocating {tokens} with {input_ids:?}"); - match block_allocator.allocate(tokens, input_ids).await { - None => { - // Entry is over budget - // Add it back to the front - 
tracing::debug!("Over budget: not enough free blocks"); - self.entries.push_front((id, entry)); - continue; - } - Some(block_allocation) => { - tracing::debug!("Allocation: {block_allocation:?}"); - max_blocks = max(max_blocks, block_allocation.blocks.len() as u32); - Some(block_allocation) - } - } - } else { - None - }; - tracing::debug!("Accepting entry"); + for (id, mut entry, block_allocation, chunk_len) in batch { // Create a new span to link the batch back to this entry let entry_batch_span = info_span!(parent: &entry.span, "infer"); // Add relationships @@ -427,8 +455,9 @@ impl State { top_n_tokens: entry.request.top_n_tokens, blocks, slots, - prefix_len, + cache_len: prefix_len, adapter_id: entry.request.adapter_id.clone(), + chunk_len, }); // Set batch_time entry.batch_time = Some(Instant::now()); @@ -436,12 +465,6 @@ impl State { batch_entries.insert(id, entry); } - // Empty batch - if batch_requests.is_empty() { - tracing::debug!("Filterered out all entries"); - return None; - } - // Final batch size let size = batch_requests.len() as u32; next_batch_span.record("batch_size", size); @@ -531,7 +554,7 @@ mod tests { request: ValidGenerateRequest { inputs: vec![], input_ids: Some(Arc::new(vec![])), - input_length: 0, + input_length: 1, add_special_tokens: true, truncate: 0, decoder_input_details: false, @@ -567,7 +590,7 @@ mod tests { #[tokio::test] async fn test_append() { - let mut state = State::new(false, 1, false, None, 0, 16); + let mut state = State::new(false, 1, false, None, 0, 16, false); let (entry, _guard) = default_entry(); assert_eq!(state.next_id, 0); @@ -583,7 +606,7 @@ mod tests { #[tokio::test] async fn test_next_batch_empty() { - let mut state = State::new(false, 1, false, None, 0, 16); + let mut state = State::new(false, 1, false, None, 0, 16, false); assert!(state.next_batch(None, None, 1, 1).await.is_none()); assert!(state.next_batch(Some(1), None, 1, 1).await.is_none()); @@ -591,7 +614,7 @@ mod tests { #[tokio::test] async fn test_next_batch_min_size() { - let mut state = State::new(false, 1, false, None, 0, 16); + let mut state = State::new(false, 1, false, None, 0, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); state.append(entry1); @@ -623,7 +646,7 @@ mod tests { #[tokio::test] async fn test_next_batch_max_size() { - let mut state = State::new(false, 1, false, None, 0, 16); + let mut state = State::new(false, 1, false, None, 0, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); state.append(entry1); @@ -643,7 +666,7 @@ mod tests { #[tokio::test] async fn test_next_batch_token_budget() { - let mut state = State::new(false, 1, false, None, 0, 2); + let mut state = State::new(false, 1, false, None, 0, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); state.append(entry1); @@ -676,14 +699,14 @@ mod tests { #[tokio::test] async fn test_queue_append() { - let queue = Queue::new(false, 1, false, None, 0, 16); + let queue = Queue::new(false, 1, false, None, 0, 16, false); let (entry, _guard) = default_entry(); queue.append(entry); } #[tokio::test] async fn test_queue_next_batch_empty() { - let queue = Queue::new(false, 1, false, None, 0, 16); + let queue = Queue::new(false, 1, false, None, 0, 16, false); assert!(queue.next_batch(None, None, 1, 1).await.is_none()); assert!(queue.next_batch(Some(1), None, 1, 1).await.is_none()); @@ -691,7 +714,7 @@ mod tests { #[tokio::test] async fn test_queue_next_batch_min_size() { - let queue = 
Queue::new(false, 1, false, None, 0, 16); + let queue = Queue::new(false, 1, false, None, 0, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); queue.append(entry1); @@ -724,7 +747,7 @@ mod tests { #[tokio::test] async fn test_queue_next_batch_max_size() { - let queue = Queue::new(false, 1, false, None, 0, 16); + let queue = Queue::new(false, 1, false, None, 0, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); queue.append(entry1); @@ -740,7 +763,7 @@ mod tests { #[tokio::test] async fn test_queue_next_batch_token_budget() { - let queue = Queue::new(false, 1, false, None, 0, 16); + let queue = Queue::new(false, 1, false, None, 0, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); queue.append(entry1); @@ -765,7 +788,7 @@ mod tests { #[tokio::test] async fn test_queue_next_batch_token_speculate() { - let queue = Queue::new(false, 1, false, None, 2, 16); + let queue = Queue::new(true, 1, false, None, 2, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); queue.append(entry1); @@ -784,7 +807,7 @@ mod tests { #[tokio::test] async fn test_queue_next_batch_dropped_receiver() { - let queue = Queue::new(false, 1, false, None, 0, 16); + let queue = Queue::new(false, 1, false, None, 0, 16, false); let (entry, _) = default_entry(); queue.append(entry); diff --git a/benchmark/src/generation.rs b/benchmark/src/generation.rs index 789c7b514fc..63fc780818b 100644 --- a/benchmark/src/generation.rs +++ b/benchmark/src/generation.rs @@ -158,7 +158,8 @@ async fn prefill( top_n_tokens: top_n_tokens.unwrap_or(0), blocks: vec![], slots: vec![], - prefix_len: 0, + cache_len: 0, + chunk_len: None, adapter_id: None, }) .collect(); @@ -173,7 +174,7 @@ async fn prefill( // Run prefill let start_time = Instant::now(); - let (_, decode_batch, _) = client.prefill(batch.clone()).await?; + let (_, decode_batch, _) = client.prefill(batch.clone(), None).await?; // Get latency let latency = start_time.elapsed(); diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs index 2ee3d7c551a..2e2e9a11c12 100644 --- a/benchmark/src/main.rs +++ b/benchmark/src/main.rs @@ -178,6 +178,7 @@ fn main() -> Result<(), Box> { .clear_cache(None) .await .expect("Unable to clear cache"); + tracing::info!("Connected"); // Run app diff --git a/docs/openapi.json b/docs/openapi.json index 957fe246b27..e7da2d40c54 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -316,6 +316,98 @@ } } }, + "/invocations": { + "post": { + "tags": [ + "Text Generation Inference" + ], + "summary": "Generate tokens from Sagemaker request", + "operationId": "sagemaker_compatibility", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SagemakerRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Generated Chat Completion", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SagemakerResponse" + } + }, + "text/event-stream": { + "schema": { + "$ref": "#/components/schemas/SagemakerStreamResponse" + } + } + } + }, + "422": { + "description": "Input validation error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "example": { + "error": "Input validation error", + "error_type": "validation" + } + } + } + }, + "424": { + "description": "Generation Error", + "content": { + "application/json": { + "schema": { + 
"$ref": "#/components/schemas/ErrorResponse" + }, + "example": { + "error": "Request failed during generation", + "error_type": "generation" + } + } + } + }, + "429": { + "description": "Model is overloaded", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "example": { + "error": "Model is overloaded", + "error_type": "overloaded" + } + } + } + }, + "500": { + "description": "Incomplete generation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "example": { + "error": "Incomplete generation", + "error_type": "incomplete_generation" + } + } + } + } + } + } + }, "/metrics": { "get": { "tags": [ @@ -1865,6 +1957,45 @@ "type": "string" } }, + "SagemakerRequest": { + "oneOf": [ + { + "$ref": "#/components/schemas/CompatGenerateRequest" + }, + { + "$ref": "#/components/schemas/ChatRequest" + }, + { + "$ref": "#/components/schemas/CompletionRequest" + } + ] + }, + "SagemakerResponse": { + "oneOf": [ + { + "$ref": "#/components/schemas/GenerateResponse" + }, + { + "$ref": "#/components/schemas/ChatCompletion" + }, + { + "$ref": "#/components/schemas/CompletionFinal" + } + ] + }, + "SagemakerStreamResponse": { + "oneOf": [ + { + "$ref": "#/components/schemas/StreamResponse" + }, + { + "$ref": "#/components/schemas/ChatCompletionChunk" + }, + { + "$ref": "#/components/schemas/Chunk" + } + ] + }, "SimpleToken": { "type": "object", "required": [ @@ -2186,4 +2317,4 @@ "description": "Hugging Face Text Generation Inference API" } ] -} \ No newline at end of file +} diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index 52043c80f8a..45d951bb1e1 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -141,9 +141,7 @@ TGI can be deployed on various cloud providers for scalable and robust text gene ## Amazon SageMaker -To enable the Messages API in Amazon SageMaker you need to set the environment variable `MESSAGES_API_ENABLED=true`. - -This will modify the `/invocations` route to accept Messages dictonaries consisting out of role and content. See the example below on how to deploy Llama with the new Messages API. +Amazon Sagemaker natively supports the message API: ```python import json @@ -161,12 +159,11 @@ except ValueError: hub = { 'HF_MODEL_ID':'HuggingFaceH4/zephyr-7b-beta', 'SM_NUM_GPUS': json.dumps(1), - 'MESSAGES_API_ENABLED': True } # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="1.4.0"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="2.3.2"), env=hub, role=role, ) diff --git a/docs/source/reference/launcher.md b/docs/source/reference/launcher.md index b1abd1ee19b..68e487d0a73 100644 --- a/docs/source/reference/launcher.md +++ b/docs/source/reference/launcher.md @@ -93,10 +93,10 @@ Options: ## KV_CACHE_DTYPE ```shell --kv-cache-dtype - Specify the dtype for the key-value cache. When this option is not provided, the dtype of the model is used (typically `float16` or `bfloat16`). Currently the only supported value is `fp8_e5m2` on CUDA + Specify the dtype for the key-value cache. When this option is not provided, the dtype of the model is used (typically `float16` or `bfloat16`). 
Currently the only supported value are `fp8_e4m3fn` and `fp8_e5m2` on CUDA [env: KV_CACHE_DTYPE=] - [possible values: fp8_e5m2] + [possible values: fp8_e4m3fn, fp8_e5m2] ``` ## TRUST_REMOTE_CODE diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index 28008bcd1da..ede1fc778f3 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -8,6 +8,7 @@ Text Generation Inference enables serving optimized models. The following sectio - [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal) - [Llama](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f) - [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) +- [Granite](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) - [Gemma](https://huggingface.co/google/gemma-7b) - [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224) - [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) diff --git a/docs/source/usage_statistics.md b/docs/source/usage_statistics.md index a2c406ecc0f..d3878b53ccb 100644 --- a/docs/source/usage_statistics.md +++ b/docs/source/usage_statistics.md @@ -26,7 +26,6 @@ As of release 2.1.2 this is an example of the data collected: "max_top_n_tokens": 5, "max_total_tokens": 2048, "max_waiting_tokens": 20, - "messages_api_enabled": false, "model_config": { "model_type": "Bloom" }, diff --git a/flake.lock b/flake.lock index aacdd30e401..76b4ca2fe38 100644 --- a/flake.lock +++ b/flake.lock @@ -978,15 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1728381423, - "narHash": "sha256-gpHy1WtlA8ZTd8XmxsdCoDd4Z7DE7co37lH7P+nsADA=", + "lastModified": 1729531056, + "narHash": "sha256-dW9IOA31+j3VS19WAWAmkJW2YCzeVZGqd6HpIJfODtI=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "93123736c97e9f7bfe825bfaf3d7de0fc9a21a1e", + "rev": "a84a90281a17b15762873845c947e5c78f5a8dd1", "type": "github" }, "original": { "owner": "huggingface", + "ref": "marlin-kernels-0.3.0", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index edef442f2cb..5c05bfae7fb 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/marlin-kernels-0.3.0"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { @@ -137,6 +137,11 @@ impure = callPackage ./nix/impure-shell.nix { inherit server; }; + impureWithCuda = callPackage ./nix/impure-shell.nix { + inherit server; + withCuda = true; + }; + impure-flash-attn-v1 = callPackage ./nix/impure-shell.nix { server = server.override { flash-attn = python3.pkgs.flash-attn-v1; }; }; diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index f24fc0790ce..356fa5e30ac 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -9,13 +9,16 @@ import sys import tempfile import time -from typing import Dict, List, Optional - import docker import pytest +import base64 + +from pathlib import Path +from typing import Dict, List, Optional from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError from docker.errors import NotFound from syrupy.extensions.json import JSONSnapshotExtension + from text_generation import AsyncClient from 
text_generation.types import ( BestOfSequence, @@ -403,6 +406,7 @@ def local_launcher( print(" ".join(args), file=sys.stderr) env["LOG_LEVEL"] = "info,text_generation_router=debug" + env["PREFILL_CHUNKING"] = "1" if not use_flash_attention: env["USE_FLASH_ATTENTION"] = "false" @@ -501,6 +505,7 @@ def docker_launcher( env = { "LOG_LEVEL": "info,text_generation_router=debug", + "PREFILL_CHUNKING": "1", } if not use_flash_attention: env["USE_FLASH_ATTENTION"] = "false" @@ -642,3 +647,22 @@ async def generate_load_inner( return responses return generate_load_inner + + +# TODO fix the server parsser to count inline image tokens correctly +@pytest.fixture +def chicken(): + path = Path(__file__).parent / "images" / "chicken_on_money.png" + + with open(path, "rb") as image_file: + encoded_string = base64.b64encode(image_file.read()) + return f"data:image/png;base64,{encoded_string.decode('utf-8')}" + + +@pytest.fixture +def cow_beach(): + path = Path(__file__).parent / "images" / "cow_beach.png" + + with open(path, "rb") as image_file: + encoded_string = base64.b64encode(image_file.read()) + return f"data:image/png;base64,{encoded_string.decode('utf-8')}" diff --git a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache.json b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache.json index c55dd593a1d..b82882c00b6 100644 --- a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache.json +++ b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache.json @@ -11,27 +11,27 @@ }, { "id": 3923, - "logprob": -5.6328125, + "logprob": -6.1875, "text": "What" }, { "id": 374, - "logprob": -1.2265625, + "logprob": -0.93359375, "text": " is" }, { "id": 5655, - "logprob": -9.1015625, + "logprob": -9.875, "text": " deep" }, { "id": 6975, - "logprob": -1.8085938, + "logprob": -1.1796875, "text": " learning" }, { "id": 30, - "logprob": -1.0439453, + "logprob": -1.75, "text": "?" 
} ], @@ -39,66 +39,66 @@ "tokens": [ { "id": 18682, - "logprob": -2.1992188, + "logprob": -1.109375, "special": false, "text": " Deep" }, { "id": 6975, - "logprob": -0.079956055, + "logprob": -0.005432129, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.2763672, + "logprob": -0.028808594, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.37548828, + "logprob": -0.013671875, "special": false, "text": " a" }, { "id": 27084, - "logprob": -1.4628906, + "logprob": -0.69921875, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.02885437, + "logprob": -0.0005874634, "special": false, "text": " of" }, { "id": 5780, - "logprob": -0.2565918, + "logprob": -0.026855469, "special": false, "text": " machine" }, { "id": 6975, - "logprob": -0.0063438416, + "logprob": -0.00020885468, "special": false, "text": " learning" }, { "id": 430, - "logprob": -1.3056641, + "logprob": -0.17773438, "special": false, "text": " that" }, { - "id": 374, - "logprob": -1.6035156, + "id": 18065, + "logprob": -0.703125, "special": false, - "text": " is" + "text": " involves" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that is" + "generated_text": " Deep learning is a subset of machine learning that involves" } diff --git a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json index d06d6e5662d..8bce3e108d5 100644 --- a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json +++ b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json @@ -1,8 +1,8 @@ { "details": { "best_of_sequences": null, - "finish_reason": "eos_token", - "generated_tokens": 3, + "finish_reason": "length", + "generated_tokens": 10, "prefill": [ { "id": 128000, @@ -11,22 +11,22 @@ }, { "id": 374, - "logprob": -22.96875, + "logprob": -18.0, "text": " is" }, { "id": 5655, - "logprob": -10.71875, + "logprob": -11.75, "text": " deep" }, { "id": 6975, - "logprob": -2.6992188, + "logprob": -2.0625, "text": " learning" }, { "id": 30, - "logprob": -4.8398438, + "logprob": -6.0, "text": "?" } ], @@ -34,24 +34,66 @@ "tokens": [ { "id": 720, - "logprob": -0.4411621, + "logprob": 0.0, "special": false, "text": " \n" }, { - "id": 220, - "logprob": -0.35864258, + "id": 34564, + "logprob": -0.11279297, + "special": false, + "text": "Deep" + }, + { + "id": 6975, + "logprob": -0.16015625, "special": false, - "text": " " + "text": " learning" }, { - "id": 128001, + "id": 320, + "logprob": -0.25195312, + "special": false, + "text": " (" + }, + { + "id": 16931, + "logprob": -1.703125, + "special": false, + "text": "DL" + }, + { + "id": 8, "logprob": 0.0, - "special": true, - "text": "<|end_of_text|>" + "special": false, + "text": ")" + }, + { + "id": 374, + "logprob": -1.140625, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": 0.0, + "special": false, + "text": " a" + }, + { + "id": 1207, + "logprob": -1.3125, + "special": false, + "text": " sub" + }, + { + "id": 2630, + "logprob": 0.0, + "special": false, + "text": "field" } ], "top_tokens": null }, - "generated_text": "What is deep learning? \n " + "generated_text": "What is deep learning? 
\nDeep learning (DL) is a subfield" } diff --git a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_load.json b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_load.json index 46670819f99..c7acee467c6 100644 --- a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_load.json +++ b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_load.json @@ -12,27 +12,27 @@ }, { "id": 3923, - "logprob": -5.6328125, + "logprob": -6.1875, "text": "What" }, { "id": 374, - "logprob": -1.2265625, + "logprob": -0.93359375, "text": " is" }, { "id": 5655, - "logprob": -9.1015625, + "logprob": -9.875, "text": " deep" }, { "id": 6975, - "logprob": -1.8085938, + "logprob": -1.1796875, "text": " learning" }, { "id": 30, - "logprob": -1.0439453, + "logprob": -1.75, "text": "?" } ], @@ -40,68 +40,68 @@ "tokens": [ { "id": 18682, - "logprob": -2.1992188, + "logprob": -1.109375, "special": false, "text": " Deep" }, { "id": 6975, - "logprob": -0.07897949, + "logprob": -0.0047912598, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.27734375, + "logprob": -0.025512695, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.37402344, + "logprob": -0.012145996, "special": false, "text": " a" }, { "id": 27084, - "logprob": -1.4511719, + "logprob": -0.72265625, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.02909851, + "logprob": -0.0005760193, "special": false, "text": " of" }, { "id": 5780, - "logprob": -0.25854492, + "logprob": -0.02722168, "special": false, "text": " machine" }, { "id": 6975, - "logprob": -0.0061798096, + "logprob": -0.00023651123, "special": false, "text": " learning" }, { "id": 430, - "logprob": -1.3046875, + "logprob": -0.17285156, "special": false, "text": " that" }, { - "id": 374, - "logprob": -1.5537109, + "id": 18065, + "logprob": -0.703125, "special": false, - "text": " is" + "text": " involves" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that is" + "generated_text": " Deep learning is a subset of machine learning that involves" }, { "details": { @@ -116,27 +116,27 @@ }, { "id": 3923, - "logprob": -5.6328125, + "logprob": -6.21875, "text": "What" }, { "id": 374, - "logprob": -1.2265625, + "logprob": -0.95703125, "text": " is" }, { "id": 5655, - "logprob": -9.1015625, + "logprob": -9.9375, "text": " deep" }, { "id": 6975, - "logprob": -1.8085938, + "logprob": -1.1328125, "text": " learning" }, { "id": 30, - "logprob": -1.0439453, + "logprob": -1.75, "text": "?" 
} ], @@ -144,68 +144,68 @@ "tokens": [ { "id": 18682, - "logprob": -2.1992188, + "logprob": -1.1796875, "special": false, "text": " Deep" }, { "id": 6975, - "logprob": -0.07897949, + "logprob": -0.005432129, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.27734375, + "logprob": -0.02758789, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.37402344, + "logprob": -0.013366699, "special": false, "text": " a" }, { "id": 27084, - "logprob": -1.4511719, + "logprob": -0.6953125, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.02909851, + "logprob": -0.0004863739, "special": false, "text": " of" }, { "id": 5780, - "logprob": -0.25854492, + "logprob": -0.02709961, "special": false, "text": " machine" }, { "id": 6975, - "logprob": -0.0061798096, + "logprob": -0.00022506714, "special": false, "text": " learning" }, { "id": 430, - "logprob": -1.3046875, + "logprob": -0.19726562, "special": false, "text": " that" }, { - "id": 374, - "logprob": -1.5537109, + "id": 18065, + "logprob": -0.77734375, "special": false, - "text": " is" + "text": " involves" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that is" + "generated_text": " Deep learning is a subset of machine learning that involves" }, { "details": { @@ -220,27 +220,27 @@ }, { "id": 3923, - "logprob": -5.6328125, + "logprob": -6.21875, "text": "What" }, { "id": 374, - "logprob": -1.2265625, + "logprob": -0.95703125, "text": " is" }, { "id": 5655, - "logprob": -9.1015625, + "logprob": -9.9375, "text": " deep" }, { "id": 6975, - "logprob": -1.8085938, + "logprob": -1.1328125, "text": " learning" }, { "id": 30, - "logprob": -1.0439453, + "logprob": -1.75, "text": "?" } ], @@ -248,68 +248,68 @@ "tokens": [ { "id": 18682, - "logprob": -2.1992188, + "logprob": -1.1796875, "special": false, "text": " Deep" }, { "id": 6975, - "logprob": -0.07897949, + "logprob": -0.005432129, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.27734375, + "logprob": -0.02758789, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.37402344, + "logprob": -0.013366699, "special": false, "text": " a" }, { "id": 27084, - "logprob": -1.4511719, + "logprob": -0.6953125, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.02909851, + "logprob": -0.0004863739, "special": false, "text": " of" }, { "id": 5780, - "logprob": -0.25854492, + "logprob": -0.02709961, "special": false, "text": " machine" }, { "id": 6975, - "logprob": -0.0061798096, + "logprob": -0.00022506714, "special": false, "text": " learning" }, { "id": 430, - "logprob": -1.3046875, + "logprob": -0.19726562, "special": false, "text": " that" }, { - "id": 374, - "logprob": -1.5537109, + "id": 18065, + "logprob": -0.77734375, "special": false, - "text": " is" + "text": " involves" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that is" + "generated_text": " Deep learning is a subset of machine learning that involves" }, { "details": { @@ -324,27 +324,27 @@ }, { "id": 3923, - "logprob": -5.6328125, + "logprob": -6.21875, "text": "What" }, { "id": 374, - "logprob": -1.2265625, + "logprob": -0.95703125, "text": " is" }, { "id": 5655, - "logprob": -9.1015625, + "logprob": -9.9375, "text": " deep" }, { "id": 6975, - "logprob": -1.8085938, + "logprob": -1.1328125, "text": " learning" }, { "id": 30, - "logprob": -1.0439453, + "logprob": -1.75, "text": "?" 
} ], @@ -352,67 +352,67 @@ "tokens": [ { "id": 18682, - "logprob": -2.1992188, + "logprob": -1.1796875, "special": false, "text": " Deep" }, { "id": 6975, - "logprob": -0.07897949, + "logprob": -0.005432129, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.27734375, + "logprob": -0.02758789, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.37402344, + "logprob": -0.013366699, "special": false, "text": " a" }, { "id": 27084, - "logprob": -1.4511719, + "logprob": -0.6953125, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.02909851, + "logprob": -0.0004863739, "special": false, "text": " of" }, { "id": 5780, - "logprob": -0.25854492, + "logprob": -0.02709961, "special": false, "text": " machine" }, { "id": 6975, - "logprob": -0.0061798096, + "logprob": -0.00022506714, "special": false, "text": " learning" }, { "id": 430, - "logprob": -1.3046875, + "logprob": -0.19726562, "special": false, "text": " that" }, { - "id": 374, - "logprob": -1.5537109, + "id": 18065, + "logprob": -0.77734375, "special": false, - "text": " is" + "text": " involves" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that is" + "generated_text": " Deep learning is a subset of machine learning that involves" } ] diff --git a/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq.json b/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq.json index 993bdaddc02..b835bf0752d 100644 --- a/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq.json +++ b/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq.json @@ -10,80 +10,95 @@ "text": "" }, { - "id": 3735, - "logprob": -11.0078125, - "text": "Test" + "id": 1824, + "logprob": -9.2890625, + "text": "What" }, { - "id": 2159, - "logprob": -13.59375, - "text": "request" + "id": 349, + "logprob": -1.1503906, + "text": "is" + }, + { + "id": 3534, + "logprob": -9.5859375, + "text": "deep" + }, + { + "id": 5168, + "logprob": -1.3945312, + "text": "learning" + }, + { + "id": 28804, + "logprob": -0.4555664, + "text": "?" 
} ], "seed": null, "tokens": [ { "id": 13, - "logprob": -1.7089844, + "logprob": -0.6953125, "special": false, "text": "\n" }, { "id": 13, - "logprob": -0.68847656, + "logprob": -0.4777832, "special": false, "text": "\n" }, { - "id": 28771, - "logprob": -1.9394531, + "id": 23229, + "logprob": -0.13256836, "special": false, - "text": "#" + "text": "Deep" }, { - "id": 3735, - "logprob": -2.8808594, + "id": 5168, + "logprob": -0.023849487, "special": false, - "text": " Test" + "text": " learning" }, { - "id": 2159, - "logprob": -0.37280273, + "id": 349, + "logprob": -0.13977051, "special": false, - "text": " request" + "text": " is" }, { - "id": 13, - "logprob": -0.26098633, + "id": 264, + "logprob": -0.14489746, "special": false, - "text": "\n" + "text": " a" }, { - "id": 13, - "logprob": -0.0017137527, + "id": 19804, + "logprob": -0.63183594, "special": false, - "text": "\n" + "text": " subset" }, { - "id": 1064, - "logprob": -2.2695312, + "id": 302, + "logprob": -0.010314941, "special": false, - "text": "##" + "text": " of" }, { - "id": 3735, - "logprob": -1.9238281, + "id": 5599, + "logprob": -0.0635376, "special": false, - "text": " Test" + "text": " machine" }, { - "id": 2159, - "logprob": -0.48828125, + "id": 5168, + "logprob": -0.0028572083, "special": false, - "text": " request" + "text": " learning" } ], "top_tokens": null }, - "generated_text": "\n\n# Test request\n\n## Test request" + "generated_text": "\n\nDeep learning is a subset of machine learning" } diff --git a/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_all_params.json b/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_all_params.json index 94411eefb40..77c8859907e 100644 --- a/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_all_params.json +++ b/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_all_params.json @@ -10,80 +10,90 @@ "text": "" }, { - "id": 3735, - "logprob": -11.0078125, - "text": "Test" + "id": 349, + "logprob": -12.0546875, + "text": "is" }, { - "id": 2159, - "logprob": -13.59375, - "text": "request" + "id": 3534, + "logprob": -10.53125, + "text": "deep" + }, + { + "id": 5168, + "logprob": -2.71875, + "text": "learning" + }, + { + "id": 28804, + "logprob": -5.0078125, + "text": "?" 
} ], "seed": 0, "tokens": [ { "id": 13, - "logprob": -0.34838867, + "logprob": 0.0, "special": false, "text": "\n" }, { - "id": 13940, - "logprob": -0.38916016, + "id": 23229, + "logprob": -0.18237305, "special": false, - "text": "``" + "text": "Deep" }, { - "id": 28832, + "id": 17504, "logprob": 0.0, "special": false, - "text": "`" + "text": " Learning" }, { - "id": 3371, - "logprob": -1.2529297, + "id": 349, + "logprob": 0.0, "special": false, - "text": "json" + "text": " is" }, { - "id": 13, + "id": 264, "logprob": 0.0, "special": false, - "text": "\n" + "text": " a" }, { - "id": 28751, + "id": 19804, "logprob": 0.0, "special": false, - "text": "{" + "text": " subset" }, { - "id": 13, + "id": 302, "logprob": 0.0, "special": false, - "text": "\n" + "text": " of" }, { - "id": 2287, - "logprob": 0.0, + "id": 13253, + "logprob": -0.6040039, "special": false, - "text": " " + "text": " Machine" }, { - "id": 345, + "id": 17504, "logprob": 0.0, "special": false, - "text": " \"" + "text": " Learning" }, { - "id": 3134, - "logprob": -0.640625, + "id": 28725, + "logprob": -0.11621094, "special": false, - "text": "request" + "text": "," } ], "top_tokens": null }, - "generated_text": "Test request\n```json\n{\n \"request" + "generated_text": "What is deep learning?\nDeep Learning is a subset of Machine Learning," } diff --git a/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_load.json b/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_load.json index 19e306a38f1..959e3c5578e 100644 --- a/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_load.json +++ b/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_load.json @@ -11,82 +11,97 @@ "text": "" }, { - "id": 3735, - "logprob": -11.0078125, - "text": "Test" + "id": 1824, + "logprob": -9.2890625, + "text": "What" }, { - "id": 2159, - "logprob": -13.59375, - "text": "request" + "id": 349, + "logprob": -1.1503906, + "text": "is" + }, + { + "id": 3534, + "logprob": -9.5859375, + "text": "deep" + }, + { + "id": 5168, + "logprob": -1.3945312, + "text": "learning" + }, + { + "id": 28804, + "logprob": -0.4555664, + "text": "?" 
} ], "seed": null, "tokens": [ { "id": 13, - "logprob": -1.7089844, + "logprob": -0.6953125, "special": false, "text": "\n" }, { "id": 13, - "logprob": -0.68847656, + "logprob": -0.4777832, "special": false, "text": "\n" }, { - "id": 28771, - "logprob": -1.9394531, + "id": 23229, + "logprob": -0.13232422, "special": false, - "text": "#" + "text": "Deep" }, { - "id": 3735, - "logprob": -2.8828125, + "id": 5168, + "logprob": -0.023834229, "special": false, - "text": " Test" + "text": " learning" }, { - "id": 2159, - "logprob": -0.37329102, + "id": 349, + "logprob": -0.13977051, "special": false, - "text": " request" + "text": " is" }, { - "id": 13, - "logprob": -0.2602539, + "id": 264, + "logprob": -0.14416504, "special": false, - "text": "\n" + "text": " a" }, { - "id": 13, - "logprob": -0.0017185211, + "id": 19804, + "logprob": -0.63183594, "special": false, - "text": "\n" + "text": " subset" }, { - "id": 1064, - "logprob": -2.2753906, + "id": 302, + "logprob": -0.010223389, "special": false, - "text": "##" + "text": " of" }, { - "id": 3735, - "logprob": -1.9316406, + "id": 5599, + "logprob": -0.064208984, "special": false, - "text": " Test" + "text": " machine" }, { - "id": 2159, - "logprob": -0.48217773, + "id": 5168, + "logprob": -0.0028266907, "special": false, - "text": " request" + "text": " learning" } ], "top_tokens": null }, - "generated_text": "\n\n# Test request\n\n## Test request" + "generated_text": "\n\nDeep learning is a subset of machine learning" }, { "details": { @@ -100,82 +115,97 @@ "text": "" }, { - "id": 3735, - "logprob": -11.0078125, - "text": "Test" + "id": 1824, + "logprob": -9.2890625, + "text": "What" }, { - "id": 2159, - "logprob": -13.59375, - "text": "request" + "id": 349, + "logprob": -1.1425781, + "text": "is" + }, + { + "id": 3534, + "logprob": -9.59375, + "text": "deep" + }, + { + "id": 5168, + "logprob": -1.390625, + "text": "learning" + }, + { + "id": 28804, + "logprob": -0.45532227, + "text": "?" 
} ], "seed": null, "tokens": [ { "id": 13, - "logprob": -1.7089844, + "logprob": -0.6953125, "special": false, "text": "\n" }, { "id": 13, - "logprob": -0.68847656, + "logprob": -0.48339844, "special": false, "text": "\n" }, { - "id": 28771, - "logprob": -1.9394531, + "id": 23229, + "logprob": -0.13256836, "special": false, - "text": "#" + "text": "Deep" }, { - "id": 3735, - "logprob": -2.8828125, + "id": 5168, + "logprob": -0.02420044, "special": false, - "text": " Test" + "text": " learning" }, { - "id": 2159, - "logprob": -0.37329102, + "id": 349, + "logprob": -0.13977051, "special": false, - "text": " request" + "text": " is" }, { - "id": 13, - "logprob": -0.2602539, + "id": 264, + "logprob": -0.14501953, "special": false, - "text": "\n" + "text": " a" }, { - "id": 13, - "logprob": -0.0017185211, + "id": 19804, + "logprob": -0.63134766, "special": false, - "text": "\n" + "text": " subset" }, { - "id": 1064, - "logprob": -2.2753906, + "id": 302, + "logprob": -0.010223389, "special": false, - "text": "##" + "text": " of" }, { - "id": 3735, - "logprob": -1.9316406, + "id": 5599, + "logprob": -0.06427002, "special": false, - "text": " Test" + "text": " machine" }, { - "id": 2159, - "logprob": -0.48217773, + "id": 5168, + "logprob": -0.002817154, "special": false, - "text": " request" + "text": " learning" } ], "top_tokens": null }, - "generated_text": "\n\n# Test request\n\n## Test request" + "generated_text": "\n\nDeep learning is a subset of machine learning" }, { "details": { @@ -189,82 +219,97 @@ "text": "" }, { - "id": 3735, - "logprob": -11.0078125, - "text": "Test" + "id": 1824, + "logprob": -9.2890625, + "text": "What" }, { - "id": 2159, - "logprob": -13.59375, - "text": "request" + "id": 349, + "logprob": -1.1425781, + "text": "is" + }, + { + "id": 3534, + "logprob": -9.59375, + "text": "deep" + }, + { + "id": 5168, + "logprob": -1.390625, + "text": "learning" + }, + { + "id": 28804, + "logprob": -0.45532227, + "text": "?" 
} ], "seed": null, "tokens": [ { "id": 13, - "logprob": -1.7089844, + "logprob": -0.6953125, "special": false, "text": "\n" }, { "id": 13, - "logprob": -0.68847656, + "logprob": -0.48339844, "special": false, "text": "\n" }, { - "id": 28771, - "logprob": -1.9394531, + "id": 23229, + "logprob": -0.13256836, "special": false, - "text": "#" + "text": "Deep" }, { - "id": 3735, - "logprob": -2.8828125, + "id": 5168, + "logprob": -0.02420044, "special": false, - "text": " Test" + "text": " learning" }, { - "id": 2159, - "logprob": -0.37329102, + "id": 349, + "logprob": -0.13977051, "special": false, - "text": " request" + "text": " is" }, { - "id": 13, - "logprob": -0.2602539, + "id": 264, + "logprob": -0.14501953, "special": false, - "text": "\n" + "text": " a" }, { - "id": 13, - "logprob": -0.0017185211, + "id": 19804, + "logprob": -0.63134766, "special": false, - "text": "\n" + "text": " subset" }, { - "id": 1064, - "logprob": -2.2753906, + "id": 302, + "logprob": -0.010223389, "special": false, - "text": "##" + "text": " of" }, { - "id": 3735, - "logprob": -1.9316406, + "id": 5599, + "logprob": -0.06427002, "special": false, - "text": " Test" + "text": " machine" }, { - "id": 2159, - "logprob": -0.48217773, + "id": 5168, + "logprob": -0.002817154, "special": false, - "text": " request" + "text": " learning" } ], "top_tokens": null }, - "generated_text": "\n\n# Test request\n\n## Test request" + "generated_text": "\n\nDeep learning is a subset of machine learning" }, { "details": { @@ -278,81 +323,96 @@ "text": "" }, { - "id": 3735, - "logprob": -11.0078125, - "text": "Test" + "id": 1824, + "logprob": -9.2890625, + "text": "What" }, { - "id": 2159, - "logprob": -13.59375, - "text": "request" + "id": 349, + "logprob": -1.1425781, + "text": "is" + }, + { + "id": 3534, + "logprob": -9.59375, + "text": "deep" + }, + { + "id": 5168, + "logprob": -1.390625, + "text": "learning" + }, + { + "id": 28804, + "logprob": -0.45532227, + "text": "?" 
} ], "seed": null, "tokens": [ { "id": 13, - "logprob": -1.7089844, + "logprob": -0.6953125, "special": false, "text": "\n" }, { "id": 13, - "logprob": -0.68847656, + "logprob": -0.48339844, "special": false, "text": "\n" }, { - "id": 28771, - "logprob": -1.9394531, + "id": 23229, + "logprob": -0.13256836, "special": false, - "text": "#" + "text": "Deep" }, { - "id": 3735, - "logprob": -2.8828125, + "id": 5168, + "logprob": -0.02420044, "special": false, - "text": " Test" + "text": " learning" }, { - "id": 2159, - "logprob": -0.37329102, + "id": 349, + "logprob": -0.13977051, "special": false, - "text": " request" + "text": " is" }, { - "id": 13, - "logprob": -0.2602539, + "id": 264, + "logprob": -0.14501953, "special": false, - "text": "\n" + "text": " a" }, { - "id": 13, - "logprob": -0.0017185211, + "id": 19804, + "logprob": -0.63134766, "special": false, - "text": "\n" + "text": " subset" }, { - "id": 1064, - "logprob": -2.2753906, + "id": 302, + "logprob": -0.010223389, "special": false, - "text": "##" + "text": " of" }, { - "id": 3735, - "logprob": -1.9316406, + "id": 5599, + "logprob": -0.06427002, "special": false, - "text": " Test" + "text": " machine" }, { - "id": 2159, - "logprob": -0.48217773, + "id": 5168, + "logprob": -0.002817154, "special": false, - "text": " request" + "text": " learning" } ], "top_tokens": null }, - "generated_text": "\n\n# Test request\n\n## Test request" + "generated_text": "\n\nDeep learning is a subset of machine learning" } ] diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json index 0d6dca31de8..cfabe3c65cd 100644 --- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json +++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json @@ -11,32 +11,32 @@ }, { "id": 338, - "logprob": -0.7133789, + "logprob": -0.6201172, "text": "is" }, { "id": 16030, - "logprob": -13.9296875, + "logprob": -13.6484375, "text": "gradient" }, { "id": 26815, - "logprob": -0.048919678, + "logprob": -0.003894806, "text": "descent" }, { "id": 29973, - "logprob": -3.0078125, + "logprob": -2.6386719, "text": "?" 
}, { "id": 13, - "logprob": -2.8105469, + "logprob": -6.46875, "text": "\n" }, { "id": 13, - "logprob": -0.84521484, + "logprob": -6.6875, "text": "\n" } ], @@ -44,66 +44,66 @@ "tokens": [ { "id": 25584, - "logprob": -0.017028809, + "logprob": -0.008979797, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.0027313232, + "logprob": -8.34465e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023254395, + "logprob": -0.0009407997, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.0623207e-05, + "logprob": -0.0003838539, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5361328, + "id": 385, + "logprob": -0.24499512, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17578125, + "id": 13883, + "logprob": -0.010406494, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.00024354458, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.00011539459, + "id": 15574, + "logprob": -0.6582031, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.47436523, + "id": 1304, + "logprob": -0.00092840195, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027680397, + "id": 297, + "logprob": -0.19470215, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" } diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json index 38b803353d2..b524859fdc7 100644 --- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json +++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json @@ -5,95 +5,95 @@ "generated_tokens": 10, "prefill": [ { - "id": 16030, + "id": 338, "logprob": null, + "text": "is" + }, + { + "id": 16030, + "logprob": -13.328125, "text": "gradient" }, { "id": 26815, - "logprob": -6.4960938, + "logprob": -0.24023438, "text": "descent" }, { "id": 29973, - "logprob": -5.1484375, + "logprob": -3.1386719, "text": "?" }, { "id": 13, - "logprob": -4.0351562, - "text": "\n" - }, - { - "id": 13, - "logprob": -5.2265625, + "logprob": -3.0878906, "text": "\n" } ], "seed": 0, "tokens": [ { - "id": 10994, - "logprob": -1.1542969, + "id": 25584, + "logprob": 0.0, "special": false, - "text": "Hello" + "text": "Grad" }, { - "id": 29991, + "id": 993, "logprob": 0.0, "special": false, - "text": "!" 
+ "text": "ient" }, { - "id": 739, + "id": 2726, "logprob": 0.0, "special": false, - "text": " It" + "text": " Des" }, { - "id": 2444, - "logprob": -0.42260742, + "id": 1760, + "logprob": 0.0, "special": false, - "text": " seems" + "text": "cent" }, { - "id": 366, - "logprob": 0.0, + "id": 313, + "logprob": -0.12322998, "special": false, - "text": " you" + "text": " (" }, { - "id": 29915, + "id": 29954, "logprob": 0.0, "special": false, - "text": "'" + "text": "G" }, { - "id": 276, - "logprob": -0.9838867, + "id": 29928, + "logprob": 0.0, "special": false, - "text": "re" + "text": "D" }, { - "id": 3211, + "id": 29897, "logprob": 0.0, "special": false, - "text": " address" + "text": ")" }, { - "id": 292, - "logprob": 0.0, + "id": 338, + "logprob": -0.6040039, "special": false, - "text": "ing" + "text": " is" }, { - "id": 263, - "logprob": -0.15124512, + "id": 385, + "logprob": -0.1796875, "special": false, - "text": " a" + "text": " an" } ], "top_tokens": null }, - "generated_text": "What is gradient descent?\n\nHello! It seems you're addressing a" + "generated_text": "What is gradient descent?\nGradient Descent (GD) is an" } diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json index f1f81152c46..2c977d8b2be 100644 --- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json +++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json @@ -12,32 +12,32 @@ }, { "id": 338, - "logprob": -0.7133789, + "logprob": -0.6201172, "text": "is" }, { "id": 16030, - "logprob": -13.9296875, + "logprob": -13.6484375, "text": "gradient" }, { "id": 26815, - "logprob": -0.048919678, + "logprob": -0.003894806, "text": "descent" }, { "id": 29973, - "logprob": -3.0078125, + "logprob": -2.6386719, "text": "?" 
}, { "id": 13, - "logprob": -2.8105469, + "logprob": -6.46875, "text": "\n" }, { "id": 13, - "logprob": -0.84521484, + "logprob": -6.6875, "text": "\n" } ], @@ -45,68 +45,68 @@ "tokens": [ { "id": 25584, - "logprob": -0.017028809, + "logprob": -0.008979797, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.0028476715, + "logprob": -8.34465e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023971558, + "logprob": -0.00097084045, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.0384789e-05, + "logprob": -0.0003838539, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5229492, + "id": 385, + "logprob": -0.23840332, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17602539, + "id": 13883, + "logprob": -0.010406494, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.0002501011, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.000116467476, + "id": 15574, + "logprob": -0.6582031, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.47436523, + "id": 1304, + "logprob": -0.00092840195, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027871132, + "id": 297, + "logprob": -0.18933105, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" }, { "details": { @@ -121,32 +121,32 @@ }, { "id": 338, - "logprob": -0.7128906, + "logprob": -0.6113281, "text": "is" }, { "id": 16030, - "logprob": -13.9375, + "logprob": -13.6640625, "text": "gradient" }, { "id": 26815, - "logprob": -0.05053711, + "logprob": -0.003929138, "text": "descent" }, { "id": 29973, - "logprob": -3.0058594, + "logprob": -2.625, "text": "?" 
}, { "id": 13, - "logprob": -2.8242188, + "logprob": -6.484375, "text": "\n" }, { "id": 13, - "logprob": -0.84521484, + "logprob": -6.6875, "text": "\n" } ], @@ -154,68 +154,68 @@ "tokens": [ { "id": 25584, - "logprob": -0.018859863, + "logprob": -0.009017944, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.002822876, + "logprob": -9.536743e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023254395, + "logprob": -0.00097084045, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.0384789e-05, + "logprob": -0.0003838539, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5229492, + "id": 385, + "logprob": -0.24499512, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17126465, + "id": 13883, + "logprob": -0.010406494, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.0002501011, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.0001155138, + "id": 15574, + "logprob": -0.6435547, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.47436523, + "id": 1304, + "logprob": -0.0009279251, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027036667, + "id": 297, + "logprob": -0.18933105, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" }, { "details": { @@ -230,32 +230,32 @@ }, { "id": 338, - "logprob": -0.71484375, + "logprob": -0.609375, "text": "is" }, { "id": 16030, - "logprob": -13.9375, + "logprob": -13.671875, "text": "gradient" }, { "id": 26815, - "logprob": -0.049346924, + "logprob": -0.0040016174, "text": "descent" }, { "id": 29973, - "logprob": -3.0078125, + "logprob": -2.6230469, "text": "?" 
}, { "id": 13, - "logprob": -2.8242188, + "logprob": -6.453125, "text": "\n" }, { "id": 13, - "logprob": -0.86328125, + "logprob": -6.6875, "text": "\n" } ], @@ -263,68 +263,68 @@ "tokens": [ { "id": 25584, - "logprob": -0.017196655, + "logprob": -0.008956909, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.0028438568, + "logprob": -8.34465e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023254395, + "logprob": -0.0009407997, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.026558e-05, + "logprob": -0.0003721714, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5229492, + "id": 385, + "logprob": -0.24499512, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17602539, + "id": 13883, + "logprob": -0.010406494, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.0002501011, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.00011622906, + "id": 15574, + "logprob": -0.6435547, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.48608398, + "id": 1304, + "logprob": -0.00092601776, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027894974, + "id": 297, + "logprob": -0.19177246, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" }, { "details": { @@ -339,32 +339,32 @@ }, { "id": 338, - "logprob": -0.7192383, + "logprob": -0.609375, "text": "is" }, { "id": 16030, - "logprob": -13.9375, + "logprob": -13.6640625, "text": "gradient" }, { "id": 26815, - "logprob": -0.050445557, + "logprob": -0.0038967133, "text": "descent" }, { "id": 29973, - "logprob": -3.0078125, + "logprob": -2.6347656, "text": "?" 
}, { "id": 13, - "logprob": -2.8242188, + "logprob": -6.453125, "text": "\n" }, { "id": 13, - "logprob": -0.8276367, + "logprob": -6.6875, "text": "\n" } ], @@ -372,67 +372,67 @@ "tokens": [ { "id": 25584, - "logprob": -0.01727295, + "logprob": -0.008979797, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.0027542114, + "logprob": -9.536743e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023254395, + "logprob": -0.0009407997, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.0384789e-05, + "logprob": -0.00038409233, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5229492, + "id": 385, + "logprob": -0.24499512, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17126465, + "id": 13883, + "logprob": -0.010414124, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.00024354458, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.00011301041, + "id": 15574, + "logprob": -0.6435547, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.48608398, + "id": 1304, + "logprob": -0.0009279251, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027894974, + "id": 297, + "logprob": -0.19470215, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" } ] diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json index 26224118c7e..8548e376c08 100644 --- a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json +++ b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json @@ -11,57 +11,57 @@ }, { "id": 3226, - "logprob": -8.9453125, + "logprob": -9.0234375, "text": " ge" }, { "id": 21017, - "logprob": -8.8515625, + "logprob": -9.0859375, "text": "ometric" }, { "id": 81, - "logprob": -0.21875, + "logprob": -0.25585938, "text": "_" }, { "id": 6009, - "logprob": -1.2773438, + "logprob": -2.1972656, "text": "mean" }, { "id": 26, - "logprob": -0.25195312, + "logprob": -0.2998047, "text": "(" }, { "id": 62, - "logprob": -4.8203125, + "logprob": -5.6445312, "text": "L" }, { "id": 44, - "logprob": -3.7734375, + "logprob": -3.0839844, "text": ":" }, { "id": 1682, - "logprob": -0.8310547, + "logprob": -0.6748047, "text": " List" }, { "id": 77, - "logprob": -0.22766113, + "logprob": -0.3864746, "text": "[" }, { "id": 1808, - "logprob": -0.46240234, + "logprob": -0.9355469, "text": "float" }, { "id": 10794, - "logprob": -3.0234375, + "logprob": -2.5371094, "text": "]):" } ], @@ -69,7 +69,7 @@ "tokens": [ { "id": 284, - "logprob": -0.04626465, + "logprob": -1.1679688, "special": false, "text": "\n " }, diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json index 015912f8b61..a6b805342aa 100644 --- a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json +++ 
b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json @@ -11,57 +11,57 @@ }, { "id": 3226, - "logprob": -8.9453125, + "logprob": -9.015625, "text": " ge" }, { "id": 21017, - "logprob": -8.859375, + "logprob": -9.0859375, "text": "ometric" }, { "id": 81, - "logprob": -0.21984863, + "logprob": -0.25585938, "text": "_" }, { "id": 6009, - "logprob": -1.2861328, + "logprob": -2.2304688, "text": "mean" }, { "id": 26, - "logprob": -0.25219727, + "logprob": -0.29760742, "text": "(" }, { "id": 62, - "logprob": -4.8007812, + "logprob": -5.6796875, "text": "L" }, { "id": 44, - "logprob": -3.7949219, + "logprob": -3.0742188, "text": ":" }, { "id": 1682, - "logprob": -0.8046875, + "logprob": -0.67626953, "text": " List" }, { "id": 77, - "logprob": -0.22424316, + "logprob": -0.38842773, "text": "[" }, { "id": 1808, - "logprob": -0.46191406, + "logprob": -0.9165039, "text": "float" }, { "id": 10794, - "logprob": -3.0253906, + "logprob": -2.5527344, "text": "]):" } ], @@ -69,7 +69,7 @@ "tokens": [ { "id": 284, - "logprob": 0.0, + "logprob": -0.048583984, "special": false, "text": "\n " }, diff --git a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json index 40ec7e2f667..9fd950a219a 100644 --- a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json +++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json @@ -26,7 +26,7 @@ }, { "id": 259, - "logprob": -0.46948242, + "logprob": -0.47070312, "special": false, "text": " " }, @@ -38,7 +38,7 @@ }, { "id": 35622, - "logprob": -0.79589844, + "logprob": -0.796875, "special": false, "text": " cloud" }, @@ -75,5 +75,5 @@ ], "top_tokens": null }, - "generated_text": "Why is the sky blue?blue sky, clouds and clouds" + "generated_text": "Why is the sky blue?blue sky , clouds and clouds" } diff --git a/integration-tests/models/test_flash_llama_fp8_kv_cache.py b/integration-tests/models/test_flash_llama_fp8_kv_cache.py index 05e9f0dd9ec..ccd7f78fe6f 100644 --- a/integration-tests/models/test_flash_llama_fp8_kv_cache.py +++ b/integration-tests/models/test_flash_llama_fp8_kv_cache.py @@ -4,7 +4,9 @@ @pytest.fixture(scope="module") def flash_llama_fp8_kv_cache_handle(launcher): with launcher( - "meta-llama/Meta-Llama-3-8B", num_shard=2, kv_cache_dtype="fp8_e5m2" + "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", + num_shard=2, + kv_cache_dtype="fp8_e4m3fn", ) as handle: yield handle @@ -25,7 +27,7 @@ async def test_flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache, response_snaps assert ( response.generated_text - == " Deep learning is a subset of machine learning that is" + == " Deep learning is a subset of machine learning that involves" ) assert response.details.generated_tokens == 10 assert response == response_snapshot @@ -69,7 +71,7 @@ async def test_flash_llama_fp8_kv_cache_load( assert len(responses) == 4 assert ( responses[0].generated_text - == " Deep learning is a subset of machine learning that is" + == " Deep learning is a subset of machine learning that involves" ) assert all( [r.generated_text == responses[0].generated_text for r in responses] diff --git a/integration-tests/models/test_flash_mixtral_gptq.py b/integration-tests/models/test_flash_mixtral_gptq.py index eb880628481..47bcb0bf3d7 100644 --- a/integration-tests/models/test_flash_mixtral_gptq.py +++ b/integration-tests/models/test_flash_mixtral_gptq.py @@ -3,7 +3,11 @@ 
@pytest.fixture(scope="module") def flash_mixtral_gptq_handle(launcher): - with launcher("TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ", num_shard=2) as handle: + with launcher( + "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ", + revision="gptq-4bit-128g-actorder_True", + num_shard=2, + ) as handle: yield handle @@ -16,7 +20,12 @@ async def flash_mixtral_gptq(flash_mixtral_gptq_handle): @pytest.mark.asyncio async def test_flash_mixtral_gptq(flash_mixtral_gptq, response_snapshot): response = await flash_mixtral_gptq.generate( - "Test request", max_new_tokens=10, decoder_input_details=True + "What is deep learning?", max_new_tokens=10, decoder_input_details=True + ) + + assert response.details.generated_tokens == 10 + assert ( + response.generated_text == "\n\nDeep learning is a subset of machine learning" ) assert response == response_snapshot @@ -25,7 +34,7 @@ async def test_flash_mixtral_gptq(flash_mixtral_gptq, response_snapshot): @pytest.mark.asyncio async def test_flash_mixtral_gptq_all_params(flash_mixtral_gptq, response_snapshot): response = await flash_mixtral_gptq.generate( - "Test request", + "What is deep learning?", max_new_tokens=10, repetition_penalty=1.2, return_full_text=True, @@ -41,6 +50,10 @@ async def test_flash_mixtral_gptq_all_params(flash_mixtral_gptq, response_snapsh ) assert response.details.generated_tokens == 10 + assert ( + response.generated_text + == "What is deep learning?\nDeep Learning is a subset of Machine Learning," + ) assert response == response_snapshot @@ -49,10 +62,14 @@ async def test_flash_mixtral_gptq_load( flash_mixtral_gptq, generate_load, response_snapshot ): responses = await generate_load( - flash_mixtral_gptq, "Test request", max_new_tokens=10, n=4 + flash_mixtral_gptq, "What is deep learning?", max_new_tokens=10, n=4 ) assert len(responses) == 4 + assert ( + responses[0].generated_text + == "\n\nDeep learning is a subset of machine learning" + ) assert all( [r.generated_text == responses[0].generated_text for r in responses] ), f"{[r.generated_text for r in responses]}" diff --git a/integration-tests/models/test_flash_pali_gemma.py b/integration-tests/models/test_flash_pali_gemma.py index 52ecaed4612..93962eb3e8f 100644 --- a/integration-tests/models/test_flash_pali_gemma.py +++ b/integration-tests/models/test_flash_pali_gemma.py @@ -1,5 +1,4 @@ import pytest -import base64 @pytest.fixture(scope="module") @@ -20,24 +19,11 @@ async def flash_pali_gemma(flash_pali_gemma_handle): return flash_pali_gemma_handle.client -def get_chicken(): - with open("integration-tests/images/chicken_on_money.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - -def get_cow_beach(): - with open("integration-tests/images/cow_beach.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - @pytest.mark.release @pytest.mark.asyncio @pytest.mark.private -async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot): - cow = get_cow_beach() - inputs = f"![]({cow})Where is the cow standing?\n" +async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot, cow_beach): + inputs = f"![]({cow_beach})Where is the cow standing?\n" response = await flash_pali_gemma.generate(inputs, max_new_tokens=20) assert response.generated_text == "beach" @@ -47,9 +33,9 @@ async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot): @pytest.mark.release @pytest.mark.asyncio 
@pytest.mark.private -async def test_flash_pali_gemma_two_images(flash_pali_gemma, response_snapshot): - chicken = get_chicken() - cow_beach = get_cow_beach() +async def test_flash_pali_gemma_two_images( + flash_pali_gemma, response_snapshot, chicken, cow_beach +): response = await flash_pali_gemma.generate( f"caption![]({chicken})![]({cow_beach})\n", max_new_tokens=20, diff --git a/integration-tests/models/test_flash_phi35_moe.py b/integration-tests/models/test_flash_phi35_moe.py index 2173740ad3c..d3043b028a8 100644 --- a/integration-tests/models/test_flash_phi35_moe.py +++ b/integration-tests/models/test_flash_phi35_moe.py @@ -25,7 +25,7 @@ async def test_flash_phi35_moe(flash_phi35_moe, response_snapshot): assert response.details.generated_tokens == 10 assert ( response.generated_text - == "Gradient descent is a first-order optimization algorithm" + == "Gradient descent is an optimization algorithm commonly used in" ) assert response == response_snapshot @@ -33,7 +33,7 @@ async def test_flash_phi35_moe(flash_phi35_moe, response_snapshot): @pytest.mark.asyncio async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snapshot): response = await flash_phi35_moe.generate( - "What is gradient descent?\n\n", + "What is gradient descent?\n", max_new_tokens=10, repetition_penalty=1.2, return_full_text=True, @@ -51,7 +51,7 @@ async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snapshot): assert response.details.generated_tokens == 10 assert ( response.generated_text - == "What is gradient descent?\n\nHello! It seems you're addressing a" + == "What is gradient descent?\nGradient Descent (GD) is an" ) assert response == response_snapshot @@ -66,7 +66,7 @@ async def test_flash_phi35_moe_load(flash_phi35_moe, generate_load, response_sna assert responses[0].details.generated_tokens == 10 assert ( responses[0].generated_text - == "Gradient descent is a first-order optimization algorithm" + == "Gradient descent is an optimization algorithm commonly used in" ) assert all( [r.generated_text == responses[0].generated_text for r in responses] diff --git a/integration-tests/models/test_grammar_response_format_llama.py b/integration-tests/models/test_grammar_response_format_llama.py index 25bf9d98eff..eb3268cea4f 100644 --- a/integration-tests/models/test_grammar_response_format_llama.py +++ b/integration-tests/models/test_grammar_response_format_llama.py @@ -25,7 +25,6 @@ async def llama_grammar(llama_grammar_handle): @pytest.mark.release @pytest.mark.asyncio async def test_grammar_response_format_llama_json(llama_grammar, response_snapshot): - class Weather(BaseModel): unit: str temperature: List[int] diff --git a/integration-tests/models/test_idefics.py b/integration-tests/models/test_idefics.py index eb573385b7c..e5d08bb74c6 100644 --- a/integration-tests/models/test_idefics.py +++ b/integration-tests/models/test_idefics.py @@ -1,5 +1,4 @@ import pytest -import base64 @pytest.fixture(scope="module") @@ -16,22 +15,8 @@ async def idefics(idefics_handle): return idefics_handle.client -# TODO fix the server parsser to count inline image tokens correctly -def get_chicken(): - with open("integration-tests/images/chicken_on_money.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - -def get_cow_beach(): - with open("integration-tests/images/cow_beach.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return 
f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - @pytest.mark.asyncio -async def test_idefics(idefics, response_snapshot): - chicken = get_chicken() +async def test_idefics(idefics, response_snapshot, chicken): response = await idefics.generate( f"User:![]({chicken})Can you tell me a very short story based on the image?", max_new_tokens=10, @@ -48,9 +33,7 @@ async def test_idefics(idefics, response_snapshot): @pytest.mark.release @pytest.mark.asyncio @pytest.mark.private -async def test_idefics_two_images(idefics, response_snapshot): - chicken = get_chicken() - cow_beach = get_cow_beach() +async def test_idefics_two_images(idefics, response_snapshot, chicken, cow_beach): response = await idefics.generate( f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken? \nAssistant:", max_new_tokens=20, @@ -63,8 +46,7 @@ async def test_idefics_two_images(idefics, response_snapshot): @pytest.mark.release @pytest.mark.asyncio -async def test_idefics_load(idefics, generate_load, response_snapshot): - chicken = get_chicken() +async def test_idefics_load(idefics, generate_load, response_snapshot, chicken): responses = await generate_load( idefics, f"User:![]({chicken})Can you tell me a very short story based on the image?", diff --git a/integration-tests/models/test_idefics2.py b/integration-tests/models/test_idefics2.py index c5f48da3525..881e37f9b95 100644 --- a/integration-tests/models/test_idefics2.py +++ b/integration-tests/models/test_idefics2.py @@ -1,18 +1,4 @@ import pytest -import base64 - - -# TODO fix the server parsser to count inline image tokens correctly -def get_chicken(): - with open("integration-tests/images/chicken_on_money.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - -def get_cow_beach(): - with open("integration-tests/images/cow_beach.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" @pytest.fixture(scope="module") @@ -31,8 +17,9 @@ async def flash_idefics2_next(flash_idefics2_next_handle): @pytest.mark.asyncio @pytest.mark.private -async def test_flash_idefics2_next_simple(flash_idefics2_next, response_snapshot): - chicken = get_chicken() +async def test_flash_idefics2_next_simple( + flash_idefics2_next, response_snapshot, chicken +): response = await flash_idefics2_next.generate( f"User:![]({chicken})Write me a short story \nAssistant:", max_new_tokens=10, @@ -46,9 +33,9 @@ async def test_flash_idefics2_next_simple(flash_idefics2_next, response_snapshot @pytest.mark.asyncio @pytest.mark.private -async def test_flash_idefics2_two_images(flash_idefics2_next, response_snapshot): - chicken = get_chicken() - cow_beach = get_cow_beach() +async def test_flash_idefics2_two_images( + flash_idefics2_next, response_snapshot, chicken, cow_beach +): response = await flash_idefics2_next.generate( f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken? 
\nAssistant:", max_new_tokens=20, @@ -87,9 +74,8 @@ async def test_flash_idefics2_next_all_params(flash_idefics2_next, response_snap @pytest.mark.asyncio @pytest.mark.private async def test_flash_idefics2_next_load( - flash_idefics2_next, generate_load, response_snapshot + flash_idefics2_next, generate_load, response_snapshot, chicken ): - chicken = get_chicken() responses = await generate_load( flash_idefics2_next, f"User:![]({chicken})Write me a short story \nAssistant:", diff --git a/integration-tests/models/test_llava_next.py b/integration-tests/models/test_llava_next.py index ea277d713e0..1ac8f172db7 100644 --- a/integration-tests/models/test_llava_next.py +++ b/integration-tests/models/test_llava_next.py @@ -1,12 +1,4 @@ import pytest -import base64 - - -# TODO fix the server parsser to count inline image tokens correctly -def get_chicken(): - with open("integration-tests/images/chicken_on_money.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" @pytest.fixture(scope="module") @@ -29,8 +21,7 @@ async def flash_llava_next(flash_llava_next_handle): @pytest.mark.release @pytest.mark.asyncio @pytest.mark.private -async def test_flash_llava_next_simple(flash_llava_next, response_snapshot): - chicken = get_chicken() +async def test_flash_llava_next_simple(flash_llava_next, response_snapshot, chicken): response = await flash_llava_next.generate( f"User:![]({chicken})Can you tell me a very short story based on the image?", max_new_tokens=10, @@ -70,9 +61,8 @@ async def test_flash_llava_next_all_params(flash_llava_next, response_snapshot): @pytest.mark.asyncio @pytest.mark.private async def test_flash_llava_next_load( - flash_llava_next, generate_load, response_snapshot + flash_llava_next, generate_load, response_snapshot, chicken ): - chicken = get_chicken() responses = await generate_load( flash_llava_next, f"User:![]({chicken})Can you tell me a very short story based on the image?", diff --git a/integration-tests/models/test_mllama.py b/integration-tests/models/test_mllama.py index 1b4264aacd1..02781707e05 100644 --- a/integration-tests/models/test_mllama.py +++ b/integration-tests/models/test_mllama.py @@ -1,5 +1,4 @@ import pytest -import base64 import asyncio @@ -15,22 +14,8 @@ async def mllama(mllama_handle): return mllama_handle.client -# TODO fix the server parsser to count inline image tokens correctly -def get_chicken(): - with open("integration-tests/images/chicken_on_money.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - -def get_cow_beach(): - with open("integration-tests/images/cow_beach.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - @pytest.mark.asyncio async def test_mllama_simpl(mllama, response_snapshot): - # chicken = get_chicken() response = await mllama.chat( max_tokens=10, temperature=0.0, diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 42962dffb7b..71bbcbd8cd9 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -68,7 +68,7 @@ fn get_config( fn resolve_attention(config: &Option, lora_adapters: &Option) -> (String, String) { let compute_capability = gpu::get_cuda_capability(); - let mut prefix_caching: Option = std::env::var("USE_PREFIX_CACHING").ok(); + let mut prefix_caching: Option = std::env::var("PREFIX_CACHING").ok(); let mut attention: 
Option = std::env::var("ATTENTION").ok(); if let Some(config) = config { if prefix_caching.is_none() { @@ -94,7 +94,7 @@ fn resolve_attention(config: &Option, lora_adapters: &Option) -> prefix_caching = Some("0".to_string()); } match config.model_type.as_deref() { - Some("gemma2") | Some("falcon") | Some("deepseek_v2") => { + Some("falcon") | Some("deepseek_v2") => { // Required because gemma2 needs bfloat16 which is not supported by // flashinfer ? if attention.is_none() { @@ -124,6 +124,10 @@ fn resolve_attention(config: &Option, lora_adapters: &Option) -> } } } + if attention == Some("paged".to_string()) && prefix_caching.is_none() { + tracing::info!("Disabling prefix caching on paged attention"); + prefix_caching = Some("0".to_string()); + } let attention = attention.unwrap_or("flashinfer".to_string()); let prefix_caching = prefix_caching.unwrap_or("true".to_string()); @@ -303,6 +307,9 @@ impl std::fmt::Display for Dtype { #[derive(Clone, Copy, Debug, ValueEnum)] enum KVCacheDtype { + #[clap(name = "fp8_e4m3fn")] + Fp8e4m3fn, + #[clap(name = "fp8_e5m2")] Fp8e5m2, } @@ -310,6 +317,9 @@ enum KVCacheDtype { impl std::fmt::Display for KVCacheDtype { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { + KVCacheDtype::Fp8e4m3fn => { + write!(f, "fp8_e4m3fn") + } KVCacheDtype::Fp8e5m2 => { write!(f, "fp8_e5m2") } @@ -420,7 +430,7 @@ struct Args { /// Specify the dtype for the key-value cache. When this option is not provided, /// the dtype of the model is used (typically `float16` or `bfloat16`). Currently - /// the only supported value is `fp8_e5m2` on CUDA. + /// the only supported value are `fp8_e4m3fn` and `fp8_e5m2` on CUDA. #[clap(long, env, value_enum)] kv_cache_dtype: Option, @@ -1094,6 +1104,8 @@ fn log_lines(mut bufread: BufReader) { } } } + } else { + break; } } } @@ -1497,6 +1509,10 @@ fn spawn_webserver( router_args.push(revision.to_string()) } + if args.trust_remote_code { + router_args.push("--trust-remote-code".to_string()); + } + if args.json_output { router_args.push("--json-output".to_string()); } @@ -1678,7 +1694,7 @@ fn main() -> Result<(), LauncherError> { }; let (prefix_caching, attention) = resolve_attention(&config, &args.lora_adapters); tracing::info!("Using attention {attention} - Prefix caching {prefix_caching}"); - std::env::set_var("USE_PREFIX_CACHING", prefix_caching); + std::env::set_var("PREFIX_CACHING", prefix_caching); std::env::set_var("ATTENTION", attention); let max_input_tokens = { @@ -1729,12 +1745,6 @@ fn main() -> Result<(), LauncherError> { "`max_input_tokens must be < `max_total_tokens`".to_string(), )); } - if max_input_tokens as u32 > max_batch_prefill_tokens { - return Err(LauncherError::ArgumentValidation(format!( - "`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {} and {}", - max_batch_prefill_tokens, max_input_tokens - ))); - } if matches!(args.quantize, Some(Quantization::Bitsandbytes)) { tracing::warn!("Bitsandbytes is deprecated, use `eetq` instead, which provides better latencies overall and is drop-in in most cases."); @@ -1788,12 +1798,6 @@ fn main() -> Result<(), LauncherError> { } if let Some(ref max_batch_total_tokens) = args.max_batch_total_tokens { - if max_batch_prefill_tokens > *max_batch_total_tokens { - return Err(LauncherError::ArgumentValidation(format!( - "`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. 
Given: {} and {}", - max_batch_prefill_tokens, max_batch_total_tokens - ))); - } if max_total_tokens as u32 > *max_batch_total_tokens { return Err(LauncherError::ArgumentValidation(format!( "`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {} and {}", diff --git a/nix/impure-shell.nix b/nix/impure-shell.nix index abed544a155..92e14bc3cb0 100644 --- a/nix/impure-shell.nix +++ b/nix/impure-shell.nix @@ -1,7 +1,12 @@ { + lib, mkShell, black, + cmake, isort, + ninja, + which, + cudaPackages, openssl, pkg-config, protobuf, @@ -11,14 +16,17 @@ ruff, rust-bin, server, + + # Enable dependencies for building CUDA packages. Useful for e.g. + # developing marlin/moe-kernels in-place. + withCuda ? false, }: mkShell { - buildInputs = + nativeBuildInputs = [ black isort - openssl.dev pkg-config (rust-bin.stable.latest.default.override { extensions = [ @@ -31,6 +39,19 @@ mkShell { redocly ruff ] + ++ (lib.optionals withCuda [ + cmake + ninja + which + + # For most Torch-based extensions, setting CUDA_HOME is enough, but + # some custom CMake builds (e.g. vLLM) also need to have nvcc in PATH. + cudaPackages.cuda_nvcc + ]); + buildInputs = + [ + openssl.dev + ] ++ (with python3.pkgs; [ venvShellHook docker @@ -40,10 +61,29 @@ mkShell { pytest pytest-asyncio syrupy - ]); + ]) + ++ (lib.optionals withCuda ( + with cudaPackages; + [ + cuda_cccl + cuda_cudart + cuda_nvrtc + cuda_nvtx + cuda_profiler_api + cudnn + libcublas + libcusolver + libcusparse + ] + )); inputsFrom = [ server ]; + env = lib.optionalAttrs withCuda { + CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}"; + TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" python3.pkgs.torch.cudaCapabilities; + }; + venvDir = "./.venv"; postVenvCreation = '' @@ -51,6 +91,7 @@ mkShell { ( cd server ; python -m pip install --no-dependencies -e . ) ( cd clients/python ; python -m pip install --no-dependencies -e . ) ''; + postShellHook = '' unset SOURCE_DATE_EPOCH export PATH=$PATH:~/.cargo/bin diff --git a/proto/v3/generate.proto b/proto/v3/generate.proto index 34894bdaba4..c91e7cc43b2 100644 --- a/proto/v3/generate.proto +++ b/proto/v3/generate.proto @@ -34,6 +34,10 @@ message InfoResponse { string device_type = 3; optional uint32 window_size = 4; uint32 speculate = 5; + bool support_chunking = 6; + bool use_prefix_caching = 7; + string attention_impl = 8; + uint32 block_size = 9; } /// Empty request @@ -135,10 +139,14 @@ message Request { repeated uint32 slots = 10; /// LORA adapter index optional string adapter_id = 11; - /// Prefix length that can be retrieved from the KV cache. - uint32 prefix_len = 12; + /// Tokens that can be retrieved from the KV cache. 
+ /// This value is set for the first prefill and never reset + uint32 cache_len = 12; /// Context truncation bool add_special_tokens = 13; + /// Chunk of tokens that must be computed for the first prefill + /// This value is set for the first prefill and never reset + optional uint32 chunk_len = 14; } message Batch { @@ -163,6 +171,8 @@ message CachedBatch { uint32 size = 3; /// Maximum number of tokens this batch will grow to uint32 max_tokens = 4; + /// Number of tokens in the next forward + uint32 current_tokens = 5; } enum FinishReason { @@ -220,6 +230,8 @@ message FilterBatchResponse { message PrefillRequest { /// Batch Batch batch = 1; + /// Optional cached batch + CachedBatch cached_batch = 2; } message PrefillResponse { @@ -233,6 +245,8 @@ message PrefillResponse { uint64 decode_ns = 4; /// Total elapsed time in nanoseconds uint64 total_ns = 5; + /// Concatenate elapsed time in nanoseconds + optional uint64 concat_ns = 6; } message DecodeRequest { diff --git a/router/src/config.rs b/router/src/config.rs index 1a20c40bb67..7139b923768 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -150,6 +150,7 @@ pub enum Config { Idefics2(Idefics2), Ssm, GptBigcode, + Granite, Santacoder, Bloom, Mpt, diff --git a/router/src/lib.rs b/router/src/lib.rs index b29c9395d91..7c40c7e3dd6 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -8,6 +8,7 @@ pub mod validation; mod kserve; pub mod logging; +mod sagemaker; pub mod usage_stats; mod vertex; @@ -18,45 +19,6 @@ use tracing::warn; use utoipa::ToSchema; use validation::Validation; -#[derive(PartialEq)] -pub enum Attention { - Paged, - FlashDecoding, - FlashInfer, -} - -impl Attention { - pub fn block_size(&self) -> u32 { - match self { - Attention::FlashDecoding => 256, - Attention::FlashInfer => 1, - Attention::Paged => 16, - } - } -} - -#[derive(Debug)] -pub struct ParseError; - -impl std::fmt::Display for ParseError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Cannot parse attention value") - } -} -impl std::error::Error for ParseError {} - -impl std::str::FromStr for Attention { - type Err = ParseError; - fn from_str(s: &str) -> Result { - match s { - "paged" => Ok(Attention::Paged), - "flashdecoding" => Ok(Attention::FlashDecoding), - "flashinfer" => Ok(Attention::FlashInfer), - _ => Err(ParseError), - } - } -} - /// Hub type #[derive(Clone, Debug, Deserialize)] pub struct HubModelInfo { diff --git a/router/src/main.rs.back b/router/src/main.rs.back deleted file mode 100644 index 36879aa478b..00000000000 --- a/router/src/main.rs.back +++ /dev/null @@ -1,748 +0,0 @@ -use axum::http::HeaderValue; -use clap::Parser; -use clap::Subcommand; -use hf_hub::api::tokio::{Api, ApiBuilder, ApiRepo}; -use hf_hub::{Cache, Repo, RepoType}; -use opentelemetry::sdk::propagation::TraceContextPropagator; -use opentelemetry::sdk::trace; -use opentelemetry::sdk::trace::Sampler; -use opentelemetry::sdk::Resource; -use opentelemetry::{global, KeyValue}; -use opentelemetry_otlp::WithExportConfig; -use std::fs::File; -use std::io::BufReader; -use std::net::{IpAddr, Ipv4Addr, SocketAddr}; -use std::path::{Path, PathBuf}; -use text_generation_router::config::Config; -use text_generation_router::usage_stats; -use text_generation_router::{ - server, HubModelInfo, HubPreprocessorConfig, HubProcessorConfig, HubTokenizerConfig, -}; -use thiserror::Error; -use tokenizers::{processors::template::TemplateProcessing, Tokenizer}; -use tower_http::cors::AllowOrigin; -use tracing_subscriber::layer::SubscriberExt; -use 
tracing_subscriber::util::SubscriberInitExt; -use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer}; - -/// App Configuration -#[derive(Parser, Debug)] -#[clap(author, version, about, long_about = None)] -struct Args { - #[command(subcommand)] - command: Option, - - #[clap(default_value = "128", long, env)] - max_concurrent_requests: usize, - #[clap(default_value = "2", long, env)] - max_best_of: usize, - #[clap(default_value = "4", long, env)] - max_stop_sequences: usize, - #[clap(default_value = "5", long, env)] - max_top_n_tokens: u32, - #[clap(default_value = "1024", long, env)] - max_input_tokens: usize, - #[clap(default_value = "2048", long, env)] - max_total_tokens: usize, - #[clap(default_value = "1.2", long, env)] - waiting_served_ratio: f32, - #[clap(default_value = "4096", long, env)] - max_batch_prefill_tokens: u32, - #[clap(long, env)] - max_batch_total_tokens: Option, - #[clap(default_value = "20", long, env)] - max_waiting_tokens: usize, - #[clap(long, env)] - max_batch_size: Option, - #[clap(default_value = "0.0.0.0", long, env)] - hostname: String, - #[clap(default_value = "3000", long, short, env)] - port: u16, - #[clap(default_value = "/tmp/text-generation-server-0", long, env)] - master_shard_uds_path: String, - #[clap(default_value = "bigscience/bloom", long, env)] - tokenizer_name: String, - #[clap(long, env)] - tokenizer_config_path: Option, - #[clap(long, env)] - revision: Option, - #[clap(default_value = "2", long, env)] - validation_workers: usize, - #[clap(long, env)] - json_output: bool, - #[clap(long, env)] - otlp_endpoint: Option, - #[clap(default_value = "text-generation-inference.router", long, env)] - otlp_service_name: String, - #[clap(long, env)] - cors_allow_origin: Option>, - #[clap(long, env)] - api_key: Option, - #[clap(long, env)] - ngrok: bool, - #[clap(long, env)] - ngrok_authtoken: Option, - #[clap(long, env)] - ngrok_edge: Option, - #[clap(long, env, default_value_t = false)] - messages_api_enabled: bool, - #[clap(long, env, default_value_t = false)] - disable_grammar_support: bool, - #[clap(default_value = "4", long, env)] - max_client_batch_size: usize, - #[clap(long, env, default_value_t)] - disable_usage_stats: bool, - #[clap(long, env, default_value_t)] - disable_crash_reports: bool, -} - -#[derive(Debug, Subcommand)] -enum Commands { - PrintSchema, -} - -#[tokio::main] -async fn main() -> Result<(), RouterError> { - let args = Args::parse(); - - // Pattern match configuration - let Args { - max_concurrent_requests, - max_best_of, - max_stop_sequences, - max_top_n_tokens, - max_input_tokens, - max_total_tokens, - waiting_served_ratio, - max_batch_prefill_tokens, - max_batch_total_tokens, - max_waiting_tokens, - max_batch_size, - hostname, - port, - master_shard_uds_path, - tokenizer_name, - tokenizer_config_path, - revision, - validation_workers, - json_output, - otlp_endpoint, - otlp_service_name, - cors_allow_origin, - api_key, - ngrok, - ngrok_authtoken, - ngrok_edge, - messages_api_enabled, - disable_grammar_support, - max_client_batch_size, - disable_usage_stats, - disable_crash_reports, - command, - } = args; - - let print_schema_command = match command { - Some(Commands::PrintSchema) => true, - None => { - // only init logging if we are not running the print schema command - init_logging(otlp_endpoint, otlp_service_name, json_output); - false - } - }; - - // Validate args - if max_input_tokens >= max_total_tokens { - return Err(RouterError::ArgumentValidation( - "`max_input_tokens` must be < `max_total_tokens`".to_string(), 
- )); - } - if max_input_tokens as u32 > max_batch_prefill_tokens { - return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}"))); - } - - if validation_workers == 0 { - return Err(RouterError::ArgumentValidation( - "`validation_workers` must be > 0".to_string(), - )); - } - - if let Some(ref max_batch_total_tokens) = max_batch_total_tokens { - if max_batch_prefill_tokens > *max_batch_total_tokens { - return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}"))); - } - if max_total_tokens as u32 > *max_batch_total_tokens { - return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}"))); - } - } - - // CORS allowed origins - // map to go inside the option and then map to parse from String to HeaderValue - // Finally, convert to AllowOrigin - let cors_allow_origin: Option = cors_allow_origin.map(|cors_allow_origin| { - AllowOrigin::list( - cors_allow_origin - .iter() - .map(|origin| origin.parse::().unwrap()), - ) - }); - - // Parse Huggingface hub token - let authorization_token = std::env::var("HF_TOKEN") - .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) - .ok(); - - // Tokenizer instance - // This will only be used to validate payloads - let local_path = Path::new(&tokenizer_name); - - // Shared API builder initialization - let api_builder = || { - let mut builder = ApiBuilder::new() - .with_progress(false) - .with_token(authorization_token); - - if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") { - builder = builder.with_cache_dir(cache_dir.into()); - } - - builder - }; - - // Decide if we need to use the API based on the revision and local path - let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir(); - - // Initialize API if needed - #[derive(Clone)] - enum Type { - Api(Api), - Cache(Cache), - None, - } - let api = if use_api { - if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) { - let cache = std::env::var("HUGGINGFACE_HUB_CACHE") - .map_err(|_| ()) - .map(|cache_dir| Cache::new(cache_dir.into())) - .unwrap_or_else(|_| Cache::default()); - - tracing::warn!("Offline mode active using cache defaults"); - Type::Cache(cache) - } else { - tracing::info!("Using the Hugging Face API"); - match api_builder().build() { - Ok(api) => Type::Api(api), - Err(_) => { - tracing::warn!("Unable to build the Hugging Face API"); - Type::None - } - } - } - } else { - Type::None - }; - - // Load tokenizer and model info - let ( - tokenizer_filename, - config_filename, - tokenizer_config_filename, - preprocessor_config_filename, - processor_config_filename, - model_info, - ) = match api { - Type::None => ( - Some(local_path.join("tokenizer.json")), - Some(local_path.join("config.json")), - Some(local_path.join("tokenizer_config.json")), - Some(local_path.join("preprocessor_config.json")), - Some(local_path.join("processor_config.json")), - None, - ), - Type::Api(api) => { - let api_repo = api.repo(Repo::with_revision( - tokenizer_name.to_string(), - RepoType::Model, - revision.clone().unwrap_or_else(|| "main".to_string()), - )); - - let tokenizer_filename = match api_repo.get("tokenizer.json").await { - Ok(tokenizer_filename) => Some(tokenizer_filename), - Err(_) => get_base_tokenizer(&api, &api_repo).await, - }; - let 
config_filename = api_repo.get("config.json").await.ok(); - let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok(); - let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok(); - let processor_config_filename = api_repo.get("processor_config.json").await.ok(); - - let model_info = if let Some(model_info) = get_model_info(&api_repo).await { - Some(model_info) - } else { - tracing::warn!("Could not retrieve model info from the Hugging Face hub."); - None - }; - ( - tokenizer_filename, - config_filename, - tokenizer_config_filename, - preprocessor_config_filename, - processor_config_filename, - model_info, - ) - } - Type::Cache(cache) => { - let repo = cache.repo(Repo::with_revision( - tokenizer_name.to_string(), - RepoType::Model, - revision.clone().unwrap_or_else(|| "main".to_string()), - )); - ( - repo.get("tokenizer.json"), - repo.get("config.json"), - repo.get("tokenizer_config.json"), - repo.get("preprocessor_config.json"), - repo.get("processor_config.json"), - None, - ) - } - }; - let config: Option = config_filename.and_then(|filename| { - std::fs::read_to_string(filename) - .ok() - .as_ref() - .and_then(|c| { - let config: Result = serde_json::from_str(c); - if let Err(err) = &config { - tracing::warn!("Could not parse config {err:?}"); - } - config.ok() - }) - }); - let model_info = model_info.unwrap_or_else(|| HubModelInfo { - model_id: tokenizer_name.to_string(), - sha: None, - pipeline_tag: None, - }); - - // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'. - let tokenizer_config: Option = if let Some(filename) = tokenizer_config_path - { - HubTokenizerConfig::from_file(filename) - } else { - tokenizer_config_filename.and_then(HubTokenizerConfig::from_file) - }; - let tokenizer_config = tokenizer_config.unwrap_or_else(|| { - tracing::warn!("Could not find tokenizer config locally and no API specified"); - HubTokenizerConfig::default() - }); - let tokenizer_class = tokenizer_config.tokenizer_class.clone(); - - let tokenizer: Option = tokenizer_filename.and_then(|filename| { - let mut tokenizer = Tokenizer::from_file(filename).ok(); - if let Some(tokenizer) = &mut tokenizer { - if let Some(class) = &tokenizer_config.tokenizer_class { - if class == "LlamaTokenizer" || class == "LlamaTokenizerFast"{ - if let Ok(post_processor) = create_post_processor(tokenizer, &tokenizer_config) { - tracing::info!("Overriding LlamaTokenizer with TemplateProcessing to follow python override defined in https://github.com/huggingface/transformers/blob/4aa17d00690b7f82c95bb2949ea57e22c35b4336/src/transformers/models/llama/tokenization_llama_fast.py#L203-L205"); - tokenizer.with_post_processor(post_processor); - } - } - } - } - tokenizer - }); - - let preprocessor_config = - preprocessor_config_filename.and_then(HubPreprocessorConfig::from_file); - let processor_config = processor_config_filename - .and_then(HubProcessorConfig::from_file) - .unwrap_or_default(); - - tracing::info!("Using config {config:?}"); - if tokenizer.is_none() { - tracing::warn!("Could not find a fast tokenizer implementation for {tokenizer_name}"); - tracing::warn!("Rust input length validation and truncation is disabled"); - } - - // if pipeline-tag == text-generation we default to return_full_text = true - let compat_return_full_text = match &model_info.pipeline_tag { - None => { - tracing::warn!("no pipeline tag found for model {tokenizer_name}"); - true - } - Some(pipeline_tag) => pipeline_tag.as_str() == "text-generation", - }; - - // Determine 
the server port based on the feature and environment variable. - let port = if cfg!(feature = "google") { - std::env::var("AIP_HTTP_PORT") - .map(|aip_http_port| aip_http_port.parse::().unwrap_or(port)) - .unwrap_or(port) - } else { - port - }; - - let addr = match hostname.parse() { - Ok(ip) => SocketAddr::new(ip, port), - Err(_) => { - tracing::warn!("Invalid hostname, defaulting to 0.0.0.0"); - SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), port) - } - }; - - // Only send usage stats when TGI is run in container and the function returns Some - let is_container = matches!(usage_stats::is_container(), Ok(true)); - - let user_agent = if !disable_usage_stats && is_container { - let reduced_args = usage_stats::Args::new( - config.clone(), - tokenizer_class, - max_concurrent_requests, - max_best_of, - max_stop_sequences, - max_top_n_tokens, - max_input_tokens, - max_total_tokens, - waiting_served_ratio, - max_batch_prefill_tokens, - max_batch_total_tokens, - max_waiting_tokens, - max_batch_size, - revision, - validation_workers, - messages_api_enabled, - disable_grammar_support, - max_client_batch_size, - disable_usage_stats, - disable_crash_reports, - ); - Some(usage_stats::UserAgent::new(reduced_args)) - } else { - None - }; - - if let Some(ref ua) = user_agent { - let start_event = - usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Start, None); - tokio::spawn(async move { - start_event.send().await; - }); - }; - - // Run server - let result = server::run( - master_shard_uds_path, - model_info, - compat_return_full_text, - max_concurrent_requests, - max_best_of, - max_stop_sequences, - max_top_n_tokens, - max_input_tokens, - max_total_tokens, - waiting_served_ratio, - max_batch_prefill_tokens, - max_batch_total_tokens, - max_waiting_tokens, - max_batch_size, - tokenizer, - config, - validation_workers, - addr, - cors_allow_origin, - api_key, - ngrok, - ngrok_authtoken, - ngrok_edge, - tokenizer_config, - preprocessor_config, - processor_config, - messages_api_enabled, - disable_grammar_support, - max_client_batch_size, - print_schema_command, - ) - .await; - - match result { - Ok(_) => { - if let Some(ref ua) = user_agent { - let stop_event = usage_stats::UsageStatsEvent::new( - ua.clone(), - usage_stats::EventType::Stop, - None, - ); - stop_event.send().await; - }; - Ok(()) - } - Err(e) => { - if let Some(ref ua) = user_agent { - if !disable_crash_reports { - let error_event = usage_stats::UsageStatsEvent::new( - ua.clone(), - usage_stats::EventType::Error, - Some(e.to_string()), - ); - error_event.send().await; - } else { - let unknow_error_event = usage_stats::UsageStatsEvent::new( - ua.clone(), - usage_stats::EventType::Error, - Some("unknow_error".to_string()), - ); - unknow_error_event.send().await; - } - }; - Err(RouterError::WebServer(e)) - } - } -} - -/// Init logging using env variables LOG_LEVEL and LOG_FORMAT: -/// - otlp_endpoint is an optional URL to an Open Telemetry collector -/// - otlp_service_name service name to appear in APM -/// - LOG_LEVEL may be TRACE, DEBUG, INFO, WARN or ERROR (default to INFO) -/// - LOG_FORMAT may be TEXT or JSON (default to TEXT) -/// - LOG_COLORIZE may be "false" or "true" (default to "true" or ansi supported platforms) -fn init_logging(otlp_endpoint: Option, otlp_service_name: String, json_output: bool) { - let mut layers = Vec::new(); - - // STDOUT/STDERR layer - let ansi = std::env::var("LOG_COLORIZE") != Ok("1".to_string()); - let fmt_layer = tracing_subscriber::fmt::layer() - .with_file(true) - 
.with_ansi(ansi) - .with_line_number(true); - - let fmt_layer = match json_output { - true => fmt_layer.json().flatten_event(true).boxed(), - false => fmt_layer.boxed(), - }; - layers.push(fmt_layer); - - // OpenTelemetry tracing layer - if let Some(otlp_endpoint) = otlp_endpoint { - global::set_text_map_propagator(TraceContextPropagator::new()); - - let tracer = opentelemetry_otlp::new_pipeline() - .tracing() - .with_exporter( - opentelemetry_otlp::new_exporter() - .tonic() - .with_endpoint(otlp_endpoint), - ) - .with_trace_config( - trace::config() - .with_resource(Resource::new(vec![KeyValue::new( - "service.name", - otlp_service_name, - )])) - .with_sampler(Sampler::AlwaysOn), - ) - .install_batch(opentelemetry::runtime::Tokio); - - if let Ok(tracer) = tracer { - layers.push(tracing_opentelemetry::layer().with_tracer(tracer).boxed()); - init_tracing_opentelemetry::init_propagator().unwrap(); - }; - } - - // Filter events with LOG_LEVEL - let varname = "LOG_LEVEL"; - let env_filter = if let Ok(log_level) = std::env::var(varname) { - // Override to avoid simple logs to be spammed with tokio level informations - let log_level = match &log_level[..] { - "warn" => "text_generation_launcher=warn,text_generation_router=warn", - "info" => "text_generation_launcher=info,text_generation_router=info", - "debug" => "text_generation_launcher=debug,text_generation_router=debug", - log_level => log_level, - }; - EnvFilter::builder() - .with_default_directive(LevelFilter::INFO.into()) - .parse_lossy(log_level) - } else { - EnvFilter::new("info") - }; - - tracing_subscriber::registry() - .with(env_filter) - .with(layers) - .init(); -} - -/// get model info from the Huggingface Hub -pub async fn get_model_info(api: &ApiRepo) -> Option { - let response = api.info_request().send().await.ok()?; - - if response.status().is_success() { - let hub_model_info: HubModelInfo = - serde_json::from_str(&response.text().await.ok()?).ok()?; - if let Some(sha) = &hub_model_info.sha { - tracing::info!( - "Serving revision {sha} of model {}", - hub_model_info.model_id - ); - } - Some(hub_model_info) - } else { - None - } -} - -/// get base tokenizer -pub async fn get_base_tokenizer(api: &Api, api_repo: &ApiRepo) -> Option { - let config_filename = api_repo.get("config.json").await.ok()?; - - // Open the file in read-only mode with buffer. - let file = File::open(config_filename).ok()?; - let reader = BufReader::new(file); - - // Read the JSON contents of the file as an instance of `User`. - let config: serde_json::Value = serde_json::from_reader(reader).ok()?; - - if let Some(serde_json::Value::String(base_model_id)) = config.get("base_model_name_or_path") { - let api_base_repo = api.repo(Repo::with_revision( - base_model_id.to_string(), - RepoType::Model, - "main".to_string(), - )); - - api_base_repo.get("tokenizer.json").await.ok() - } else { - None - } -} - -/// get tokenizer_config from the Huggingface Hub -pub async fn get_tokenizer_config(api_repo: &ApiRepo) -> Option { - let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok()?; - - // Open the file in read-only mode with buffer. - let file = File::open(tokenizer_config_filename).ok()?; - let reader = BufReader::new(file); - - // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'. 
- let tokenizer_config: HubTokenizerConfig = serde_json::from_reader(reader) - .map_err(|e| { - tracing::warn!("Unable to parse tokenizer config: {}", e); - e - }) - .ok()?; - - Some(tokenizer_config) -} - -/// Create a post_processor for the LlamaTokenizer -pub fn create_post_processor( - tokenizer: &Tokenizer, - tokenizer_config: &HubTokenizerConfig, -) -> Result { - let add_bos_token = tokenizer_config.add_bos_token.unwrap_or(true); - let add_eos_token = tokenizer_config.add_eos_token.unwrap_or(false); - - let bos_token = tokenizer_config.bos_token.as_ref(); - let eos_token = tokenizer_config.eos_token.as_ref(); - - if add_bos_token && bos_token.is_none() { - panic!("add_bos_token = true but bos_token is None"); - } - - if add_eos_token && eos_token.is_none() { - panic!("add_eos_token = true but eos_token is None"); - } - - let mut single = Vec::new(); - let mut pair = Vec::new(); - let mut special_tokens = Vec::new(); - - if add_bos_token { - if let Some(bos) = bos_token { - let bos_token_id = tokenizer - .token_to_id(bos.as_str()) - .expect("Should have found the bos token id"); - special_tokens.push((bos.as_str(), bos_token_id)); - single.push(format!("{}:0", bos.as_str())); - pair.push(format!("{}:0", bos.as_str())); - } - } - - single.push("$A:0".to_string()); - pair.push("$A:0".to_string()); - - if add_eos_token { - if let Some(eos) = eos_token { - let eos_token_id = tokenizer - .token_to_id(eos.as_str()) - .expect("Should have found the eos token id"); - special_tokens.push((eos.as_str(), eos_token_id)); - single.push(format!("{}:0", eos.as_str())); - pair.push(format!("{}:0", eos.as_str())); - } - } - - if add_bos_token { - if let Some(bos) = bos_token { - pair.push(format!("{}:1", bos.as_str())); - } - } - - pair.push("$B:1".to_string()); - - if add_eos_token { - if let Some(eos) = eos_token { - pair.push(format!("{}:1", eos.as_str())); - } - } - - let post_processor = TemplateProcessing::builder() - .try_single(single)? - .try_pair(pair)? 
- .special_tokens(special_tokens) - .build()?; - - Ok(post_processor) -} - -#[derive(Debug, Error)] -enum RouterError { - #[error("Argument validation error: {0}")] - ArgumentValidation(String), - #[error("WebServer error: {0}")] - WebServer(#[from] server::WebServerError), - #[error("Tokio runtime failed to start: {0}")] - Tokio(#[from] std::io::Error), -} - -#[cfg(test)] -mod tests { - use super::*; - use text_generation_router::TokenizerConfigToken; - - #[test] - fn test_create_post_processor() { - let tokenizer_config = HubTokenizerConfig { - add_bos_token: None, - add_eos_token: None, - bos_token: Some(TokenizerConfigToken::String("".to_string())), - eos_token: Some(TokenizerConfigToken::String("".to_string())), - chat_template: None, - tokenizer_class: None, - completion_template: None, - }; - - let tokenizer = - Tokenizer::from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", None).unwrap(); - let post_processor = create_post_processor(&tokenizer, &tokenizer_config).unwrap(); - - let expected = TemplateProcessing::builder() - .try_single(":0 $A:0") - .unwrap() - .try_pair(":0 $A:0 :1 $B:1") - .unwrap() - .special_tokens(vec![("".to_string(), 1)]) - .build() - .unwrap(); - - assert_eq!(post_processor, expected); - } -} diff --git a/router/src/sagemaker.rs b/router/src/sagemaker.rs new file mode 100644 index 00000000000..750ef222bb5 --- /dev/null +++ b/router/src/sagemaker.rs @@ -0,0 +1,82 @@ +use crate::infer::Infer; +use crate::server::{chat_completions, compat_generate, completions, ComputeType}; +use crate::{ + ChatCompletion, ChatCompletionChunk, ChatRequest, Chunk, CompatGenerateRequest, + CompletionFinal, CompletionRequest, ErrorResponse, GenerateResponse, Info, StreamResponse, +}; +use axum::extract::Extension; +use axum::http::StatusCode; +use axum::response::Response; +use axum::Json; +use serde::{Deserialize, Serialize}; +use tracing::instrument; +use utoipa::ToSchema; + +#[derive(Clone, Deserialize, ToSchema)] +#[serde(untagged)] +pub(crate) enum SagemakerRequest { + Generate(CompatGenerateRequest), + Chat(ChatRequest), + Completion(CompletionRequest), +} + +// Used for OpenAPI specs +#[allow(dead_code)] +#[derive(Serialize, ToSchema)] +#[serde(untagged)] +pub(crate) enum SagemakerResponse { + Generate(GenerateResponse), + Chat(ChatCompletion), + Completion(CompletionFinal), +} + +// Used for OpenAPI specs +#[allow(dead_code)] +#[derive(Serialize, ToSchema)] +#[serde(untagged)] +pub(crate) enum SagemakerStreamResponse { + Generate(StreamResponse), + Chat(ChatCompletionChunk), + Completion(Chunk), +} + +/// Generate tokens from Sagemaker request +#[utoipa::path( +post, +tag = "Text Generation Inference", +path = "/invocations", +request_body = SagemakerRequest, +responses( +(status = 200, description = "Generated Chat Completion", +content( +("application/json" = SagemakerResponse), +("text/event-stream" = SagemakerStreamResponse), +)), +(status = 424, description = "Generation Error", body = ErrorResponse, +example = json ! ({"error": "Request failed during generation", "error_type": "generation"})), +(status = 429, description = "Model is overloaded", body = ErrorResponse, +example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), +(status = 422, description = "Input validation error", body = ErrorResponse, +example = json ! ({"error": "Input validation error", "error_type": "validation"})), +(status = 500, description = "Incomplete generation", body = ErrorResponse, +example = json ! 
({"error": "Incomplete generation", "error_type": "incomplete_generation"})), +) +)] +#[instrument(skip_all)] +pub(crate) async fn sagemaker_compatibility( + default_return_full_text: Extension<bool>, + infer: Extension<Infer>, + compute_type: Extension<ComputeType>, + info: Extension<Info>, + Json(req): Json<SagemakerRequest>, +) -> Result<Response, (StatusCode, Json<ErrorResponse>)> { + match req { + SagemakerRequest::Generate(req) => { + compat_generate(default_return_full_text, infer, compute_type, Json(req)).await + } + SagemakerRequest::Chat(req) => chat_completions(infer, compute_type, info, Json(req)).await, + SagemakerRequest::Completion(req) => { + completions(infer, compute_type, info, Json(req)).await + } + } +} diff --git a/router/src/server.rs b/router/src/server.rs index 3a45851b2f1..bedcc7560db 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -8,6 +8,10 @@ use crate::kserve::{ kserve_model_metadata, kserve_model_metadata_ready, }; use crate::logging::trace_context_middleware; +use crate::sagemaker::{ + sagemaker_compatibility, SagemakerRequest, SagemakerResponse, SagemakerStreamResponse, + __path_sagemaker_compatibility, +}; use crate::validation::ValidationError; use crate::vertex::vertex_compatibility; use crate::ChatTokenizeResponse; @@ -85,7 +89,7 @@ example = json ! ({"error": "Incomplete generation"})), ) )] #[instrument(skip(infer, req))] -async fn compat_generate( +pub(crate) async fn compat_generate( Extension(default_return_full_text): Extension<bool>, infer: Extension<Infer>, compute_type: Extension<ComputeType>, @@ -694,7 +698,7 @@ time_per_token, seed, ) )] -async fn completions( +pub(crate) async fn completions( Extension(infer): Extension<Infer>, Extension(compute_type): Extension<ComputeType>, Extension(info): Extension<Info>, @@ -1223,7 +1227,7 @@ time_per_token, seed, ) )] -async fn chat_completions( +pub(crate) async fn chat_completions( Extension(infer): Extension<Infer>, Extension(compute_type): Extension<ComputeType>, Extension(info): Extension<Info>, @@ -1539,11 +1543,13 @@ completions, tokenize, metrics, openai_get_model_info, +sagemaker_compatibility, ), components( schemas( Info, CompatGenerateRequest, +SagemakerRequest, GenerateRequest, GrammarType, ChatRequest, @@ -1566,6 +1572,8 @@ ChatCompletionTopLogprob, ChatCompletion, CompletionRequest, CompletionComplete, +SagemakerResponse, +SagemakerStreamResponse, Chunk, Completion, CompletionFinal, @@ -1627,13 +1635,13 @@ pub async fn run( tokenizer_name: String, tokenizer_config_path: Option<String>, revision: Option<String>, + trust_remote_code: bool, hostname: String, port: u16, cors_allow_origin: Option<Vec<String>>, ngrok: bool, _ngrok_authtoken: Option<String>, _ngrok_edge: Option<String>, - messages_api_enabled: bool, disable_grammar_support: bool, max_client_batch_size: usize, usage_stats_level: usage_stats::UsageStatsLevel, @@ -1787,10 +1795,13 @@ pub async fn run( let auto = transformers.getattr("AutoTokenizer")?; let from_pretrained = auto.getattr("from_pretrained")?; let args = (tokenizer_name.to_string(),); - let kwargs = [( - "revision", - revision.clone().unwrap_or_else(|| "main".to_string()), - )] + let kwargs = [ + ( + "revision", + (revision.clone().unwrap_or_else(|| "main".to_string())).into_py(py), + ), + ("trust_remote_code", trust_remote_code.into_py(py)), + ] .into_py_dict_bound(py); let tokenizer = from_pretrained.call(args, Some(&kwargs))?; let save = tokenizer.getattr("save_pretrained")?; @@ -1862,7 +1873,6 @@ pub async fn run( // max_batch_size, revision.clone(), validation_workers, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats_level, @@ -1904,7 +1914,6 @@ pub async fn run( ngrok, _ngrok_authtoken, _ngrok_edge, - messages_api_enabled,
disable_grammar_support, max_client_batch_size, model_info, @@ -1964,7 +1973,6 @@ async fn start( ngrok: bool, _ngrok_authtoken: Option, _ngrok_edge: Option, - messages_api_enabled: bool, disable_grammar_support: bool, max_client_batch_size: usize, model_info: HubModelInfo, @@ -2279,6 +2287,7 @@ async fn start( .route("/v1/chat/completions", post(chat_completions)) .route("/v1/completions", post(completions)) .route("/vertex", post(vertex_compatibility)) + .route("/invocations", post(sagemaker_compatibility)) .route("/tokenize", post(tokenize)); if let Some(api_key) = api_key { @@ -2314,13 +2323,6 @@ async fn start( .route("/metrics", get(metrics)) .route("/v1/models", get(openai_get_model_info)); - // Conditional AWS Sagemaker route - let aws_sagemaker_route = if messages_api_enabled { - Router::new().route("/invocations", post(chat_completions)) // Use 'chat_completions' for OAI_ENABLED - } else { - Router::new().route("/invocations", post(compat_generate)) // Use 'compat_generate' otherwise - }; - let compute_type = ComputeType(std::env::var("COMPUTE_TYPE").unwrap_or("gpu+optimized".to_string())); @@ -2328,8 +2330,7 @@ async fn start( let mut app = Router::new() .merge(swagger_ui) .merge(base_routes) - .merge(info_routes) - .merge(aws_sagemaker_route); + .merge(info_routes); #[cfg(feature = "google")] { diff --git a/router/src/usage_stats.rs b/router/src/usage_stats.rs index 0282ac63492..e9d98327ef6 100644 --- a/router/src/usage_stats.rs +++ b/router/src/usage_stats.rs @@ -93,7 +93,6 @@ pub struct Args { // max_batch_size: Option, revision: Option, validation_workers: usize, - messages_api_enabled: bool, disable_grammar_support: bool, max_client_batch_size: usize, usage_stats_level: UsageStatsLevel, @@ -117,7 +116,6 @@ impl Args { // max_batch_size: Option, revision: Option, validation_workers: usize, - messages_api_enabled: bool, disable_grammar_support: bool, max_client_batch_size: usize, usage_stats_level: UsageStatsLevel, @@ -138,7 +136,6 @@ impl Args { // max_batch_size, revision, validation_workers, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats_level, diff --git a/router/src/vertex.rs b/router/src/vertex.rs index 03aed63c0ea..4cf5f5c73e4 100644 --- a/router/src/vertex.rs +++ b/router/src/vertex.rs @@ -1,9 +1,6 @@ use crate::infer::Infer; use crate::server::{generate_internal, ComputeType}; -use crate::{ - ChatRequest, ErrorResponse, GenerateParameters, GenerateRequest, GrammarType, Message, - StreamOptions, Tool, ToolChoice, -}; +use crate::{ChatRequest, ErrorResponse, GenerateParameters, GenerateRequest}; use axum::extract::Extension; use axum::http::{HeaderMap, StatusCode}; use axum::response::{IntoResponse, Response}; @@ -22,162 +19,12 @@ pub(crate) struct GenerateVertexInstance { pub parameters: Option, } -#[derive(Clone, Deserialize, ToSchema)] -#[cfg_attr(test, derive(Debug, PartialEq))] -pub(crate) struct VertexChat { - messages: Vec, - // Messages is ignored there. - #[serde(default)] - parameters: VertexParameters, -} - -#[derive(Clone, Deserialize, ToSchema, Serialize, Default)] -#[cfg_attr(test, derive(Debug, PartialEq))] -pub(crate) struct VertexParameters { - #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")] - /// [UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API. - pub model: Option, - - /// Number between -2.0 and 2.0. 
Positive values penalize new tokens based on their existing frequency in the text so far, - /// decreasing the model's likelihood to repeat the same line verbatim. - #[serde(default)] - #[schema(example = "1.0")] - pub frequency_penalty: Option, - - /// UNUSED - /// Modify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens - /// (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically, - /// the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, - /// but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should - /// result in a ban or exclusive selection of the relevant token. - #[serde(default)] - pub logit_bias: Option>, - - /// Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each - /// output token returned in the content of message. - #[serde(default)] - #[schema(example = "false")] - pub logprobs: Option, - - /// An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with - /// an associated log probability. logprobs must be set to true if this parameter is used. - #[serde(default)] - #[schema(example = "5")] - pub top_logprobs: Option, - - /// The maximum number of tokens that can be generated in the chat completion. - #[serde(default)] - #[schema(example = "32")] - pub max_tokens: Option, - - /// UNUSED - /// How many chat completion choices to generate for each input message. Note that you will be charged based on the - /// number of generated tokens across all of the choices. Keep n as 1 to minimize costs. - #[serde(default)] - #[schema(nullable = true, example = "2")] - pub n: Option, - - /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, - /// increasing the model's likelihood to talk about new topics - #[serde(default)] - #[schema(nullable = true, example = 0.1)] - pub presence_penalty: Option, - - /// Up to 4 sequences where the API will stop generating further tokens. - #[serde(default)] - #[schema(nullable = true, example = "null")] - pub stop: Option>, - - #[serde(default = "bool::default")] - pub stream: bool, - - #[schema(nullable = true, example = 42)] - pub seed: Option, - - /// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while - /// lower values like 0.2 will make it more focused and deterministic. - /// - /// We generally recommend altering this or `top_p` but not both. - #[serde(default)] - #[schema(nullable = true, example = 1.0)] - pub temperature: Option, - - /// An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the - /// tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered. - #[serde(default)] - #[schema(nullable = true, example = 0.95)] - pub top_p: Option, - - /// A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of - /// functions the model may generate JSON inputs for. 
- #[serde(default)] - #[schema(nullable = true, example = "null")] - pub tools: Option>, - - /// A prompt to be appended before the tools - #[serde(default)] - #[schema( - nullable = true, - example = "Given the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables." - )] - pub tool_prompt: Option, - - /// A specific tool to use. If not provided, the model will default to use any of the tools provided in the tools parameter. - #[serde(default)] - #[schema(nullable = true, example = "null")] - pub tool_choice: ToolChoice, - - /// Response format constraints for the generation. - /// - /// NOTE: A request can use `response_format` OR `tools` but not both. - #[serde(default)] - #[schema(nullable = true, default = "null", example = "null")] - pub response_format: Option, - - /// A guideline to be used in the chat_template - #[serde(default)] - #[schema(nullable = true, default = "null", example = "null")] - pub guideline: Option, - - /// Options for streaming response. Only set this when you set stream: true. - #[serde(default)] - #[schema(nullable = true, example = "null")] - pub stream_options: Option, -} - -impl From for ChatRequest { - fn from(val: VertexChat) -> Self { - Self { - messages: val.messages, - frequency_penalty: val.parameters.frequency_penalty, - guideline: val.parameters.guideline, - logit_bias: val.parameters.logit_bias, - logprobs: val.parameters.logprobs, - max_tokens: val.parameters.max_tokens, - model: val.parameters.model, - n: val.parameters.n, - presence_penalty: val.parameters.presence_penalty, - response_format: val.parameters.response_format, - seed: val.parameters.seed, - stop: val.parameters.stop, - stream_options: val.parameters.stream_options, - stream: val.parameters.stream, - temperature: val.parameters.temperature, - tool_choice: val.parameters.tool_choice, - tool_prompt: val.parameters.tool_prompt, - tools: val.parameters.tools, - top_logprobs: val.parameters.top_logprobs, - top_p: val.parameters.top_p, - } - } -} - #[derive(Clone, Deserialize, ToSchema)] #[cfg_attr(test, derive(Debug, PartialEq))] #[serde(untagged)] pub(crate) enum VertexInstance { Generate(GenerateVertexInstance), - Chat(VertexChat), + Chat(ChatRequest), } #[derive(Deserialize, ToSchema)] @@ -263,9 +110,8 @@ pub(crate) async fn vertex_compatibility( }, }, VertexInstance::Chat(instance) => { - let chat_request: ChatRequest = instance.into(); let (generate_request, _using_tools): (GenerateRequest, bool) = - chat_request.try_into_generate(&infer)?; + instance.try_into_generate(&infer)?; generate_request } }; @@ -311,34 +157,14 @@ mod tests { #[test] fn vertex_deserialization() { - let string = serde_json::json!({ - - "messages": [{"role": "user", "content": "What's Deep Learning?"}], - "parameters": { - "max_tokens": 128, - "top_p": 0.95, - "temperature": 0.7 - } - }); - - let _request: VertexChat = serde_json::from_value(string).expect("Can deserialize"); - - let string = serde_json::json!({ - "messages": [{"role": "user", "content": "What's Deep Learning?"}], - }); - - let _request: VertexChat = serde_json::from_value(string).expect("Can deserialize"); - let string = serde_json::json!({ "instances": [ { "messages": [{"role": "user", "content": "What's Deep Learning?"}], - "parameters": { - "max_tokens": 128, - "top_p": 0.95, - "temperature": 0.7 - } + "max_tokens": 128, + "top_p": 0.95, + 
"temperature": 0.7 } ] @@ -347,18 +173,16 @@ mod tests { assert_eq!( request, VertexRequest { - instances: vec![VertexInstance::Chat(VertexChat { + instances: vec![VertexInstance::Chat(ChatRequest { messages: vec![Message { role: "user".to_string(), content: MessageContent::SingleText("What's Deep Learning?".to_string()), name: None, },], - parameters: VertexParameters { - max_tokens: Some(128), - top_p: Some(0.95), - temperature: Some(0.7), - ..Default::default() - } + max_tokens: Some(128), + top_p: Some(0.95), + temperature: Some(0.7), + ..Default::default() })] } ); diff --git a/server/Makefile b/server/Makefile index 9338b299090..18424dd6d7e 100644 --- a/server/Makefile +++ b/server/Makefile @@ -31,7 +31,7 @@ install: install-cuda echo "Installed server" install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention install-fbgemm - pip install -e ".[bnb]" + pip install -e ".[bnb,marlin,moe]" pip install nvidia-nccl-cu12==2.22.3 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm diff --git a/server/poetry.lock b/server/poetry.lock index 08f74999f04..1293e883656 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "accelerate" @@ -32,113 +32,113 @@ testing = ["bitsandbytes", "datasets", "deepspeed", "evaluate", "parameterized", [[package]] name = "aiohappyeyeballs" -version = "2.4.0" +version = "2.4.3" description = "Happy Eyeballs for asyncio" optional = true python-versions = ">=3.8" files = [ - {file = "aiohappyeyeballs-2.4.0-py3-none-any.whl", hash = "sha256:7ce92076e249169a13c2f49320d1967425eaf1f407522d707d59cac7628d62bd"}, - {file = "aiohappyeyeballs-2.4.0.tar.gz", hash = "sha256:55a1714f084e63d49639800f95716da97a1f173d46a16dfcfda0016abb93b6b2"}, + {file = "aiohappyeyeballs-2.4.3-py3-none-any.whl", hash = "sha256:8a7a83727b2756f394ab2895ea0765a0a8c475e3c71e98d43d76f22b4b435572"}, + {file = "aiohappyeyeballs-2.4.3.tar.gz", hash = "sha256:75cf88a15106a5002a8eb1dab212525c00d1f4c0fa96e551c9fbe6f09a621586"}, ] [[package]] name = "aiohttp" -version = "3.10.6" +version = "3.10.10" description = "Async http client/server framework (asyncio)" optional = true python-versions = ">=3.8" files = [ - {file = "aiohttp-3.10.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:682836fc672972cc3101cc9e30d49c5f7e8f1d010478d46119fe725a4545acfd"}, - {file = "aiohttp-3.10.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:289fa8a20018d0d5aa9e4b35d899bd51bcb80f0d5f365d9a23e30dac3b79159b"}, - {file = "aiohttp-3.10.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8617c96a20dd57e7e9d398ff9d04f3d11c4d28b1767273a5b1a018ada5a654d3"}, - {file = "aiohttp-3.10.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdbeff1b062751c2a2a55b171f7050fb7073633c699299d042e962aacdbe1a07"}, - {file = "aiohttp-3.10.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ea35d849cdd4a9268f910bff4497baebbc1aa3f2f625fd8ccd9ac99c860c621"}, - {file = "aiohttp-3.10.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:473961b3252f3b949bb84873d6e268fb6d8aa0ccc6eb7404fa58c76a326bb8e1"}, - {file = "aiohttp-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d2665c5df629eb2f981dab244c01bfa6cdc185f4ffa026639286c4d56fafb54"}, - {file 
= "aiohttp-3.10.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25d92f794f1332f656e3765841fc2b7ad5c26c3f3d01e8949eeb3495691cf9f4"}, - {file = "aiohttp-3.10.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9bd6b2033993d5ae80883bb29b83fb2b432270bbe067c2f53cc73bb57c46065f"}, - {file = "aiohttp-3.10.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:d7f408c43f5e75ea1edc152fb375e8f46ef916f545fb66d4aebcbcfad05e2796"}, - {file = "aiohttp-3.10.6-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:cf8b8560aa965f87bf9c13bf9fed7025993a155ca0ce8422da74bf46d18c2f5f"}, - {file = "aiohttp-3.10.6-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14477c4e52e2f17437b99893fd220ffe7d7ee41df5ebf931a92b8ca82e6fd094"}, - {file = "aiohttp-3.10.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:fb138fbf9f53928e779650f5ed26d0ea1ed8b2cab67f0ea5d63afa09fdc07593"}, - {file = "aiohttp-3.10.6-cp310-cp310-win32.whl", hash = "sha256:9843d683b8756971797be171ead21511d2215a2d6e3c899c6e3107fbbe826791"}, - {file = "aiohttp-3.10.6-cp310-cp310-win_amd64.whl", hash = "sha256:f8b8e49fe02f744d38352daca1dbef462c3874900bd8166516f6ea8e82b5aacf"}, - {file = "aiohttp-3.10.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f52e54fd776ad0da1006708762213b079b154644db54bcfc62f06eaa5b896402"}, - {file = "aiohttp-3.10.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:995ab1a238fd0d19dc65f2d222e5eb064e409665c6426a3e51d5101c1979ee84"}, - {file = "aiohttp-3.10.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0749c4d5a08a802dd66ecdf59b2df4d76b900004017468a7bb736c3b5a3dd902"}, - {file = "aiohttp-3.10.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e05b39158f2af0e2438cc2075cfc271f4ace0c3cc4a81ec95b27a0432e161951"}, - {file = "aiohttp-3.10.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a9f196c970db2dcde4f24317e06615363349dc357cf4d7a3b0716c20ac6d7bcd"}, - {file = "aiohttp-3.10.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:47647c8af04a70e07a2462931b0eba63146a13affa697afb4ecbab9d03a480ce"}, - {file = "aiohttp-3.10.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:669c0efe7e99f6d94d63274c06344bd0e9c8daf184ce5602a29bc39e00a18720"}, - {file = "aiohttp-3.10.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9721cdd83a994225352ca84cd537760d41a9da3c0eacb3ff534747ab8fba6d0"}, - {file = "aiohttp-3.10.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0b82c8ebed66ce182893e7c0b6b60ba2ace45b1df104feb52380edae266a4850"}, - {file = "aiohttp-3.10.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:b169f8e755e541b72e714b89a831b315bbe70db44e33fead28516c9e13d5f931"}, - {file = "aiohttp-3.10.6-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:0be3115753baf8b4153e64f9aa7bf6c0c64af57979aa900c31f496301b374570"}, - {file = "aiohttp-3.10.6-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e1f80cd17d81a404b6e70ef22bfe1870bafc511728397634ad5f5efc8698df56"}, - {file = "aiohttp-3.10.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6419728b08fb6380c66a470d2319cafcec554c81780e2114b7e150329b9a9a7f"}, - {file = "aiohttp-3.10.6-cp311-cp311-win32.whl", hash = "sha256:bd294dcdc1afdc510bb51d35444003f14e327572877d016d576ac3b9a5888a27"}, - {file = "aiohttp-3.10.6-cp311-cp311-win_amd64.whl", hash = "sha256:bf861da9a43d282d6dd9dcd64c23a0fccf2c5aa5cd7c32024513c8c79fb69de3"}, - {file = 
"aiohttp-3.10.6-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:2708baccdc62f4b1251e59c2aac725936a900081f079b88843dabcab0feeeb27"}, - {file = "aiohttp-3.10.6-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7475da7a5e2ccf1a1c86c8fee241e277f4874c96564d06f726d8df8e77683ef7"}, - {file = "aiohttp-3.10.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:02108326574ff60267b7b35b17ac5c0bbd0008ccb942ce4c48b657bb90f0b8aa"}, - {file = "aiohttp-3.10.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:029a019627b37fa9eac5c75cc54a6bb722c4ebbf5a54d8c8c0fb4dd8facf2702"}, - {file = "aiohttp-3.10.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8a637d387db6fdad95e293fab5433b775fd104ae6348d2388beaaa60d08b38c4"}, - {file = "aiohttp-3.10.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc1a16f3fc1944c61290d33c88dc3f09ba62d159b284c38c5331868425aca426"}, - {file = "aiohttp-3.10.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81b292f37969f9cc54f4643f0be7dacabf3612b3b4a65413661cf6c350226787"}, - {file = "aiohttp-3.10.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0754690a3a26e819173a34093798c155bafb21c3c640bff13be1afa1e9d421f9"}, - {file = "aiohttp-3.10.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:164ecd32e65467d86843dbb121a6666c3deb23b460e3f8aefdcaacae79eb718a"}, - {file = "aiohttp-3.10.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:438c5863feb761f7ca3270d48c292c334814459f61cc12bab5ba5b702d7c9e56"}, - {file = "aiohttp-3.10.6-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ba18573bb1de1063d222f41de64a0d3741223982dcea863b3f74646faf618ec7"}, - {file = "aiohttp-3.10.6-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:c82a94ddec996413a905f622f3da02c4359952aab8d817c01cf9915419525e95"}, - {file = "aiohttp-3.10.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92351aa5363fc3c1f872ca763f86730ced32b01607f0c9662b1fa711087968d0"}, - {file = "aiohttp-3.10.6-cp312-cp312-win32.whl", hash = "sha256:3e15e33bfc73fa97c228f72e05e8795e163a693fd5323549f49367c76a6e5883"}, - {file = "aiohttp-3.10.6-cp312-cp312-win_amd64.whl", hash = "sha256:fe517113fe4d35d9072b826c3e147d63c5f808ca8167d450b4f96c520c8a1d8d"}, - {file = "aiohttp-3.10.6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:482f74057ea13d387a7549d7a7ecb60e45146d15f3e58a2d93a0ad2d5a8457cd"}, - {file = "aiohttp-3.10.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:03fa40d1450ee5196e843315ddf74a51afc7e83d489dbfc380eecefea74158b1"}, - {file = "aiohttp-3.10.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1e52e59ed5f4cc3a3acfe2a610f8891f216f486de54d95d6600a2c9ba1581f4d"}, - {file = "aiohttp-3.10.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2b3935a22c9e41a8000d90588bed96cf395ef572dbb409be44c6219c61d900d"}, - {file = "aiohttp-3.10.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4bef1480ee50f75abcfcb4b11c12de1005968ca9d0172aec4a5057ba9f2b644f"}, - {file = "aiohttp-3.10.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:671745ea7db19693ce867359d503772177f0b20fa8f6ee1e74e00449f4c4151d"}, - {file = "aiohttp-3.10.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b50b367308ca8c12e0b50cba5773bc9abe64c428d3fd2bbf5cd25aab37c77bf"}, - {file = 
"aiohttp-3.10.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a504d7cdb431a777d05a124fd0b21efb94498efa743103ea01b1e3136d2e4fb"}, - {file = "aiohttp-3.10.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:66bc81361131763660b969132a22edce2c4d184978ba39614e8f8f95db5c95f8"}, - {file = "aiohttp-3.10.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:27cf19a38506e2e9f12fc17e55f118f04897b0a78537055d93a9de4bf3022e3d"}, - {file = "aiohttp-3.10.6-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3468b39f977a11271517c6925b226720e148311039a380cc9117b1e2258a721f"}, - {file = "aiohttp-3.10.6-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:9d26da22a793dfd424be1050712a70c0afd96345245c29aced1e35dbace03413"}, - {file = "aiohttp-3.10.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:844d48ff9173d0b941abed8b2ea6a412f82b56d9ab1edb918c74000c15839362"}, - {file = "aiohttp-3.10.6-cp313-cp313-win32.whl", hash = "sha256:2dd56e3c43660ed3bea67fd4c5025f1ac1f9ecf6f0b991a6e5efe2e678c490c5"}, - {file = "aiohttp-3.10.6-cp313-cp313-win_amd64.whl", hash = "sha256:c91781d969fbced1993537f45efe1213bd6fccb4b37bfae2a026e20d6fbed206"}, - {file = "aiohttp-3.10.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:4407a80bca3e694f2d2a523058e20e1f9f98a416619e04f6dc09dc910352ac8b"}, - {file = "aiohttp-3.10.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1cb045ec5961f51af3e2c08cd6fe523f07cc6e345033adee711c49b7b91bb954"}, - {file = "aiohttp-3.10.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4fabdcdc781a36b8fd7b2ca9dea8172f29a99e11d00ca0f83ffeb50958da84a1"}, - {file = "aiohttp-3.10.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79a9f42efcc2681790595ab3d03c0e52d01edc23a0973ea09f0dc8d295e12b8e"}, - {file = "aiohttp-3.10.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cca776a440795db437d82c07455761c85bbcf3956221c3c23b8c93176c278ce7"}, - {file = "aiohttp-3.10.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5582de171f0898139cf51dd9fcdc79b848e28d9abd68e837f0803fc9f30807b1"}, - {file = "aiohttp-3.10.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:370e2d47575c53c817ee42a18acc34aad8da4dbdaac0a6c836d58878955f1477"}, - {file = "aiohttp-3.10.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:444d1704e2af6b30766debed9be8a795958029e552fe77551355badb1944012c"}, - {file = "aiohttp-3.10.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:40271a2a375812967401c9ca8077de9368e09a43a964f4dce0ff603301ec9358"}, - {file = "aiohttp-3.10.6-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:f3af26f86863fad12e25395805bb0babbd49d512806af91ec9708a272b696248"}, - {file = "aiohttp-3.10.6-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4752df44df48fd42b80f51d6a97553b482cda1274d9dc5df214a3a1aa5d8f018"}, - {file = "aiohttp-3.10.6-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:2cd5290ab66cfca2f90045db2cc6434c1f4f9fbf97c9f1c316e785033782e7d2"}, - {file = "aiohttp-3.10.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:3427031064b0d5c95647e6369c4aa3c556402f324a3e18107cb09517abe5f962"}, - {file = "aiohttp-3.10.6-cp38-cp38-win32.whl", hash = "sha256:614fc21e86adc28e4165a6391f851a6da6e9cbd7bb232d0df7718b453a89ee98"}, - {file = "aiohttp-3.10.6-cp38-cp38-win_amd64.whl", hash = "sha256:58c5d7318a136a3874c78717dd6de57519bc64f6363c5827c2b1cb775bea71dd"}, - {file = 
"aiohttp-3.10.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5db26bbca8e7968c4c977a0c640e0b9ce7224e1f4dcafa57870dc6ee28e27de6"}, - {file = "aiohttp-3.10.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3fb4216e3ec0dbc01db5ba802f02ed78ad8f07121be54eb9e918448cc3f61b7c"}, - {file = "aiohttp-3.10.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a976ef488f26e224079deb3d424f29144c6d5ba4ded313198169a8af8f47fb82"}, - {file = "aiohttp-3.10.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a86610174de8a85a920e956e2d4f9945e7da89f29a00e95ac62a4a414c4ef4e"}, - {file = "aiohttp-3.10.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:217791c6a399cc4f2e6577bb44344cba1f5714a2aebf6a0bea04cfa956658284"}, - {file = "aiohttp-3.10.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ba3662d41abe2eab0eeec7ee56f33ef4e0b34858f38abf24377687f9e1fb00a5"}, - {file = "aiohttp-3.10.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4dfa5ad4bce9ca30a76117fbaa1c1decf41ebb6c18a4e098df44298941566f9"}, - {file = "aiohttp-3.10.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e0009258e97502936d3bd5bf2ced15769629097d0abb81e6495fba1047824fe0"}, - {file = "aiohttp-3.10.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0a75d5c9fb4f06c41d029ae70ad943c3a844c40c0a769d12be4b99b04f473d3d"}, - {file = "aiohttp-3.10.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:8198b7c002aae2b40b2d16bfe724b9a90bcbc9b78b2566fc96131ef4e382574d"}, - {file = "aiohttp-3.10.6-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:4611db8c907f90fe86be112efdc2398cd7b4c8eeded5a4f0314b70fdea8feab0"}, - {file = "aiohttp-3.10.6-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ff99ae06eef85c7a565854826114ced72765832ee16c7e3e766c5e4c5b98d20e"}, - {file = "aiohttp-3.10.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7641920bdcc7cd2d3ddfb8bb9133a6c9536b09dbd49490b79e125180b2d25b93"}, - {file = "aiohttp-3.10.6-cp39-cp39-win32.whl", hash = "sha256:e2e7d5591ea868d5ec82b90bbeb366a198715672841d46281b623e23079593db"}, - {file = "aiohttp-3.10.6-cp39-cp39-win_amd64.whl", hash = "sha256:b504c08c45623bf5c7ca41be380156d925f00199b3970efd758aef4a77645feb"}, - {file = "aiohttp-3.10.6.tar.gz", hash = "sha256:d2578ef941be0c2ba58f6f421a703527d08427237ed45ecb091fed6f83305336"}, + {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:be7443669ae9c016b71f402e43208e13ddf00912f47f623ee5994e12fc7d4b3f"}, + {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7b06b7843929e41a94ea09eb1ce3927865387e3e23ebe108e0d0d09b08d25be9"}, + {file = "aiohttp-3.10.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:333cf6cf8e65f6a1e06e9eb3e643a0c515bb850d470902274239fea02033e9a8"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:274cfa632350225ce3fdeb318c23b4a10ec25c0e2c880eff951a3842cf358ac1"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9e5e4a85bdb56d224f412d9c98ae4cbd032cc4f3161818f692cd81766eee65a"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b606353da03edcc71130b52388d25f9a30a126e04caef1fd637e31683033abd"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab5a5a0c7a7991d90446a198689c0535be89bbd6b410a1f9a66688f0880ec026"}, + {file = 
"aiohttp-3.10.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:578a4b875af3e0daaf1ac6fa983d93e0bbfec3ead753b6d6f33d467100cdc67b"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8105fd8a890df77b76dd3054cddf01a879fc13e8af576805d667e0fa0224c35d"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3bcd391d083f636c06a68715e69467963d1f9600f85ef556ea82e9ef25f043f7"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fbc6264158392bad9df19537e872d476f7c57adf718944cc1e4495cbabf38e2a"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e48d5021a84d341bcaf95c8460b152cfbad770d28e5fe14a768988c461b821bc"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2609e9ab08474702cc67b7702dbb8a80e392c54613ebe80db7e8dbdb79837c68"}, + {file = "aiohttp-3.10.10-cp310-cp310-win32.whl", hash = "sha256:84afcdea18eda514c25bc68b9af2a2b1adea7c08899175a51fe7c4fb6d551257"}, + {file = "aiohttp-3.10.10-cp310-cp310-win_amd64.whl", hash = "sha256:9c72109213eb9d3874f7ac8c0c5fa90e072d678e117d9061c06e30c85b4cf0e6"}, + {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c30a0eafc89d28e7f959281b58198a9fa5e99405f716c0289b7892ca345fe45f"}, + {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:258c5dd01afc10015866114e210fb7365f0d02d9d059c3c3415382ab633fcbcb"}, + {file = "aiohttp-3.10.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:15ecd889a709b0080f02721255b3f80bb261c2293d3c748151274dfea93ac871"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3935f82f6f4a3820270842e90456ebad3af15810cf65932bd24da4463bc0a4c"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:413251f6fcf552a33c981c4709a6bba37b12710982fec8e558ae944bfb2abd38"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1720b4f14c78a3089562b8875b53e36b51c97c51adc53325a69b79b4b48ebcb"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:679abe5d3858b33c2cf74faec299fda60ea9de62916e8b67e625d65bf069a3b7"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:79019094f87c9fb44f8d769e41dbb664d6e8fcfd62f665ccce36762deaa0e911"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fe2fb38c2ed905a2582948e2de560675e9dfbee94c6d5ccdb1301c6d0a5bf092"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a3f00003de6eba42d6e94fabb4125600d6e484846dbf90ea8e48a800430cc142"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1bbb122c557a16fafc10354b9d99ebf2f2808a660d78202f10ba9d50786384b9"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:30ca7c3b94708a9d7ae76ff281b2f47d8eaf2579cd05971b5dc681db8caac6e1"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:df9270660711670e68803107d55c2b5949c2e0f2e4896da176e1ecfc068b974a"}, + {file = "aiohttp-3.10.10-cp311-cp311-win32.whl", hash = "sha256:aafc8ee9b742ce75044ae9a4d3e60e3d918d15a4c2e08a6c3c3e38fa59b92d94"}, + {file = "aiohttp-3.10.10-cp311-cp311-win_amd64.whl", hash = 
"sha256:362f641f9071e5f3ee6f8e7d37d5ed0d95aae656adf4ef578313ee585b585959"}, + {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9294bbb581f92770e6ed5c19559e1e99255e4ca604a22c5c6397b2f9dd3ee42c"}, + {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a8fa23fe62c436ccf23ff930149c047f060c7126eae3ccea005f0483f27b2e28"}, + {file = "aiohttp-3.10.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c6a5b8c7926ba5d8545c7dd22961a107526562da31a7a32fa2456baf040939f"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:007ec22fbc573e5eb2fb7dec4198ef8f6bf2fe4ce20020798b2eb5d0abda6138"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9627cc1a10c8c409b5822a92d57a77f383b554463d1884008e051c32ab1b3742"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50edbcad60d8f0e3eccc68da67f37268b5144ecc34d59f27a02f9611c1d4eec7"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a45d85cf20b5e0d0aa5a8dca27cce8eddef3292bc29d72dcad1641f4ed50aa16"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b00807e2605f16e1e198f33a53ce3c4523114059b0c09c337209ae55e3823a8"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f2d4324a98062be0525d16f768a03e0bbb3b9fe301ceee99611dc9a7953124e6"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:438cd072f75bb6612f2aca29f8bd7cdf6e35e8f160bc312e49fbecab77c99e3a"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:baa42524a82f75303f714108fea528ccacf0386af429b69fff141ffef1c534f9"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a7d8d14fe962153fc681f6366bdec33d4356f98a3e3567782aac1b6e0e40109a"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c1277cd707c465cd09572a774559a3cc7c7a28802eb3a2a9472588f062097205"}, + {file = "aiohttp-3.10.10-cp312-cp312-win32.whl", hash = "sha256:59bb3c54aa420521dc4ce3cc2c3fe2ad82adf7b09403fa1f48ae45c0cbde6628"}, + {file = "aiohttp-3.10.10-cp312-cp312-win_amd64.whl", hash = "sha256:0e1b370d8007c4ae31ee6db7f9a2fe801a42b146cec80a86766e7ad5c4a259cf"}, + {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ad7593bb24b2ab09e65e8a1d385606f0f47c65b5a2ae6c551db67d6653e78c28"}, + {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1eb89d3d29adaf533588f209768a9c02e44e4baf832b08118749c5fad191781d"}, + {file = "aiohttp-3.10.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3fe407bf93533a6fa82dece0e74dbcaaf5d684e5a51862887f9eaebe6372cd79"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50aed5155f819873d23520919e16703fc8925e509abbb1a1491b0087d1cd969e"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f05e9727ce409358baa615dbeb9b969db94324a79b5a5cea45d39bdb01d82e6"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dffb610a30d643983aeb185ce134f97f290f8935f0abccdd32c77bed9388b42"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:aa6658732517ddabe22c9036479eabce6036655ba87a0224c612e1ae6af2087e"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:741a46d58677d8c733175d7e5aa618d277cd9d880301a380fd296975a9cdd7bc"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e00e3505cd80440f6c98c6d69269dcc2a119f86ad0a9fd70bccc59504bebd68a"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ffe595f10566f8276b76dc3a11ae4bb7eba1aac8ddd75811736a15b0d5311414"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdfcf6443637c148c4e1a20c48c566aa694fa5e288d34b20fcdc58507882fed3"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d183cf9c797a5291e8301790ed6d053480ed94070637bfaad914dd38b0981f67"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:77abf6665ae54000b98b3c742bc6ea1d1fb31c394bcabf8b5d2c1ac3ebfe7f3b"}, + {file = "aiohttp-3.10.10-cp313-cp313-win32.whl", hash = "sha256:4470c73c12cd9109db8277287d11f9dd98f77fc54155fc71a7738a83ffcc8ea8"}, + {file = "aiohttp-3.10.10-cp313-cp313-win_amd64.whl", hash = "sha256:486f7aabfa292719a2753c016cc3a8f8172965cabb3ea2e7f7436c7f5a22a151"}, + {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:1b66ccafef7336a1e1f0e389901f60c1d920102315a56df85e49552308fc0486"}, + {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:acd48d5b80ee80f9432a165c0ac8cbf9253eaddb6113269a5e18699b33958dbb"}, + {file = "aiohttp-3.10.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3455522392fb15ff549d92fbf4b73b559d5e43dc522588f7eb3e54c3f38beee7"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45c3b868724137f713a38376fef8120c166d1eadd50da1855c112fe97954aed8"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:da1dee8948d2137bb51fbb8a53cce6b1bcc86003c6b42565f008438b806cccd8"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c5ce2ce7c997e1971b7184ee37deb6ea9922ef5163c6ee5aa3c274b05f9e12fa"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28529e08fde6f12eba8677f5a8608500ed33c086f974de68cc65ab218713a59d"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f7db54c7914cc99d901d93a34704833568d86c20925b2762f9fa779f9cd2e70f"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:03a42ac7895406220124c88911ebee31ba8b2d24c98507f4a8bf826b2937c7f2"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:7e338c0523d024fad378b376a79faff37fafb3c001872a618cde1d322400a572"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:038f514fe39e235e9fef6717fbf944057bfa24f9b3db9ee551a7ecf584b5b480"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:64f6c17757251e2b8d885d728b6433d9d970573586a78b78ba8929b0f41d045a"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:93429602396f3383a797a2a70e5f1de5df8e35535d7806c9f91df06f297e109b"}, + {file = "aiohttp-3.10.10-cp38-cp38-win32.whl", hash = "sha256:c823bc3971c44ab93e611ab1a46b1eafeae474c0c844aff4b7474287b75fe49c"}, + {file = "aiohttp-3.10.10-cp38-cp38-win_amd64.whl", hash = 
"sha256:54ca74df1be3c7ca1cf7f4c971c79c2daf48d9aa65dea1a662ae18926f5bc8ce"}, + {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01948b1d570f83ee7bbf5a60ea2375a89dfb09fd419170e7f5af029510033d24"}, + {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9fc1500fd2a952c5c8e3b29aaf7e3cc6e27e9cfc0a8819b3bce48cc1b849e4cc"}, + {file = "aiohttp-3.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f614ab0c76397661b90b6851a030004dac502e48260ea10f2441abd2207fbcc7"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00819de9e45d42584bed046314c40ea7e9aea95411b38971082cad449392b08c"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05646ebe6b94cc93407b3bf34b9eb26c20722384d068eb7339de802154d61bc5"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:998f3bd3cfc95e9424a6acd7840cbdd39e45bc09ef87533c006f94ac47296090"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9010c31cd6fa59438da4e58a7f19e4753f7f264300cd152e7f90d4602449762"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ea7ffc6d6d6f8a11e6f40091a1040995cdff02cfc9ba4c2f30a516cb2633554"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ef9c33cc5cbca35808f6c74be11eb7f5f6b14d2311be84a15b594bd3e58b5527"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ce0cdc074d540265bfeb31336e678b4e37316849d13b308607efa527e981f5c2"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:597a079284b7ee65ee102bc3a6ea226a37d2b96d0418cc9047490f231dc09fe8"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:7789050d9e5d0c309c706953e5e8876e38662d57d45f936902e176d19f1c58ab"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e7f8b04d83483577fd9200461b057c9f14ced334dcb053090cea1da9c8321a91"}, + {file = "aiohttp-3.10.10-cp39-cp39-win32.whl", hash = "sha256:c02a30b904282777d872266b87b20ed8cc0d1501855e27f831320f471d54d983"}, + {file = "aiohttp-3.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:edfe3341033a6b53a5c522c802deb2079eee5cbfbb0af032a55064bd65c73a23"}, + {file = "aiohttp-3.10.10.tar.gz", hash = "sha256:0631dd7c9f0822cc61c88586ca76d5b5ada26538097d0f1df510b082bad3411a"}, ] [package.dependencies] @@ -240,101 +240,116 @@ files = [ [[package]] name = "charset-normalizer" -version = "3.3.2" +version = "3.4.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false python-versions = ">=3.7.0" files = [ - {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, - {file = 
"charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, - {file = 
"charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = 
"sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, - {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"}, + {file = 
"charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = 
"sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = "sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"}, + {file = 
"charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"}, + {file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"}, + {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, ] [[package]] @@ -353,13 +368,13 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} [[package]] name = "cloudpickle" -version = "3.0.0" +version = "3.1.0" description = "Pickler class to extend the standard pickle.Pickler functionality" optional = true python-versions = ">=3.8" files = [ - {file = "cloudpickle-3.0.0-py3-none-any.whl", hash = "sha256:246ee7d0c295602a036e86369c77fecda4ab17b506496730f2f576d9016fd9c7"}, - {file = "cloudpickle-3.0.0.tar.gz", hash = "sha256:996d9a482c6fb4f33c1a35335cf8afd065d2a56e973270364840712d9131a882"}, + {file = "cloudpickle-3.1.0-py3-none-any.whl", hash = "sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e"}, + {file = "cloudpickle-3.1.0.tar.gz", hash = "sha256:81a929b6e3c7335c863c771d673d105f02efdb89dfaba0c90495d1c64796601b"}, ] [[package]] @@ -665,61 +680,70 @@ testing = ["protobuf (>=4.21.9)"] [[package]] name = "grpcio" -version = "1.66.1" +version = "1.67.0" description = "HTTP/2-based RPC framework" optional = false python-versions = ">=3.8" files = [ - {file = "grpcio-1.66.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:4877ba180591acdf127afe21ec1c7ff8a5ecf0fe2600f0d3c50e8c4a1cbc6492"}, - {file = "grpcio-1.66.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:3750c5a00bd644c75f4507f77a804d0189d97a107eb1481945a0cf3af3e7a5ac"}, - {file = "grpcio-1.66.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:a013c5fbb12bfb5f927444b477a26f1080755a931d5d362e6a9a720ca7dbae60"}, - {file = "grpcio-1.66.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b1b24c23d51a1e8790b25514157d43f0a4dce1ac12b3f0b8e9f66a5e2c4c132f"}, - {file = "grpcio-1.66.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7ffb8ea674d68de4cac6f57d2498fef477cef582f1fa849e9f844863af50083"}, - {file = "grpcio-1.66.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:307b1d538140f19ccbd3aed7a93d8f71103c5d525f3c96f8616111614b14bf2a"}, - {file = "grpcio-1.66.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1c17ebcec157cfb8dd445890a03e20caf6209a5bd4ac5b040ae9dbc59eef091d"}, - {file = "grpcio-1.66.1-cp310-cp310-win32.whl", hash = "sha256:ef82d361ed5849d34cf09105d00b94b6728d289d6b9235513cb2fcc79f7c432c"}, - {file = "grpcio-1.66.1-cp310-cp310-win_amd64.whl", hash = "sha256:292a846b92cdcd40ecca46e694997dd6b9be6c4c01a94a0dfb3fcb75d20da858"}, - {file = "grpcio-1.66.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:c30aeceeaff11cd5ddbc348f37c58bcb96da8d5aa93fed78ab329de5f37a0d7a"}, - {file = "grpcio-1.66.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8a1e224ce6f740dbb6b24c58f885422deebd7eb724aff0671a847f8951857c26"}, - {file = "grpcio-1.66.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:a66fe4dc35d2330c185cfbb42959f57ad36f257e0cc4557d11d9f0a3f14311df"}, - {file = "grpcio-1.66.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3ba04659e4fce609de2658fe4dbf7d6ed21987a94460f5f92df7579fd5d0e22"}, - {file = 
"grpcio-1.66.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4573608e23f7e091acfbe3e84ac2045680b69751d8d67685ffa193a4429fedb1"}, - {file = "grpcio-1.66.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7e06aa1f764ec8265b19d8f00140b8c4b6ca179a6dc67aa9413867c47e1fb04e"}, - {file = "grpcio-1.66.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3885f037eb11f1cacc41f207b705f38a44b69478086f40608959bf5ad85826dd"}, - {file = "grpcio-1.66.1-cp311-cp311-win32.whl", hash = "sha256:97ae7edd3f3f91480e48ede5d3e7d431ad6005bfdbd65c1b56913799ec79e791"}, - {file = "grpcio-1.66.1-cp311-cp311-win_amd64.whl", hash = "sha256:cfd349de4158d797db2bd82d2020554a121674e98fbe6b15328456b3bf2495bb"}, - {file = "grpcio-1.66.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:a92c4f58c01c77205df6ff999faa008540475c39b835277fb8883b11cada127a"}, - {file = "grpcio-1.66.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:fdb14bad0835914f325349ed34a51940bc2ad965142eb3090081593c6e347be9"}, - {file = "grpcio-1.66.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:f03a5884c56256e08fd9e262e11b5cfacf1af96e2ce78dc095d2c41ccae2c80d"}, - {file = "grpcio-1.66.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ca2559692d8e7e245d456877a85ee41525f3ed425aa97eb7a70fc9a79df91a0"}, - {file = "grpcio-1.66.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84ca1be089fb4446490dd1135828bd42a7c7f8421e74fa581611f7afdf7ab761"}, - {file = "grpcio-1.66.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:d639c939ad7c440c7b2819a28d559179a4508783f7e5b991166f8d7a34b52815"}, - {file = "grpcio-1.66.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:b9feb4e5ec8dc2d15709f4d5fc367794d69277f5d680baf1910fc9915c633524"}, - {file = "grpcio-1.66.1-cp312-cp312-win32.whl", hash = "sha256:7101db1bd4cd9b880294dec41a93fcdce465bdbb602cd8dc5bd2d6362b618759"}, - {file = "grpcio-1.66.1-cp312-cp312-win_amd64.whl", hash = "sha256:b0aa03d240b5539648d996cc60438f128c7f46050989e35b25f5c18286c86734"}, - {file = "grpcio-1.66.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:ecfe735e7a59e5a98208447293ff8580e9db1e890e232b8b292dc8bd15afc0d2"}, - {file = "grpcio-1.66.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:4825a3aa5648010842e1c9d35a082187746aa0cdbf1b7a2a930595a94fb10fce"}, - {file = "grpcio-1.66.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:f517fd7259fe823ef3bd21e508b653d5492e706e9f0ef82c16ce3347a8a5620c"}, - {file = "grpcio-1.66.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1fe60d0772831d96d263b53d83fb9a3d050a94b0e94b6d004a5ad111faa5b5b"}, - {file = "grpcio-1.66.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31a049daa428f928f21090403e5d18ea02670e3d5d172581670be006100db9ef"}, - {file = "grpcio-1.66.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6f914386e52cbdeb5d2a7ce3bf1fdfacbe9d818dd81b6099a05b741aaf3848bb"}, - {file = "grpcio-1.66.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bff2096bdba686019fb32d2dde45b95981f0d1490e054400f70fc9a8af34b49d"}, - {file = "grpcio-1.66.1-cp38-cp38-win32.whl", hash = "sha256:aa8ba945c96e73de29d25331b26f3e416e0c0f621e984a3ebdb2d0d0b596a3b3"}, - {file = "grpcio-1.66.1-cp38-cp38-win_amd64.whl", hash = "sha256:161d5c535c2bdf61b95080e7f0f017a1dfcb812bf54093e71e5562b16225b4ce"}, - {file = "grpcio-1.66.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:d0cd7050397b3609ea51727b1811e663ffda8bda39c6a5bb69525ef12414b503"}, - {file = 
"grpcio-1.66.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:0e6c9b42ded5d02b6b1fea3a25f036a2236eeb75d0579bfd43c0018c88bf0a3e"}, - {file = "grpcio-1.66.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:c9f80f9fad93a8cf71c7f161778ba47fd730d13a343a46258065c4deb4b550c0"}, - {file = "grpcio-1.66.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5dd67ed9da78e5121efc5c510f0122a972216808d6de70953a740560c572eb44"}, - {file = "grpcio-1.66.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48b0d92d45ce3be2084b92fb5bae2f64c208fea8ceed7fccf6a7b524d3c4942e"}, - {file = "grpcio-1.66.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4d813316d1a752be6f5c4360c49f55b06d4fe212d7df03253dfdae90c8a402bb"}, - {file = "grpcio-1.66.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9c9bebc6627873ec27a70fc800f6083a13c70b23a5564788754b9ee52c5aef6c"}, - {file = "grpcio-1.66.1-cp39-cp39-win32.whl", hash = "sha256:30a1c2cf9390c894c90bbc70147f2372130ad189cffef161f0432d0157973f45"}, - {file = "grpcio-1.66.1-cp39-cp39-win_amd64.whl", hash = "sha256:17663598aadbedc3cacd7bbde432f541c8e07d2496564e22b214b22c7523dac8"}, - {file = "grpcio-1.66.1.tar.gz", hash = "sha256:35334f9c9745add3e357e3372756fd32d925bd52c41da97f4dfdafbde0bf0ee2"}, + {file = "grpcio-1.67.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:bd79929b3bb96b54df1296cd3bf4d2b770bd1df6c2bdf549b49bab286b925cdc"}, + {file = "grpcio-1.67.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:16724ffc956ea42967f5758c2f043faef43cb7e48a51948ab593570570d1e68b"}, + {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:2b7183c80b602b0ad816315d66f2fb7887614ead950416d60913a9a71c12560d"}, + {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:efe32b45dd6d118f5ea2e5deaed417d8a14976325c93812dd831908522b402c9"}, + {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe89295219b9c9e47780a0f1c75ca44211e706d1c598242249fe717af3385ec8"}, + {file = "grpcio-1.67.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:aa8d025fae1595a207b4e47c2e087cb88d47008494db258ac561c00877d4c8f8"}, + {file = "grpcio-1.67.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f95e15db43e75a534420e04822df91f645664bf4ad21dfaad7d51773c80e6bb4"}, + {file = "grpcio-1.67.0-cp310-cp310-win32.whl", hash = "sha256:a6b9a5c18863fd4b6624a42e2712103fb0f57799a3b29651c0e5b8119a519d65"}, + {file = "grpcio-1.67.0-cp310-cp310-win_amd64.whl", hash = "sha256:b6eb68493a05d38b426604e1dc93bfc0137c4157f7ab4fac5771fd9a104bbaa6"}, + {file = "grpcio-1.67.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:e91d154689639932305b6ea6f45c6e46bb51ecc8ea77c10ef25aa77f75443ad4"}, + {file = "grpcio-1.67.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:cb204a742997277da678611a809a8409657b1398aaeebf73b3d9563b7d154c13"}, + {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:ae6de510f670137e755eb2a74b04d1041e7210af2444103c8c95f193340d17ee"}, + {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74b900566bdf68241118f2918d312d3bf554b2ce0b12b90178091ea7d0a17b3d"}, + {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4e95e43447a02aa603abcc6b5e727d093d161a869c83b073f50b9390ecf0fa8"}, + {file = "grpcio-1.67.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0bb94e66cd8f0baf29bd3184b6aa09aeb1a660f9ec3d85da615c5003154bc2bf"}, + {file = 
"grpcio-1.67.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:82e5bd4b67b17c8c597273663794a6a46a45e44165b960517fe6d8a2f7f16d23"}, + {file = "grpcio-1.67.0-cp311-cp311-win32.whl", hash = "sha256:7fc1d2b9fd549264ae585026b266ac2db53735510a207381be509c315b4af4e8"}, + {file = "grpcio-1.67.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac11ecb34a86b831239cc38245403a8de25037b448464f95c3315819e7519772"}, + {file = "grpcio-1.67.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:227316b5631260e0bef8a3ce04fa7db4cc81756fea1258b007950b6efc90c05d"}, + {file = "grpcio-1.67.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:d90cfdafcf4b45a7a076e3e2a58e7bc3d59c698c4f6470b0bb13a4d869cf2273"}, + {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:77196216d5dd6f99af1c51e235af2dd339159f657280e65ce7e12c1a8feffd1d"}, + {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15c05a26a0f7047f720da41dc49406b395c1470eef44ff7e2c506a47ac2c0591"}, + {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3840994689cc8cbb73d60485c594424ad8adb56c71a30d8948d6453083624b52"}, + {file = "grpcio-1.67.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:5a1e03c3102b6451028d5dc9f8591131d6ab3c8a0e023d94c28cb930ed4b5f81"}, + {file = "grpcio-1.67.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:682968427a63d898759474e3b3178d42546e878fdce034fd7474ef75143b64e3"}, + {file = "grpcio-1.67.0-cp312-cp312-win32.whl", hash = "sha256:d01793653248f49cf47e5695e0a79805b1d9d4eacef85b310118ba1dfcd1b955"}, + {file = "grpcio-1.67.0-cp312-cp312-win_amd64.whl", hash = "sha256:985b2686f786f3e20326c4367eebdaed3e7aa65848260ff0c6644f817042cb15"}, + {file = "grpcio-1.67.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:8c9a35b8bc50db35ab8e3e02a4f2a35cfba46c8705c3911c34ce343bd777813a"}, + {file = "grpcio-1.67.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:42199e704095b62688998c2d84c89e59a26a7d5d32eed86d43dc90e7a3bd04aa"}, + {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:c4c425f440fb81f8d0237c07b9322fc0fb6ee2b29fbef5f62a322ff8fcce240d"}, + {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:323741b6699cd2b04a71cb38f502db98f90532e8a40cb675393d248126a268af"}, + {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:662c8e105c5e5cee0317d500eb186ed7a93229586e431c1bf0c9236c2407352c"}, + {file = "grpcio-1.67.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:f6bd2ab135c64a4d1e9e44679a616c9bc944547357c830fafea5c3caa3de5153"}, + {file = "grpcio-1.67.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:2f55c1e0e2ae9bdd23b3c63459ee4c06d223b68aeb1961d83c48fb63dc29bc03"}, + {file = "grpcio-1.67.0-cp313-cp313-win32.whl", hash = "sha256:fd6bc27861e460fe28e94226e3673d46e294ca4673d46b224428d197c5935e69"}, + {file = "grpcio-1.67.0-cp313-cp313-win_amd64.whl", hash = "sha256:cf51d28063338608cd8d3cd64677e922134837902b70ce00dad7f116e3998210"}, + {file = "grpcio-1.67.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:7f200aca719c1c5dc72ab68be3479b9dafccdf03df530d137632c534bb6f1ee3"}, + {file = "grpcio-1.67.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:0892dd200ece4822d72dd0952f7112c542a487fc48fe77568deaaa399c1e717d"}, + {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:f4d613fbf868b2e2444f490d18af472ccb47660ea3df52f068c9c8801e1f3e85"}, + {file = 
"grpcio-1.67.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c69bf11894cad9da00047f46584d5758d6ebc9b5950c0dc96fec7e0bce5cde9"}, + {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9bca3ca0c5e74dea44bf57d27e15a3a3996ce7e5780d61b7c72386356d231db"}, + {file = "grpcio-1.67.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:014dfc020e28a0d9be7e93a91f85ff9f4a87158b7df9952fe23cc42d29d31e1e"}, + {file = "grpcio-1.67.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d4ea4509d42c6797539e9ec7496c15473177ce9abc89bc5c71e7abe50fc25737"}, + {file = "grpcio-1.67.0-cp38-cp38-win32.whl", hash = "sha256:9d75641a2fca9ae1ae86454fd25d4c298ea8cc195dbc962852234d54a07060ad"}, + {file = "grpcio-1.67.0-cp38-cp38-win_amd64.whl", hash = "sha256:cff8e54d6a463883cda2fab94d2062aad2f5edd7f06ae3ed030f2a74756db365"}, + {file = "grpcio-1.67.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:62492bd534979e6d7127b8a6b29093161a742dee3875873e01964049d5250a74"}, + {file = "grpcio-1.67.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eef1dce9d1a46119fd09f9a992cf6ab9d9178b696382439446ca5f399d7b96fe"}, + {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:f623c57a5321461c84498a99dddf9d13dac0e40ee056d884d6ec4ebcab647a78"}, + {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54d16383044e681f8beb50f905249e4e7261dd169d4aaf6e52eab67b01cbbbe2"}, + {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2a44e572fb762c668e4812156b81835f7aba8a721b027e2d4bb29fb50ff4d33"}, + {file = "grpcio-1.67.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:391df8b0faac84d42f5b8dfc65f5152c48ed914e13c522fd05f2aca211f8bfad"}, + {file = "grpcio-1.67.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfd9306511fdfc623a1ba1dc3bc07fbd24e6cfbe3c28b4d1e05177baa2f99617"}, + {file = "grpcio-1.67.0-cp39-cp39-win32.whl", hash = "sha256:30d47dbacfd20cbd0c8be9bfa52fdb833b395d4ec32fe5cff7220afc05d08571"}, + {file = "grpcio-1.67.0-cp39-cp39-win_amd64.whl", hash = "sha256:f55f077685f61f0fbd06ea355142b71e47e4a26d2d678b3ba27248abfe67163a"}, + {file = "grpcio-1.67.0.tar.gz", hash = "sha256:e090b2553e0da1c875449c8e75073dd4415dd71c9bde6a406240fdf4c0ee467c"}, ] [package.extras] -protobuf = ["grpcio-tools (>=1.66.1)"] +protobuf = ["grpcio-tools (>=1.67.0)"] [[package]] name = "grpcio-reflection" @@ -1018,13 +1042,13 @@ format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339- [[package]] name = "jsonschema-specifications" -version = "2023.12.1" +version = "2024.10.1" description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" optional = true -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "jsonschema_specifications-2023.12.1-py3-none-any.whl", hash = "sha256:87e4fdf3a94858b8a2ba2778d9ba57d8a9cafca7c7489c46ba0d30a8bc6a9c3c"}, - {file = "jsonschema_specifications-2023.12.1.tar.gz", hash = "sha256:48a76787b3e70f5ed53f1160d2b81f586e4ca6d1548c5de7085d1682674764cc"}, + {file = "jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf"}, + {file = "jsonschema_specifications-2024.10.1.tar.gz", hash = "sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272"}, ] [package.dependencies] @@ -1121,81 +1145,82 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] [[package]] name = "markupsafe" -version = "2.1.5" 
+version = "3.0.2" description = "Safely add untrusted strings to HTML/XML markup." optional = true -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"}, - {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = 
"sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30"}, + {file = 
"MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, + {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, ] [[package]] name = "marlin-kernels" -version = "0.2.0" +version = "0.3.0" description = "Marlin quantization kernels" optional = true python-versions = ">=3.7" files = [ - {file = "marlin_kernels-0.2.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:9a5afcf19b0f5917e43353cc19873fb3c4d4d0b924e2a95a37884f9ce208d0bd"}, + {file = "marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:a2086b9e98d22071f52c5b4b4b98b1b4a988565258905173fa74c5a9eddd1a0a"}, ] [package.dependencies] @@ -1203,16 +1228,16 @@ torch = "*" [package.source] type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" [[package]] name = "marlin-kernels" -version = "0.2.0" +version = "0.3.0" description = "Marlin quantization kernels" optional = true python-versions = ">=3.7" files = [ - {file = "marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:1e64fcc7ebadfaffa60091ee9201ae3daaf5c1be3be60c8c054143a3dcb72d5d"}, + {file = "marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:f39a6946d8247629446ec170832d832c7038c363f1d8803211fe67249c2d804d"}, ] [package.dependencies] @@ -1220,16 +1245,16 @@ torch = "*" [package.source] type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" [[package]] name = "marlin-kernels" -version = "0.2.0" +version = "0.3.0" description = "Marlin quantization kernels" optional = true python-versions = ">=3.7" files = [ - {file = "marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:e75f3ce9b1c13a4ed43a380d88e1d34d297259452db037ec1973ec33dc2eb78e"}, + {file = 
"marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:07fd869d5289777fa866107dae676523e18b1f6ba4afce79946ddc58a6870169"}, ] [package.dependencies] @@ -1237,16 +1262,16 @@ torch = "*" [package.source] type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" [[package]] name = "marlin-kernels" -version = "0.2.0" +version = "0.3.0" description = "Marlin quantization kernels" optional = true python-versions = ">=3.7" files = [ - {file = "marlin_kernels-0.2.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:2f99a27f70b391887ee6adffeeee7c3f4df7fac37393f9fb16d4cace2b3f6457"}, + {file = "marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:0dedaa418225d490a5f1d8f85dbc75e439a8c43a8870e4ef32945bf61672d7dc"}, ] [package.dependencies] @@ -1254,7 +1279,7 @@ torch = "*" [package.source] type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" [[package]] name = "mdurl" @@ -1598,46 +1623,50 @@ files = [ [[package]] name = "nvidia-cublas-cu12" -version = "12.1.3.1" +version = "12.4.5.8" description = "CUBLAS native runtime libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728"}, - {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-win_amd64.whl", hash = "sha256:2b964d60e8cf11b5e1073d179d85fa340c120e99b3067558f3cf98dd69d02906"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-win_amd64.whl", hash = "sha256:5a796786da89203a0657eda402bcdcec6180254a8ac22d72213abc42069522dc"}, ] [[package]] name = "nvidia-cuda-cupti-cu12" -version = "12.1.105" +version = "12.4.127" description = "CUDA profiling tools runtime libs." 
optional = true python-versions = ">=3" files = [ - {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e"}, - {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:bea8236d13a0ac7190bd2919c3e8e6ce1e402104276e6f9694479e48bb0eb2a4"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:5688d203301ab051449a2b1cb6690fbe90d2b372f411521c86018b950f3d7922"}, ] [[package]] name = "nvidia-cuda-nvrtc-cu12" -version = "12.1.105" +version = "12.4.127" description = "NVRTC native runtime libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2"}, - {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:0a98a522d9ff138b96c010a65e145dc1b4850e9ecb75a0172371793752fd46ed"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:a961b2f1d5f17b14867c619ceb99ef6fcec12e46612711bcec78eb05068a60ec"}, ] [[package]] name = "nvidia-cuda-runtime-cu12" -version = "12.1.105" +version = "12.4.127" description = "CUDA Runtime native Libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40"}, - {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:dfb46ef84d73fababab44cf03e3b83f80700d27ca300e537f85f636fac474344"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:09c2e35f48359752dfa822c09918211844a3d93c100a715d79b59591130c5e1e"}, ] [[package]] @@ -1656,35 +1685,41 @@ nvidia-cublas-cu12 = "*" [[package]] name = "nvidia-cufft-cu12" -version = "11.0.2.54" +version = "11.2.1.3" description = "CUFFT native runtime libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl", hash = "sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56"}, - {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-win_amd64.whl", hash = "sha256:d9ac353f78ff89951da4af698f80870b1534ed69993f10a4cf1d96f21357e253"}, + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399"}, + {file = 
"nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9"}, + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-win_amd64.whl", hash = "sha256:d802f4954291101186078ccbe22fc285a902136f974d369540fd4a5333d1440b"}, ] +[package.dependencies] +nvidia-nvjitlink-cu12 = "*" + [[package]] name = "nvidia-curand-cu12" -version = "10.3.2.106" +version = "10.3.5.147" description = "CURAND native runtime libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0"}, - {file = "nvidia_curand_cu12-10.3.2.106-py3-none-win_amd64.whl", hash = "sha256:75b6b0c574c0037839121317e17fd01f8a69fd2ef8e25853d826fec30bdba74a"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-win_amd64.whl", hash = "sha256:f307cc191f96efe9e8f05a87096abc20d08845a841889ef78cb06924437f6771"}, ] [[package]] name = "nvidia-cusolver-cu12" -version = "11.4.5.107" +version = "11.6.1.9" description = "CUDA solver native runtime libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd"}, - {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-win_amd64.whl", hash = "sha256:74e0c3a24c78612192a74fcd90dd117f1cf21dea4822e66d89e8ea80e3cd2da5"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-win_amd64.whl", hash = "sha256:e77314c9d7b694fcebc84f58989f3aa4fb4cb442f12ca1a9bde50f5e8f6d1b9c"}, ] [package.dependencies] @@ -1694,13 +1729,14 @@ nvidia-nvjitlink-cu12 = "*" [[package]] name = "nvidia-cusparse-cu12" -version = "12.1.0.106" +version = "12.3.1.170" description = "CUSPARSE native runtime libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c"}, - {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-win_amd64.whl", hash = "sha256:b798237e81b9719373e8fae8d4f091b70a0cf09d9d85c95a557e11df2d8e9a5a"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-win_amd64.whl", hash = "sha256:9bc90fb087bc7b4c15641521f31c0371e9a612fc2ba12c338d3ae032e6b6797f"}, ] [package.dependencies] @@ -1719,36 +1755,35 @@ files = [ [[package]] name = "nvidia-nccl-cu12" -version = "2.20.5" +version = "2.21.5" description = "NVIDIA Collective Communication Library (NCCL) Runtime" optional 
= true python-versions = ">=3" files = [ - {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01"}, - {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56"}, + {file = "nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0"}, ] [[package]] name = "nvidia-nvjitlink-cu12" -version = "12.6.68" +version = "12.4.127" description = "Nvidia JIT LTO Library" optional = true python-versions = ">=3" files = [ - {file = "nvidia_nvjitlink_cu12-12.6.68-py3-none-manylinux2014_aarch64.whl", hash = "sha256:b3fd0779845f68b92063ab1393abab1ed0a23412fc520df79a8190d098b5cd6b"}, - {file = "nvidia_nvjitlink_cu12-12.6.68-py3-none-manylinux2014_x86_64.whl", hash = "sha256:125a6c2a44e96386dda634e13d944e60b07a0402d391a070e8fb4104b34ea1ab"}, - {file = "nvidia_nvjitlink_cu12-12.6.68-py3-none-win_amd64.whl", hash = "sha256:a55744c98d70317c5e23db14866a8cc2b733f7324509e941fc96276f9f37801d"}, + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"}, + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1"}, ] [[package]] name = "nvidia-nvtx-cu12" -version = "12.1.105" +version = "12.4.127" description = "NVIDIA Tools Extension" optional = true python-versions = ">=3" files = [ - {file = "nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5"}, - {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485"}, ] [[package]] @@ -2200,6 +2235,113 @@ files = [ [package.extras] twisted = ["twisted"] +[[package]] +name = "propcache" +version = "0.2.0" +description = "Accelerated property cache" +optional = true +python-versions = ">=3.8" +files = [ + {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"}, + {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"}, + {file = "propcache-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:33ac8f098df0585c0b53009f039dfd913b38c1d2edafed0cedcc0c32a05aa110"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e48e8875e6c13909c800fa344cd54cc4b2b0db1d5f911f840458a500fde2c2"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388f3217649d6d59292b722d940d4d2e1e6a7003259eb835724092a1cca0203a"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:f571aea50ba5623c308aa146eb650eebf7dbe0fd8c5d946e28343cb3b5aad577"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3dfafb44f7bb35c0c06eda6b2ab4bfd58f02729e7c4045e179f9a861b07c9850"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3ebe9a75be7ab0b7da2464a77bb27febcb4fab46a34f9288f39d74833db7f61"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2f0d0f976985f85dfb5f3d685697ef769faa6b71993b46b295cdbbd6be8cc37"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a3dc1a4b165283bd865e8f8cb5f0c64c05001e0718ed06250d8cac9bec115b48"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9e0f07b42d2a50c7dd2d8675d50f7343d998c64008f1da5fef888396b7f84630"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e63e3e1e0271f374ed489ff5ee73d4b6e7c60710e1f76af5f0e1a6117cd26394"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:56bb5c98f058a41bb58eead194b4db8c05b088c93d94d5161728515bd52b052b"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7665f04d0c7f26ff8bb534e1c65068409bf4687aa2534faf7104d7182debb336"}, + {file = "propcache-0.2.0-cp310-cp310-win32.whl", hash = "sha256:7cf18abf9764746b9c8704774d8b06714bcb0a63641518a3a89c7f85cc02c2ad"}, + {file = "propcache-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:cfac69017ef97db2438efb854edf24f5a29fd09a536ff3a992b75990720cdc99"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:63f13bf09cc3336eb04a837490b8f332e0db41da66995c9fd1ba04552e516354"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608cce1da6f2672a56b24a015b42db4ac612ee709f3d29f27a00c943d9e851de"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:466c219deee4536fbc83c08d09115249db301550625c7fef1c5563a584c9bc87"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc2db02409338bf36590aa985a461b2c96fce91f8e7e0f14c50c5fcc4f229016"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a6ed8db0a556343d566a5c124ee483ae113acc9a557a807d439bcecc44e7dfbb"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91997d9cb4a325b60d4e3f20967f8eb08dfcb32b22554d5ef78e6fd1dda743a2"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c7dde9e533c0a49d802b4f3f218fa9ad0a1ce21f2c2eb80d5216565202acab4"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffcad6c564fe6b9b8916c1aefbb37a362deebf9394bd2974e9d84232e3e08504"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:97a58a28bcf63284e8b4d7b460cbee1edaab24634e82059c7b8c09e65284f178"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:945db8ee295d3af9dbdbb698cce9bbc5c59b5c3fe328bbc4387f59a8a35f998d"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39e104da444a34830751715f45ef9fc537475ba21b7f1f5b0f4d71a3b60d7fe2"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c5ecca8f9bab618340c8e848d340baf68bcd8ad90a8ecd7a4524a81c1764b3db"}, + {file = 
"propcache-0.2.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c436130cc779806bdf5d5fae0d848713105472b8566b75ff70048c47d3961c5b"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:191db28dc6dcd29d1a3e063c3be0b40688ed76434622c53a284e5427565bbd9b"}, + {file = "propcache-0.2.0-cp311-cp311-win32.whl", hash = "sha256:5f2564ec89058ee7c7989a7b719115bdfe2a2fb8e7a4543b8d1c0cc4cf6478c1"}, + {file = "propcache-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e2e54267980349b723cff366d1e29b138b9a60fa376664a157a342689553f71"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2ee7606193fb267be4b2e3b32714f2d58cad27217638db98a60f9efb5efeccc2"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:91ee8fc02ca52e24bcb77b234f22afc03288e1dafbb1f88fe24db308910c4ac7"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e900bad2a8456d00a113cad8c13343f3b1f327534e3589acc2219729237a2e8"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f52a68c21363c45297aca15561812d542f8fc683c85201df0bebe209e349f793"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e41d67757ff4fbc8ef2af99b338bfb955010444b92929e9e55a6d4dcc3c4f09"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a64e32f8bd94c105cc27f42d3b658902b5bcc947ece3c8fe7bc1b05982f60e89"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55346705687dbd7ef0d77883ab4f6fabc48232f587925bdaf95219bae072491e"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00181262b17e517df2cd85656fcd6b4e70946fe62cd625b9d74ac9977b64d8d9"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6994984550eaf25dd7fc7bd1b700ff45c894149341725bb4edc67f0ffa94efa4"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:56295eb1e5f3aecd516d91b00cfd8bf3a13991de5a479df9e27dd569ea23959c"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:439e76255daa0f8151d3cb325f6dd4a3e93043e6403e6491813bcaaaa8733887"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f6475a1b2ecb310c98c28d271a30df74f9dd436ee46d09236a6b750a7599ce57"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3444cdba6628accf384e349014084b1cacd866fbb88433cd9d279d90a54e0b23"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4a9d9b4d0a9b38d1c391bb4ad24aa65f306c6f01b512e10a8a34a2dc5675d348"}, + {file = "propcache-0.2.0-cp312-cp312-win32.whl", hash = "sha256:69d3a98eebae99a420d4b28756c8ce6ea5a29291baf2dc9ff9414b42676f61d5"}, + {file = "propcache-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ad9c9b99b05f163109466638bd30ada1722abb01bbb85c739c50b6dc11f92dc3"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ecddc221a077a8132cf7c747d5352a15ed763b674c0448d811f408bf803d9ad7"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e53cb83fdd61cbd67202735e6a6687a7b491c8742dfc39c9e01e80354956763"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92fe151145a990c22cbccf9ae15cae8ae9eddabfc949a219c9f667877e40853d"}, + {file = 
"propcache-0.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6a21ef516d36909931a2967621eecb256018aeb11fc48656e3257e73e2e247a"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f88a4095e913f98988f5b338c1d4d5d07dbb0b6bad19892fd447484e483ba6b"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a5b3bb545ead161be780ee85a2b54fdf7092815995661947812dde94a40f6fb"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67aeb72e0f482709991aa91345a831d0b707d16b0257e8ef88a2ad246a7280bf"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c997f8c44ec9b9b0bcbf2d422cc00a1d9b9c681f56efa6ca149a941e5560da2"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2a66df3d4992bc1d725b9aa803e8c5a66c010c65c741ad901e260ece77f58d2f"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:3ebbcf2a07621f29638799828b8d8668c421bfb94c6cb04269130d8de4fb7136"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1235c01ddaa80da8235741e80815ce381c5267f96cc49b1477fdcf8c047ef325"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3947483a381259c06921612550867b37d22e1df6d6d7e8361264b6d037595f44"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d5bed7f9805cc29c780f3aee05de3262ee7ce1f47083cfe9f77471e9d6777e83"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4a91d44379f45f5e540971d41e4626dacd7f01004826a18cb048e7da7e96544"}, + {file = "propcache-0.2.0-cp313-cp313-win32.whl", hash = "sha256:f902804113e032e2cdf8c71015651c97af6418363bea8d78dc0911d56c335032"}, + {file = "propcache-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f188cfcc64fb1266f4684206c9de0e80f54622c3f22a910cbd200478aeae61e"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:53d1bd3f979ed529f0805dd35ddaca330f80a9a6d90bc0121d2ff398f8ed8861"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:83928404adf8fb3d26793665633ea79b7361efa0287dfbd372a7e74311d51ee6"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:77a86c261679ea5f3896ec060be9dc8e365788248cc1e049632a1be682442063"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:218db2a3c297a3768c11a34812e63b3ac1c3234c3a086def9c0fee50d35add1f"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7735e82e3498c27bcb2d17cb65d62c14f1100b71723b68362872bca7d0913d90"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20a617c776f520c3875cf4511e0d1db847a076d720714ae35ffe0df3e440be68"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b69535c870670c9f9b14a75d28baa32221d06f6b6fa6f77a0a13c5a7b0a5b9"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4569158070180c3855e9c0791c56be3ceeb192defa2cdf6a3f39e54319e56b89"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:db47514ffdbd91ccdc7e6f8407aac4ee94cc871b15b577c1c324236b013ddd04"}, + {file = 
"propcache-0.2.0-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:2a60ad3e2553a74168d275a0ef35e8c0a965448ffbc3b300ab3a5bb9956c2162"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:662dd62358bdeaca0aee5761de8727cfd6861432e3bb828dc2a693aa0471a563"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:25a1f88b471b3bc911d18b935ecb7115dff3a192b6fef46f0bfaf71ff4f12418"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:f60f0ac7005b9f5a6091009b09a419ace1610e163fa5deaba5ce3484341840e7"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:74acd6e291f885678631b7ebc85d2d4aec458dd849b8c841b57ef04047833bed"}, + {file = "propcache-0.2.0-cp38-cp38-win32.whl", hash = "sha256:d9b6ddac6408194e934002a69bcaadbc88c10b5f38fb9307779d1c629181815d"}, + {file = "propcache-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:676135dcf3262c9c5081cc8f19ad55c8a64e3f7282a21266d05544450bffc3a5"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25c8d773a62ce0451b020c7b29a35cfbc05de8b291163a7a0f3b7904f27253e6"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:375a12d7556d462dc64d70475a9ee5982465fbb3d2b364f16b86ba9135793638"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1ec43d76b9677637a89d6ab86e1fef70d739217fefa208c65352ecf0282be957"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f45eec587dafd4b2d41ac189c2156461ebd0c1082d2fe7013571598abb8505d1"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc092ba439d91df90aea38168e11f75c655880c12782facf5cf9c00f3d42b562"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa1076244f54bb76e65e22cb6910365779d5c3d71d1f18b275f1dfc7b0d71b4d"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:682a7c79a2fbf40f5dbb1eb6bfe2cd865376deeac65acf9beb607505dced9e12"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e40876731f99b6f3c897b66b803c9e1c07a989b366c6b5b475fafd1f7ba3fb8"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:363ea8cd3c5cb6679f1c2f5f1f9669587361c062e4899fce56758efa928728f8"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:140fbf08ab3588b3468932974a9331aff43c0ab8a2ec2c608b6d7d1756dbb6cb"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e70fac33e8b4ac63dfc4c956fd7d85a0b1139adcfc0d964ce288b7c527537fea"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b33d7a286c0dc1a15f5fc864cc48ae92a846df287ceac2dd499926c3801054a6"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f6d5749fdd33d90e34c2efb174c7e236829147a2713334d708746e94c4bde40d"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22aa8f2272d81d9317ff5756bb108021a056805ce63dd3630e27d042c8092798"}, + {file = "propcache-0.2.0-cp39-cp39-win32.whl", hash = "sha256:73e4b40ea0eda421b115248d7e79b59214411109a5bc47d0d48e4c73e3b8fcf9"}, + {file = "propcache-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:9517d5e9e0731957468c29dbfd0f976736a0e55afaea843726e887f36fe017df"}, + {file = "propcache-0.2.0-py3-none-any.whl", hash = 
"sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036"}, + {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"}, +] + [[package]] name = "protobuf" version = "4.25.5" @@ -2222,32 +2364,33 @@ files = [ [[package]] name = "psutil" -version = "6.0.0" +version = "6.1.0" description = "Cross-platform lib for process and system monitoring in Python." optional = true python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "psutil-6.0.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a021da3e881cd935e64a3d0a20983bda0bb4cf80e4f74fa9bfcb1bc5785360c6"}, - {file = "psutil-6.0.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:1287c2b95f1c0a364d23bc6f2ea2365a8d4d9b726a3be7294296ff7ba97c17f0"}, - {file = "psutil-6.0.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:a9a3dbfb4de4f18174528d87cc352d1f788b7496991cca33c6996f40c9e3c92c"}, - {file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6ec7588fb3ddaec7344a825afe298db83fe01bfaaab39155fa84cf1c0d6b13c3"}, - {file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:1e7c870afcb7d91fdea2b37c24aeb08f98b6d67257a5cb0a8bc3ac68d0f1a68c"}, - {file = "psutil-6.0.0-cp27-none-win32.whl", hash = "sha256:02b69001f44cc73c1c5279d02b30a817e339ceb258ad75997325e0e6169d8b35"}, - {file = "psutil-6.0.0-cp27-none-win_amd64.whl", hash = "sha256:21f1fb635deccd510f69f485b87433460a603919b45e2a324ad65b0cc74f8fb1"}, - {file = "psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0"}, - {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0"}, - {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd"}, - {file = "psutil-6.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2e8d0054fc88153ca0544f5c4d554d42e33df2e009c4ff42284ac9ebdef4132"}, - {file = "psutil-6.0.0-cp36-cp36m-win32.whl", hash = "sha256:fc8c9510cde0146432bbdb433322861ee8c3efbf8589865c8bf8d21cb30c4d14"}, - {file = "psutil-6.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:34859b8d8f423b86e4385ff3665d3f4d94be3cdf48221fbe476e883514fdb71c"}, - {file = "psutil-6.0.0-cp37-abi3-win32.whl", hash = "sha256:a495580d6bae27291324fe60cea0b5a7c23fa36a7cd35035a16d93bdcf076b9d"}, - {file = "psutil-6.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:33ea5e1c975250a720b3a6609c490db40dae5d83a4eb315170c4fe0d8b1f34b3"}, - {file = "psutil-6.0.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ffe7fc9b6b36beadc8c322f84e1caff51e8703b88eee1da46d1e3a6ae11b4fd0"}, - {file = "psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2"}, + {file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"}, + {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"}, + {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:000d1d1ebd634b4efb383f4034437384e44a6d455260aaee2eca1e9c1b55f047"}, + {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_i686.whl", hash = 
"sha256:5cd2bcdc75b452ba2e10f0e8ecc0b57b827dd5d7aaffbc6821b2a9a242823a76"}, + {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:045f00a43c737f960d273a83973b2511430d61f283a44c96bf13a6e829ba8fdc"}, + {file = "psutil-6.1.0-cp27-none-win32.whl", hash = "sha256:9118f27452b70bb1d9ab3198c1f626c2499384935aaf55388211ad982611407e"}, + {file = "psutil-6.1.0-cp27-none-win_amd64.whl", hash = "sha256:a8506f6119cff7015678e2bce904a4da21025cc70ad283a53b099e7620061d85"}, + {file = "psutil-6.1.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6e2dcd475ce8b80522e51d923d10c7871e45f20918e027ab682f94f1c6351688"}, + {file = "psutil-6.1.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0895b8414afafc526712c498bd9de2b063deaac4021a3b3c34566283464aff8e"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dcbfce5d89f1d1f2546a2090f4fcf87c7f669d1d90aacb7d7582addece9fb38"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:498c6979f9c6637ebc3a73b3f87f9eb1ec24e1ce53a7c5173b8508981614a90b"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d905186d647b16755a800e7263d43df08b790d709d575105d419f8b6ef65423a"}, + {file = "psutil-6.1.0-cp36-cp36m-win32.whl", hash = "sha256:6d3fbbc8d23fcdcb500d2c9f94e07b1342df8ed71b948a2649b5cb060a7c94ca"}, + {file = "psutil-6.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1209036fbd0421afde505a4879dee3b2fd7b1e14fee81c0069807adcbbcca747"}, + {file = "psutil-6.1.0-cp37-abi3-win32.whl", hash = "sha256:1ad45a1f5d0b608253b11508f80940985d1d0c8f6111b5cb637533a0e6ddc13e"}, + {file = "psutil-6.1.0-cp37-abi3-win_amd64.whl", hash = "sha256:a8fb3752b491d246034fa4d279ff076501588ce8cbcdbb62c32fd7a377d996be"}, + {file = "psutil-6.1.0.tar.gz", hash = "sha256:353815f59a7f64cdaca1c0307ee13558a0512f6db064e92fe833784f08539c7a"}, ] [package.extras] -test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "wheel"] +test = ["pytest", "pytest-xdist", "setuptools"] [[package]] name = "py-cpuinfo" @@ -2696,18 +2839,19 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "rich" -version = "13.8.1" +version = "13.9.3" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false -python-versions = ">=3.7.0" +python-versions = ">=3.8.0" files = [ - {file = "rich-13.8.1-py3-none-any.whl", hash = "sha256:1760a3c0848469b97b558fc61c85233e3dafb69c7a071b4d60c38099d3cd4c06"}, - {file = "rich-13.8.1.tar.gz", hash = "sha256:8260cda28e3db6bf04d2d1ef4dbc03ba80a824c88b0e7668a0f23126a424844a"}, + {file = "rich-13.9.3-py3-none-any.whl", hash = "sha256:9836f5096eb2172c9e77df411c1b009bace4193d6a481d534fea75ebba758283"}, + {file = "rich-13.9.3.tar.gz", hash = "sha256:bc1e01b899537598cf02579d2b9f4a415104d3fc439313a7a2c165d76557a08e"}, ] [package.dependencies] markdown-it-py = ">=2.2.0" pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.11\""} [package.extras] jupyter = ["ipywidgets (>=7.5.1,<9)"] @@ -3062,13 +3206,13 @@ files = [ [[package]] name = "setuptools" -version = "75.1.0" +version = "75.2.0" description = "Easily download, build, install, 
upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-75.1.0-py3-none-any.whl", hash = "sha256:35ab7fd3bcd95e6b7fd704e4a1539513edad446c097797f2985e0e4b960772f2"}, - {file = "setuptools-75.1.0.tar.gz", hash = "sha256:d59a21b17a275fb872a9c3dae73963160ae079f1049ed956880cd7c09b120538"}, + {file = "setuptools-75.2.0-py3-none-any.whl", hash = "sha256:a7fcb66f68b4d9e8e66b42f9876150a3371558f98fa32222ffaa5bced76406f8"}, + {file = "setuptools-75.2.0.tar.gz", hash = "sha256:753bb6ebf1f465a1912e19ed1d41f403a79173a9acf66a42e7e6aec45c3c16ec"}, ] [package.extras] @@ -3093,13 +3237,13 @@ files = [ [[package]] name = "sympy" -version = "1.13.3" +version = "1.13.1" description = "Computer algebra system (CAS) in Python" optional = true python-versions = ">=3.8" files = [ - {file = "sympy-1.13.3-py3-none-any.whl", hash = "sha256:54612cf55a62755ee71824ce692986f23c88ffa77207b30c1368eda4a7060f73"}, - {file = "sympy-1.13.3.tar.gz", hash = "sha256:b27fd2c6530e0ab39e275fc9b683895367e51d5da91baa8d3d64db2565fec4d9"}, + {file = "sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8"}, + {file = "sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f"}, ] [package.dependencies] @@ -3121,111 +3265,111 @@ files = [ [[package]] name = "tokenizers" -version = "0.20.0" +version = "0.20.1" description = "" optional = false python-versions = ">=3.7" files = [ - {file = "tokenizers-0.20.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:6cff5c5e37c41bc5faa519d6f3df0679e4b37da54ea1f42121719c5e2b4905c0"}, - {file = "tokenizers-0.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:62a56bf75c27443432456f4ca5ca055befa95e25be8a28141cc495cac8ae4d6d"}, - {file = "tokenizers-0.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68cc7de6a63f09c4a86909c2597b995aa66e19df852a23aea894929c74369929"}, - {file = "tokenizers-0.20.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:053c37ecee482cc958fdee53af3c6534286a86f5d35aac476f7c246830e53ae5"}, - {file = "tokenizers-0.20.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3d7074aaabc151a6363fa03db5493fc95b423b2a1874456783989e96d541c7b6"}, - {file = "tokenizers-0.20.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a11435780f2acd89e8fefe5e81cecf01776f6edb9b3ac95bcb76baee76b30b90"}, - {file = "tokenizers-0.20.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9a81cd2712973b007d84268d45fc3f6f90a79c31dfe7f1925e6732f8d2959987"}, - {file = "tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7dfd796ab9d909f76fb93080e1c7c8309f196ecb316eb130718cd5e34231c69"}, - {file = "tokenizers-0.20.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:8029ad2aa8cb00605c9374566034c1cc1b15130713e0eb5afcef6cface8255c9"}, - {file = "tokenizers-0.20.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ca4d54260ebe97d59dfa9a30baa20d0c4dd9137d99a8801700055c561145c24e"}, - {file = "tokenizers-0.20.0-cp310-none-win32.whl", hash = "sha256:95ee16b57cec11b86a7940174ec5197d506439b0f415ab3859f254b1dffe9df0"}, - {file = "tokenizers-0.20.0-cp310-none-win_amd64.whl", hash = "sha256:0a61a11e93eeadbf02aea082ffc75241c4198e0608bbbac4f65a9026851dcf37"}, - {file = "tokenizers-0.20.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = 
"sha256:6636b798b3c4d6c9b1af1a918bd07c867808e5a21c64324e95318a237e6366c3"}, - {file = "tokenizers-0.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5ec603e42eaf499ffd58b9258162add948717cf21372458132f14e13a6bc7172"}, - {file = "tokenizers-0.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cce124264903a8ea6f8f48e1cc7669e5ef638c18bd4ab0a88769d5f92debdf7f"}, - {file = "tokenizers-0.20.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07bbeba0231cf8de07aa6b9e33e9779ff103d47042eeeb859a8c432e3292fb98"}, - {file = "tokenizers-0.20.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:06c0ca8397b35d38b83a44a9c6929790c1692957d88541df061cb34d82ebbf08"}, - {file = "tokenizers-0.20.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ca6557ac3b83d912dfbb1f70ab56bd4b0594043916688e906ede09f42e192401"}, - {file = "tokenizers-0.20.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a5ad94c9e80ac6098328bee2e3264dbced4c6faa34429994d473f795ec58ef4"}, - {file = "tokenizers-0.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b5c7f906ee6bec30a9dc20268a8b80f3b9584de1c9f051671cb057dc6ce28f6"}, - {file = "tokenizers-0.20.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:31e087e9ee1b8f075b002bfee257e858dc695f955b43903e1bb4aa9f170e37fe"}, - {file = "tokenizers-0.20.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c3124fb6f3346cb3d8d775375d3b429bf4dcfc24f739822702009d20a4297990"}, - {file = "tokenizers-0.20.0-cp311-none-win32.whl", hash = "sha256:a4bb8b40ba9eefa621fdcabf04a74aa6038ae3be0c614c6458bd91a4697a452f"}, - {file = "tokenizers-0.20.0-cp311-none-win_amd64.whl", hash = "sha256:2b709d371f1fe60a28ef0c5c67815952d455ca7f34dbe7197eaaed3cc54b658e"}, - {file = "tokenizers-0.20.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:15c81a17d0d66f4987c6ca16f4bea7ec253b8c7ed1bb00fdc5d038b1bb56e714"}, - {file = "tokenizers-0.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6a531cdf1fb6dc41c984c785a3b299cb0586de0b35683842a3afbb1e5207f910"}, - {file = "tokenizers-0.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06caabeb4587f8404e0cd9d40f458e9cba3e815c8155a38e579a74ff3e2a4301"}, - {file = "tokenizers-0.20.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8768f964f23f5b9f50546c0369c75ab3262de926983888bbe8b98be05392a79c"}, - {file = "tokenizers-0.20.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:626403860152c816f97b649fd279bd622c3d417678c93b4b1a8909b6380b69a8"}, - {file = "tokenizers-0.20.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c1b88fa9e5ff062326f4bf82681da5a96fca7104d921a6bd7b1e6fcf224af26"}, - {file = "tokenizers-0.20.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d7e559436a07dc547f22ce1101f26d8b2fad387e28ec8e7e1e3b11695d681d8"}, - {file = "tokenizers-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e48afb75e50449848964e4a67b0da01261dd3aa8df8daecf10db8fd7f5b076eb"}, - {file = "tokenizers-0.20.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:baf5d0e1ff44710a95eefc196dd87666ffc609fd447c5e5b68272a7c3d342a1d"}, - {file = "tokenizers-0.20.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e5e56df0e8ed23ba60ae3848c3f069a0710c4b197218fe4f89e27eba38510768"}, - {file = "tokenizers-0.20.0-cp312-none-win32.whl", hash = 
"sha256:ec53e5ecc142a82432f9c6c677dbbe5a2bfee92b8abf409a9ecb0d425ee0ce75"}, - {file = "tokenizers-0.20.0-cp312-none-win_amd64.whl", hash = "sha256:f18661ece72e39c0dfaa174d6223248a15b457dbd4b0fc07809b8e6d3ca1a234"}, - {file = "tokenizers-0.20.0-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:f7065b1084d8d1a03dc89d9aad69bcbc8415d4bc123c367063eb32958cd85054"}, - {file = "tokenizers-0.20.0-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:e5d4069e4714e3f7ba0a4d3d44f9d84a432cd4e4aa85c3d7dd1f51440f12e4a1"}, - {file = "tokenizers-0.20.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:799b808529e54b7e1a36350bda2aeb470e8390e484d3e98c10395cee61d4e3c6"}, - {file = "tokenizers-0.20.0-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f9baa027cc8a281ad5f7725a93c204d7a46986f88edbe8ef7357f40a23fb9c7"}, - {file = "tokenizers-0.20.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:010ec7f3f7a96adc4c2a34a3ada41fa14b4b936b5628b4ff7b33791258646c6b"}, - {file = "tokenizers-0.20.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98d88f06155335b14fd78e32ee28ca5b2eb30fced4614e06eb14ae5f7fba24ed"}, - {file = "tokenizers-0.20.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e13eb000ef540c2280758d1b9cfa5fe424b0424ae4458f440e6340a4f18b2638"}, - {file = "tokenizers-0.20.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fab3cf066ff426f7e6d70435dc28a9ff01b2747be83810e397cba106f39430b0"}, - {file = "tokenizers-0.20.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:39fa3761b30a89368f322e5daf4130dce8495b79ad831f370449cdacfb0c0d37"}, - {file = "tokenizers-0.20.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c8da0fba4d179ddf2607821575998df3c294aa59aa8df5a6646dc64bc7352bce"}, - {file = "tokenizers-0.20.0-cp37-none-win32.whl", hash = "sha256:fada996d6da8cf213f6e3c91c12297ad4f6cdf7a85c2fadcd05ec32fa6846fcd"}, - {file = "tokenizers-0.20.0-cp37-none-win_amd64.whl", hash = "sha256:7d29aad702279e0760c265fcae832e89349078e3418dd329732d4503259fd6bd"}, - {file = "tokenizers-0.20.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:099c68207f3ef0227ecb6f80ab98ea74de559f7b124adc7b17778af0250ee90a"}, - {file = "tokenizers-0.20.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:68012d8a8cddb2eab3880870d7e2086cb359c7f7a2b03f5795044f5abff4e850"}, - {file = "tokenizers-0.20.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9253bdd209c6aee168deca7d0e780581bf303e0058f268f9bb06859379de19b6"}, - {file = "tokenizers-0.20.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8f868600ddbcb0545905ed075eb7218a0756bf6c09dae7528ea2f8436ebd2c93"}, - {file = "tokenizers-0.20.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9a9643d9c8c5f99b6aba43fd10034f77cc6c22c31f496d2f0ee183047d948fa0"}, - {file = "tokenizers-0.20.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c375c6a889aeab44734028bc65cc070acf93ccb0f9368be42b67a98e1063d3f6"}, - {file = "tokenizers-0.20.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e359f852328e254f070bbd09a19a568421d23388f04aad9f2fb7da7704c7228d"}, - {file = "tokenizers-0.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d98b01a309d4387f3b1c1dd68a8b8136af50376cf146c1b7e8d8ead217a5be4b"}, - {file = "tokenizers-0.20.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:459f7537119554c2899067dec1ac74a00d02beef6558f4ee2e99513bf6d568af"}, - {file = "tokenizers-0.20.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:392b87ec89452628c045c9f2a88bc2a827f4c79e7d84bc3b72752b74c2581f70"}, - {file = "tokenizers-0.20.0-cp38-none-win32.whl", hash = "sha256:55a393f893d2ed4dd95a1553c2e42d4d4086878266f437b03590d3f81984c4fe"}, - {file = "tokenizers-0.20.0-cp38-none-win_amd64.whl", hash = "sha256:30ffe33c5c2f2aab8e9a3340d0110dd9f7ace7eec7362e20a697802306bd8068"}, - {file = "tokenizers-0.20.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:aa2d4a6fed2a7e3f860c7fc9d48764bb30f2649d83915d66150d6340e06742b8"}, - {file = "tokenizers-0.20.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b5ef0f814084a897e9071fc4a868595f018c5c92889197bdc4bf19018769b148"}, - {file = "tokenizers-0.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc1e1b791e8c3bf4c4f265f180dadaff1c957bf27129e16fdd5e5d43c2d3762c"}, - {file = "tokenizers-0.20.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b69e55e481459c07885263743a0d3c18d52db19bae8226a19bcca4aaa213fff"}, - {file = "tokenizers-0.20.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4806b4d82e27a2512bc23057b2986bc8b85824914286975b84d8105ff40d03d9"}, - {file = "tokenizers-0.20.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9859e9ef13adf5a473ccab39d31bff9c550606ae3c784bf772b40f615742a24f"}, - {file = "tokenizers-0.20.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef703efedf4c20488a8eb17637b55973745b27997ff87bad88ed499b397d1144"}, - {file = "tokenizers-0.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6eec0061bab94b1841ab87d10831fdf1b48ebaed60e6d66d66dbe1d873f92bf5"}, - {file = "tokenizers-0.20.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:980f3d0d7e73f845b69087f29a63c11c7eb924c4ad6b358da60f3db4cf24bdb4"}, - {file = "tokenizers-0.20.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7c157550a2f3851b29d7fdc9dc059fcf81ff0c0fc49a1e5173a89d533ed043fa"}, - {file = "tokenizers-0.20.0-cp39-none-win32.whl", hash = "sha256:8a3d2f4d08608ec4f9895ec25b4b36a97f05812543190a5f2c3cd19e8f041e5a"}, - {file = "tokenizers-0.20.0-cp39-none-win_amd64.whl", hash = "sha256:d90188d12afd0c75e537f9a1d92f9c7375650188ee4f48fdc76f9e38afbd2251"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:d68e15f1815357b059ec266062340c343ea7f98f7f330602df81ffa3474b6122"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:23f9ecec637b9bc80da5f703808d29ed5329e56b5aa8d791d1088014f48afadc"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f830b318ee599e3d0665b3e325f85bc75ee2d2ca6285f52e439dc22b64691580"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3dc750def789cb1de1b5a37657919545e1d9ffa667658b3fa9cb7862407a1b8"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e26e6c755ae884c2ea6135cd215bdd0fccafe4ee62405014b8c3cd19954e3ab9"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:a1158c7174f427182e08baa2a8ded2940f2b4a3e94969a85cc9cfd16004cbcea"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:6324826287a3fc198898d3dcf758fe4a8479e42d6039f4c59e2cedd3cf92f64e"}, - 
{file = "tokenizers-0.20.0-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7d8653149405bb0c16feaf9cfee327fdb6aaef9dc2998349fec686f35e81c4e2"}, - {file = "tokenizers-0.20.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8a2dc1e402a155e97309287ca085c80eb1b7fab8ae91527d3b729181639fa51"}, - {file = "tokenizers-0.20.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07bef67b20aa6e5f7868c42c7c5eae4d24f856274a464ae62e47a0f2cccec3da"}, - {file = "tokenizers-0.20.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da06e397182ff53789c506c7833220c192952c57e1581a53f503d8d953e2d67e"}, - {file = "tokenizers-0.20.0-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:302f7e11a14814028b7fc88c45a41f1bbe9b5b35fd76d6869558d1d1809baa43"}, - {file = "tokenizers-0.20.0-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:055ec46e807b875589dfbe3d9259f9a6ee43394fb553b03b3d1e9541662dbf25"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e3144b8acebfa6ae062e8f45f7ed52e4b50fb6c62f93afc8871b525ab9fdcab3"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:b52aa3fd14b2a07588c00a19f66511cff5cca8f7266ca3edcdd17f3512ad159f"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b8cf52779ffc5d4d63a0170fbeb512372bad0dd014ce92bbb9149756c831124"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:983a45dd11a876124378dae71d6d9761822199b68a4c73f32873d8cdaf326a5b"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df6b819c9a19831ebec581e71a7686a54ab45d90faf3842269a10c11d746de0c"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e738cfd80795fcafcef89c5731c84b05638a4ab3f412f97d5ed7765466576eb1"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:c8842c7be2fadb9c9edcee233b1b7fe7ade406c99b0973f07439985c1c1d0683"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e47a82355511c373a4a430c4909dc1e518e00031207b1fec536c49127388886b"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:9afbf359004551179a5db19424180c81276682773cff2c5d002f6eaaffe17230"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a07eaa8799a92e6af6f472c21a75bf71575de2af3c0284120b7a09297c0de2f3"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0994b2e5fc53a301071806bc4303e4bc3bdc3f490e92a21338146a36746b0872"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b6466e0355b603d10e3cc3d282d350b646341b601e50969464a54939f9848d0"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:1e86594c2a433cb1ea09cfbe596454448c566e57ee8905bd557e489d93e89986"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:3e14cdef1efa96ecead6ea64a891828432c3ebba128bdc0596e3059fea104ef3"}, - {file = "tokenizers-0.20.0.tar.gz", hash = "sha256:39d7acc43f564c274085cafcd1dae9d36f332456de1a31970296a6b8da4eac8d"}, + {file = "tokenizers-0.20.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = 
"sha256:439261da7c0a5c88bda97acb284d49fbdaf67e9d3b623c0bfd107512d22787a9"}, + {file = "tokenizers-0.20.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:03dae629d99068b1ea5416d50de0fea13008f04129cc79af77a2a6392792d93c"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b61f561f329ffe4b28367798b89d60c4abf3f815d37413b6352bc6412a359867"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ec870fce1ee5248a10be69f7a8408a234d6f2109f8ea827b4f7ecdbf08c9fd15"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d388d1ea8b7447da784e32e3b86a75cce55887e3b22b31c19d0b186b1c677800"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:299c85c1d21135bc01542237979bf25c32efa0d66595dd0069ae259b97fb2dbe"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e96f6c14c9752bb82145636b614d5a78e9cde95edfbe0a85dad0dd5ddd6ec95c"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc9e95ad49c932b80abfbfeaf63b155761e695ad9f8a58c52a47d962d76e310f"}, + {file = "tokenizers-0.20.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f22dee205329a636148c325921c73cf3e412e87d31f4d9c3153b302a0200057b"}, + {file = "tokenizers-0.20.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2ffd9a8895575ac636d44500c66dffaef133823b6b25067604fa73bbc5ec09d"}, + {file = "tokenizers-0.20.1-cp310-none-win32.whl", hash = "sha256:2847843c53f445e0f19ea842a4e48b89dd0db4e62ba6e1e47a2749d6ec11f50d"}, + {file = "tokenizers-0.20.1-cp310-none-win_amd64.whl", hash = "sha256:f9aa93eacd865f2798b9e62f7ce4533cfff4f5fbd50c02926a78e81c74e432cd"}, + {file = "tokenizers-0.20.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4a717dcb08f2dabbf27ae4b6b20cbbb2ad7ed78ce05a829fae100ff4b3c7ff15"}, + {file = "tokenizers-0.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3f84dad1ff1863c648d80628b1b55353d16303431283e4efbb6ab1af56a75832"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:929c8f3afa16a5130a81ab5079c589226273ec618949cce79b46d96e59a84f61"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d10766473954397e2d370f215ebed1cc46dcf6fd3906a2a116aa1d6219bfedc3"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9300fac73ddc7e4b0330acbdda4efaabf74929a4a61e119a32a181f534a11b47"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ecaf7b0e39caeb1aa6dd6e0975c405716c82c1312b55ac4f716ef563a906969"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5170be9ec942f3d1d317817ced8d749b3e1202670865e4fd465e35d8c259de83"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef3f1ae08fa9aea5891cbd69df29913e11d3841798e0bfb1ff78b78e4e7ea0a4"}, + {file = "tokenizers-0.20.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ee86d4095d3542d73579e953c2e5e07d9321af2ffea6ecc097d16d538a2dea16"}, + {file = "tokenizers-0.20.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:86dcd08da163912e17b27bbaba5efdc71b4fbffb841530fdb74c5707f3c49216"}, + {file = "tokenizers-0.20.1-cp311-none-win32.whl", hash = 
"sha256:9af2dc4ee97d037bc6b05fa4429ddc87532c706316c5e11ce2f0596dfcfa77af"}, + {file = "tokenizers-0.20.1-cp311-none-win_amd64.whl", hash = "sha256:899152a78b095559c287b4c6d0099469573bb2055347bb8154db106651296f39"}, + {file = "tokenizers-0.20.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:407ab666b38e02228fa785e81f7cf79ef929f104bcccf68a64525a54a93ceac9"}, + {file = "tokenizers-0.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2f13a2d16032ebc8bd812eb8099b035ac65887d8f0c207261472803b9633cf3e"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e98eee4dca22849fbb56a80acaa899eec5b72055d79637dd6aa15d5e4b8628c9"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47c1bcdd61e61136087459cb9e0b069ff23b5568b008265e5cbc927eae3387ce"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:128c1110e950534426e2274837fc06b118ab5f2fa61c3436e60e0aada0ccfd67"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2e2d47a819d2954f2c1cd0ad51bb58ffac6f53a872d5d82d65d79bf76b9896d"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bdd67a0e3503a9a7cf8bc5a4a49cdde5fa5bada09a51e4c7e1c73900297539bd"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689b93d2e26d04da337ac407acec8b5d081d8d135e3e5066a88edd5bdb5aff89"}, + {file = "tokenizers-0.20.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0c6a796ddcd9a19ad13cf146997cd5895a421fe6aec8fd970d69f9117bddb45c"}, + {file = "tokenizers-0.20.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3ea919687aa7001a8ff1ba36ac64f165c4e89035f57998fa6cedcfd877be619d"}, + {file = "tokenizers-0.20.1-cp312-none-win32.whl", hash = "sha256:6d3ac5c1f48358ffe20086bf065e843c0d0a9fce0d7f0f45d5f2f9fba3609ca5"}, + {file = "tokenizers-0.20.1-cp312-none-win_amd64.whl", hash = "sha256:b0874481aea54a178f2bccc45aa2d0c99cd3f79143a0948af6a9a21dcc49173b"}, + {file = "tokenizers-0.20.1-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:96af92e833bd44760fb17f23f402e07a66339c1dcbe17d79a9b55bb0cc4f038e"}, + {file = "tokenizers-0.20.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:65f34e5b731a262dfa562820818533c38ce32a45864437f3d9c82f26c139ca7f"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17f98fccb5c12ab1ce1f471731a9cd86df5d4bd2cf2880c5a66b229802d96145"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b8c0fc3542cf9370bf92c932eb71bdeb33d2d4aeeb4126d9fd567b60bd04cb30"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b39356df4575d37f9b187bb623aab5abb7b62c8cb702867a1768002f814800c"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfdad27b0e50544f6b838895a373db6114b85112ba5c0cefadffa78d6daae563"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:094663dd0e85ee2e573126918747bdb40044a848fde388efb5b09d57bc74c680"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14e4cf033a2aa207d7ac790e91adca598b679999710a632c4a494aab0fc3a1b2"}, + {file = "tokenizers-0.20.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = 
"sha256:9310951c92c9fb91660de0c19a923c432f110dbfad1a2d429fbc44fa956bf64f"}, + {file = "tokenizers-0.20.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:05e41e302c315bd2ed86c02e917bf03a6cf7d2f652c9cee1a0eb0d0f1ca0d32c"}, + {file = "tokenizers-0.20.1-cp37-none-win32.whl", hash = "sha256:212231ab7dfcdc879baf4892ca87c726259fa7c887e1688e3f3cead384d8c305"}, + {file = "tokenizers-0.20.1-cp37-none-win_amd64.whl", hash = "sha256:896195eb9dfdc85c8c052e29947169c1fcbe75a254c4b5792cdbd451587bce85"}, + {file = "tokenizers-0.20.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:741fb22788482d09d68e73ece1495cfc6d9b29a06c37b3df90564a9cfa688e6d"}, + {file = "tokenizers-0.20.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:10be14ebd8082086a342d969e17fc2d6edc856c59dbdbddd25f158fa40eaf043"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:514cf279b22fa1ae0bc08e143458c74ad3b56cd078b319464959685a35c53d5e"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a647c5b7cb896d6430cf3e01b4e9a2d77f719c84cefcef825d404830c2071da2"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7cdf379219e1e1dd432091058dab325a2e6235ebb23e0aec8d0508567c90cd01"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ba72260449e16c4c2f6f3252823b059fbf2d31b32617e582003f2b18b415c39"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:910b96ed87316e4277b23c7bcaf667ce849c7cc379a453fa179e7e09290eeb25"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e53975a6694428a0586534cc1354b2408d4e010a3103117f617cbb550299797c"}, + {file = "tokenizers-0.20.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:07c4b7be58da142b0730cc4e5fd66bb7bf6f57f4986ddda73833cd39efef8a01"}, + {file = "tokenizers-0.20.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b605c540753e62199bf15cf69c333e934077ef2350262af2ccada46026f83d1c"}, + {file = "tokenizers-0.20.1-cp38-none-win32.whl", hash = "sha256:88b3bc76ab4db1ab95ead623d49c95205411e26302cf9f74203e762ac7e85685"}, + {file = "tokenizers-0.20.1-cp38-none-win_amd64.whl", hash = "sha256:d412a74cf5b3f68a90c615611a5aa4478bb303d1c65961d22db45001df68afcb"}, + {file = "tokenizers-0.20.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a25dcb2f41a0a6aac31999e6c96a75e9152fa0127af8ece46c2f784f23b8197a"}, + {file = "tokenizers-0.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a12c3cebb8c92e9c35a23ab10d3852aee522f385c28d0b4fe48c0b7527d59762"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02e18da58cf115b7c40de973609c35bde95856012ba42a41ee919c77935af251"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f326a1ac51ae909b9760e34671c26cd0dfe15662f447302a9d5bb2d872bab8ab"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b4872647ea6f25224e2833b044b0b19084e39400e8ead3cfe751238b0802140"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce6238a3311bb8e4c15b12600927d35c267b92a52c881ef5717a900ca14793f7"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57b7a8880b208866508b06ce365dc631e7a2472a3faa24daa430d046fb56c885"}, + {file = 
"tokenizers-0.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a908c69c2897a68f412aa05ba38bfa87a02980df70f5a72fa8490479308b1f2d"}, + {file = "tokenizers-0.20.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:da1001aa46f4490099c82e2facc4fbc06a6a32bf7de3918ba798010954b775e0"}, + {file = "tokenizers-0.20.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:42c097390e2f0ed0a5c5d569e6669dd4e9fff7b31c6a5ce6e9c66a61687197de"}, + {file = "tokenizers-0.20.1-cp39-none-win32.whl", hash = "sha256:3d4d218573a3d8b121a1f8c801029d70444ffb6d8f129d4cca1c7b672ee4a24c"}, + {file = "tokenizers-0.20.1-cp39-none-win_amd64.whl", hash = "sha256:37d1e6f616c84fceefa7c6484a01df05caf1e207669121c66213cb5b2911d653"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48689da7a395df41114f516208d6550e3e905e1239cc5ad386686d9358e9cef0"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:712f90ea33f9bd2586b4a90d697c26d56d0a22fd3c91104c5858c4b5b6489a79"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:359eceb6a620c965988fc559cebc0a98db26713758ec4df43fb76d41486a8ed5"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d3caf244ce89d24c87545aafc3448be15870096e796c703a0d68547187192e1"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03b03cf8b9a32254b1bf8a305fb95c6daf1baae0c1f93b27f2b08c9759f41dee"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:218e5a3561561ea0f0ef1559c6d95b825308dbec23fb55b70b92589e7ff2e1e8"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f40df5e0294a95131cc5f0e0eb91fe86d88837abfbee46b9b3610b09860195a7"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:08aaa0d72bb65058e8c4b0455f61b840b156c557e2aca57627056624c3a93976"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:998700177b45f70afeb206ad22c08d9e5f3a80639dae1032bf41e8cbc4dada4b"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62f7fbd3c2c38b179556d879edae442b45f68312019c3a6013e56c3947a4e648"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31e87fca4f6bbf5cc67481b562147fe932f73d5602734de7dd18a8f2eee9c6dd"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:956f21d359ae29dd51ca5726d2c9a44ffafa041c623f5aa33749da87cfa809b9"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:1fbbaf17a393c78d8aedb6a334097c91cb4119a9ced4764ab8cfdc8d254dc9f9"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:ebe63e31f9c1a970c53866d814e35ec2ec26fda03097c486f82f3891cee60830"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:81970b80b8ac126910295f8aab2d7ef962009ea39e0d86d304769493f69aaa1e"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:130e35e76f9337ed6c31be386e75d4925ea807055acf18ca1a9b0eec03d8fe23"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:cd28a8614f5c82a54ab2463554e84ad79526c5184cf4573bbac2efbbbcead457"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9041ee665d0fa7f5c4ccf0f81f5e6b7087f797f85b143c094126fc2611fec9d0"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:62eb9daea2a2c06bcd8113a5824af8ef8ee7405d3a71123ba4d52c79bb3d9f1a"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f861889707b54a9ab1204030b65fd6c22bdd4a95205deec7994dc22a8baa2ea4"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:89d5c337d74ea6e5e7dc8af124cf177be843bbb9ca6e58c01f75ea103c12c8a9"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:0b7f515c83397e73292accdbbbedc62264e070bae9682f06061e2ddce67cacaf"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e0305fc1ec6b1e5052d30d9c1d5c807081a7bd0cae46a33d03117082e91908c"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5dc611e6ac0fa00a41de19c3bf6391a05ea201d2d22b757d63f5491ec0e67faa"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5ffe0d7f7bfcfa3b2585776ecf11da2e01c317027c8573c78ebcb8985279e23"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e7edb8ec12c100d5458d15b1e47c0eb30ad606a05641f19af7563bc3d1608c14"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:de291633fb9303555793cc544d4a86e858da529b7d0b752bcaf721ae1d74b2c9"}, + {file = "tokenizers-0.20.1.tar.gz", hash = "sha256:84edcc7cdeeee45ceedb65d518fffb77aec69311c9c8e30f77ad84da3025f002"}, ] [package.dependencies] @@ -3238,42 +3382,39 @@ testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"] [[package]] name = "tomli" -version = "2.0.1" +version = "2.0.2" description = "A lil' TOML parser" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, + {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"}, + {file = "tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"}, ] [[package]] name = "torch" -version = "2.4.1" +version = "2.5.0" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" optional = true python-versions = ">=3.8.0" files = [ - {file = "torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:362f82e23a4cd46341daabb76fba08f04cd646df9bfaf5da50af97cb60ca4971"}, - {file = "torch-2.4.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:e8ac1985c3ff0f60d85b991954cfc2cc25f79c84545aead422763148ed2759e3"}, - {file = "torch-2.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:91e326e2ccfb1496e3bee58f70ef605aeb27bd26be07ba64f37dcaac3d070ada"}, - {file = "torch-2.4.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:d36a8ef100f5bff3e9c3cea934b9e0d7ea277cb8210c7152d34a9a6c5830eadd"}, - {file = "torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:0b5f88afdfa05a335d80351e3cea57d38e578c8689f751d35e0ff36bce872113"}, - 
{file = "torch-2.4.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:ef503165f2341942bfdf2bd520152f19540d0c0e34961232f134dc59ad435be8"}, - {file = "torch-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:092e7c2280c860eff762ac08c4bdcd53d701677851670695e0c22d6d345b269c"}, - {file = "torch-2.4.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:ddddbd8b066e743934a4200b3d54267a46db02106876d21cf31f7da7a96f98ea"}, - {file = "torch-2.4.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:fdc4fe11db3eb93c1115d3e973a27ac7c1a8318af8934ffa36b0370efe28e042"}, - {file = "torch-2.4.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:18835374f599207a9e82c262153c20ddf42ea49bc76b6eadad8e5f49729f6e4d"}, - {file = "torch-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:ebea70ff30544fc021d441ce6b219a88b67524f01170b1c538d7d3ebb5e7f56c"}, - {file = "torch-2.4.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:72b484d5b6cec1a735bf3fa5a1c4883d01748698c5e9cfdbeb4ffab7c7987e0d"}, - {file = "torch-2.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:c99e1db4bf0c5347107845d715b4aa1097e601bdc36343d758963055e9599d93"}, - {file = "torch-2.4.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:b57f07e92858db78c5b72857b4f0b33a65b00dc5d68e7948a8494b0314efb880"}, - {file = "torch-2.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:f18197f3f7c15cde2115892b64f17c80dbf01ed72b008020e7da339902742cf6"}, - {file = "torch-2.4.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:5fc1d4d7ed265ef853579caf272686d1ed87cebdcd04f2a498f800ffc53dab71"}, - {file = "torch-2.4.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:40f6d3fe3bae74efcf08cb7f8295eaddd8a838ce89e9d26929d4edd6d5e4329d"}, - {file = "torch-2.4.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:c9299c16c9743001ecef515536ac45900247f4338ecdf70746f2461f9e4831db"}, - {file = "torch-2.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:6bce130f2cd2d52ba4e2c6ada461808de7e5eccbac692525337cfb4c19421846"}, - {file = "torch-2.4.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:a38de2803ee6050309aac032676536c3d3b6a9804248537e38e098d0e14817ec"}, + {file = "torch-2.5.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:7f179373a047b947dec448243f4e6598a1c960fa3bb978a9a7eecd529fbc363f"}, + {file = "torch-2.5.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:15fbc95e38d330e5b0ef1593b7bc0a19f30e5bdad76895a5cffa1a6a044235e9"}, + {file = "torch-2.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:f499212f1cffea5d587e5f06144630ed9aa9c399bba12ec8905798d833bd1404"}, + {file = "torch-2.5.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:c54db1fade17287aabbeed685d8e8ab3a56fea9dd8d46e71ced2da367f09a49f"}, + {file = "torch-2.5.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:499a68a756d3b30d10f7e0f6214dc3767b130b797265db3b1c02e9094e2a07be"}, + {file = "torch-2.5.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:9f3df8138a1126a851440b7d5a4869bfb7c9cc43563d64fd9d96d0465b581024"}, + {file = "torch-2.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b81da3bdb58c9de29d0e1361e52f12fcf10a89673f17a11a5c6c7da1cb1a8376"}, + {file = "torch-2.5.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:ba135923295d564355326dc409b6b7f5bd6edc80f764cdaef1fb0a1b23ff2f9c"}, + {file = "torch-2.5.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2dd40c885a05ef7fe29356cca81be1435a893096ceb984441d6e2c27aff8c6f4"}, + {file = "torch-2.5.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:bc52d603d87fe1da24439c0d5fdbbb14e0ae4874451d53f0120ffb1f6c192727"}, + {file = 
"torch-2.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:ea718746469246cc63b3353afd75698a288344adb55e29b7f814a5d3c0a7c78d"}, + {file = "torch-2.5.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:6de1fd253e27e7f01f05cd7c37929ae521ca23ca4620cfc7c485299941679112"}, + {file = "torch-2.5.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:83dcf518685db20912b71fc49cbddcc8849438cdb0e9dcc919b02a849e2cd9e8"}, + {file = "torch-2.5.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:65e0a60894435608334d68c8811e55fd8f73e5bf8ee6f9ccedb0064486a7b418"}, + {file = "torch-2.5.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:38c21ff1bd39f076d72ab06e3c88c2ea6874f2e6f235c9450816b6c8e7627094"}, + {file = "torch-2.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:ce4baeba9804da5a346e210b3b70826f5811330c343e4fe1582200359ee77fe5"}, + {file = "torch-2.5.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:03e53f577a96e4d41aca472da8faa40e55df89d2273664af390ce1f570e885bd"}, ] [package.dependencies] @@ -3281,25 +3422,26 @@ filelock = "*" fsspec = "*" jinja2 = "*" networkx = "*" -nvidia-cublas-cu12 = {version = "12.1.3.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-cupti-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-nvrtc-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-runtime-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cublas-cu12 = {version = "12.4.5.8", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-cupti-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-nvrtc-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-runtime-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cudnn-cu12 = {version = "9.1.0.70", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cufft-cu12 = {version = "11.0.2.54", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-curand-cu12 = {version = "10.3.2.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusolver-cu12 = {version = "11.4.5.107", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nccl-cu12 = {version = "2.20.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -setuptools = "*" -sympy = "*" -triton = {version = "3.0.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""} +nvidia-cufft-cu12 = {version = "11.2.1.3", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" 
and platform_machine == \"x86_64\""} +nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +setuptools = {version = "*", markers = "python_version >= \"3.12\""} +sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""} +triton = {version = "3.1.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""} typing-extensions = ">=4.8.0" [package.extras] opt-einsum = ["opt-einsum (>=3.3)"] -optree = ["optree (>=0.11.0)"] +optree = ["optree (>=0.12.0)"] [[package]] name = "tqdm" @@ -3323,13 +3465,13 @@ telegram = ["requests"] [[package]] name = "transformers" -version = "4.45.0" +version = "4.45.2" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.8.0" files = [ - {file = "transformers-4.45.0-py3-none-any.whl", hash = "sha256:f04a82926676056afb3bbf4df7d76ceb1fc2b2746247a87f3f9be4674adc95d7"}, - {file = "transformers-4.45.0.tar.gz", hash = "sha256:29629f87965acc7b15e458a24580832d85da18ddc119410211747fe778c200ce"}, + {file = "transformers-4.45.2-py3-none-any.whl", hash = "sha256:c551b33660cfc815bae1f9f097ecfd1e65be623f13c6ee0dda372bd881460210"}, + {file = "transformers-4.45.2.tar.gz", hash = "sha256:72bc390f6b203892561f05f86bbfaa0e234aab8e927a83e62b9d92ea7e3ae101"}, ] [package.dependencies] @@ -3392,16 +3534,16 @@ vision = ["Pillow (>=10.0.1,<=15.0)"] [[package]] name = "triton" -version = "3.0.0" +version = "3.1.0" description = "A language and compiler for custom Deep Learning operations" optional = true python-versions = "*" files = [ - {file = "triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e1efef76935b2febc365bfadf74bcb65a6f959a9872e5bddf44cc9e0adce1e1a"}, - {file = "triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5ce8520437c602fb633f1324cc3871c47bee3b67acf9756c1a66309b60e3216c"}, - {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, - {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, - {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, + {file = "triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8"}, + {file = "triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c"}, + {file = "triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc"}, + {file = "triton-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dadaca7fc24de34e180271b5cf864c16755702e9f63a16f62df714a8099126a"}, + {file = "triton-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:aafa9a20cd0d9fee523cd4504aa7131807a864cd77dcf6efe7e981f18b8c6c11"}, ] [package.dependencies] @@ -3698,108 +3840,99 @@ files = [ [[package]] name = "yarl" -version = "1.12.1" +version = "1.16.0" description = "Yet another URL library" optional = true -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "yarl-1.12.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:64c5b0f2b937fe40d0967516eee5504b23cb247b8b7ffeba7213a467d9646fdc"}, - {file = "yarl-1.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2e430ac432f969ef21770645743611c1618362309e3ad7cab45acd1ad1a540ff"}, - {file = "yarl-1.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3e26e64f42bce5ddf9002092b2c37b13071c2e6413d5c05f9fa9de58ed2f7749"}, - {file = "yarl-1.12.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0103c52f8dfe5d573c856322149ddcd6d28f51b4d4a3ee5c4b3c1b0a05c3d034"}, - {file = "yarl-1.12.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b63465b53baeaf2122a337d4ab57d6bbdd09fcadceb17a974cfa8a0300ad9c67"}, - {file = "yarl-1.12.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17d4dc4ff47893a06737b8788ed2ba2f5ac4e8bb40281c8603920f7d011d5bdd"}, - {file = "yarl-1.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8b54949267bd5704324397efe9fbb6aa306466dee067550964e994d309db5f1"}, - {file = "yarl-1.12.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10b690cd78cbaca2f96a7462f303fdd2b596d3978b49892e4b05a7567c591572"}, - {file = "yarl-1.12.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:c85ab016e96a975afbdb9d49ca90f3bca9920ef27c64300843fe91c3d59d8d20"}, - {file = "yarl-1.12.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c1caa5763d1770216596e0a71b5567f27aac28c95992110212c108ec74589a48"}, - {file = "yarl-1.12.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:595bbcdbfc4a9c6989d7489dca8510cba053ff46b16c84ffd95ac8e90711d419"}, - {file = "yarl-1.12.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e64f0421892a207d3780903085c1b04efeb53b16803b23d947de5a7261b71355"}, - {file = "yarl-1.12.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:319c206e83e46ec2421b25b300c8482b6fe8a018baca246be308c736d9dab267"}, - {file = "yarl-1.12.1-cp310-cp310-win32.whl", hash = "sha256:da045bd1147d12bd43fb032296640a7cc17a7f2eaba67495988362e99db24fd2"}, - {file = "yarl-1.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:aebbd47df77190ada603157f0b3670d578c110c31746ecc5875c394fdcc59a99"}, - {file = "yarl-1.12.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:28389a68981676bf74e2e199fe42f35d1aa27a9c98e3a03e6f58d2d3d054afe1"}, - {file = "yarl-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f736f54565f8dd7e3ab664fef2bc461d7593a389a7f28d4904af8d55a91bd55f"}, - {file = "yarl-1.12.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dee0496d5f1a8f57f0f28a16f81a2033fc057a2cf9cd710742d11828f8c80e2"}, - {file = "yarl-1.12.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8981a94a27ac520a398302afb74ae2c0be1c3d2d215c75c582186a006c9e7b0"}, - {file = "yarl-1.12.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ff54340fc1129e8e181827e2234af3ff659b4f17d9bbe77f43bc19e6577fadec"}, - {file = "yarl-1.12.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:54c8cee662b5f8c30ad7eedfc26123f845f007798e4ff1001d9528fe959fd23c"}, - {file = 
"yarl-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e97a29b37830ba1262d8dfd48ddb5b28ad4d3ebecc5d93a9c7591d98641ec737"}, - {file = "yarl-1.12.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6c89894cc6f6ddd993813e79244b36b215c14f65f9e4f1660b1f2ba9e5594b95"}, - {file = "yarl-1.12.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:712ba8722c0699daf186de089ddc4677651eb9875ed7447b2ad50697522cbdd9"}, - {file = "yarl-1.12.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6e9a9f50892153bad5046c2a6df153224aa6f0573a5a8ab44fc54a1e886f6e21"}, - {file = "yarl-1.12.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1d4017e78fb22bc797c089b746230ad78ecd3cdb215bc0bd61cb72b5867da57e"}, - {file = "yarl-1.12.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:f494c01b28645c431239863cb17af8b8d15b93b0d697a0320d5dd34cd9d7c2fa"}, - {file = "yarl-1.12.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:de4544b1fb29cf14870c4e2b8a897c0242449f5dcebd3e0366aa0aa3cf58a23a"}, - {file = "yarl-1.12.1-cp311-cp311-win32.whl", hash = "sha256:7564525a4673fde53dee7d4c307a961c0951918f0b8c7f09b2c9e02067cf6504"}, - {file = "yarl-1.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:f23bb1a7a6e8e8b612a164fdd08e683bcc16c76f928d6dbb7bdbee2374fbfee6"}, - {file = "yarl-1.12.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a3e2aff8b822ab0e0bdbed9f50494b3a35629c4b9488ae391659973a37a9f53f"}, - {file = "yarl-1.12.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:22dda2799c8d39041d731e02bf7690f0ef34f1691d9ac9dfcb98dd1e94c8b058"}, - {file = "yarl-1.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:18c2a7757561f05439c243f517dbbb174cadfae3a72dee4ae7c693f5b336570f"}, - {file = "yarl-1.12.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:835010cc17d0020e7931d39e487d72c8e01c98e669b6896a8b8c9aa8ca69a949"}, - {file = "yarl-1.12.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2254fe137c4a360b0a13173a56444f756252c9283ba4d267ca8e9081cd140ea"}, - {file = "yarl-1.12.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f6a071d2c3d39b4104f94fc08ab349e9b19b951ad4b8e3b6d7ea92d6ef7ccaf8"}, - {file = "yarl-1.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73a183042ae0918c82ce2df38c3db2409b0eeae88e3afdfc80fb67471a95b33b"}, - {file = "yarl-1.12.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:326b8a079a9afcac0575971e56dabdf7abb2ea89a893e6949b77adfeb058b50e"}, - {file = "yarl-1.12.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:126309c0f52a2219b3d1048aca00766429a1346596b186d51d9fa5d2070b7b13"}, - {file = "yarl-1.12.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:ba1c779b45a399cc25f511c681016626f69e51e45b9d350d7581998722825af9"}, - {file = "yarl-1.12.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:af1107299cef049ad00a93df4809517be432283a0847bcae48343ebe5ea340dc"}, - {file = "yarl-1.12.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:20d817c0893191b2ab0ba30b45b77761e8dfec30a029b7c7063055ca71157f84"}, - {file = "yarl-1.12.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d4f818f6371970d6a5d1e42878389bbfb69dcde631e4bbac5ec1cb11158565ca"}, - {file = "yarl-1.12.1-cp312-cp312-win32.whl", hash = "sha256:0ac33d22b2604b020569a82d5f8a03ba637ba42cc1adf31f616af70baf81710b"}, - {file = "yarl-1.12.1-cp312-cp312-win_amd64.whl", hash 
= "sha256:fd24996e12e1ba7c397c44be75ca299da14cde34d74bc5508cce233676cc68d0"}, - {file = "yarl-1.12.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dea360778e0668a7ad25d7727d03364de8a45bfd5d808f81253516b9f2217765"}, - {file = "yarl-1.12.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1f50a37aeeb5179d293465e522fd686080928c4d89e0ff215e1f963405ec4def"}, - {file = "yarl-1.12.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0274b1b7a9c9c32b7bf250583e673ff99fb9fccb389215841e2652d9982de740"}, - {file = "yarl-1.12.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4f3ab9eb8ab2d585ece959c48d234f7b39ac0ca1954a34d8b8e58a52064bdb3"}, - {file = "yarl-1.12.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8d31dd0245d88cf7239e96e8f2a99f815b06e458a5854150f8e6f0e61618d41b"}, - {file = "yarl-1.12.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a96198d5d26f40557d986c1253bfe0e02d18c9d9b93cf389daf1a3c9f7c755fa"}, - {file = "yarl-1.12.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddae504cfb556fe220efae65e35be63cd11e3c314b202723fc2119ce19f0ca2e"}, - {file = "yarl-1.12.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bce00f3b1f7f644faae89677ca68645ed5365f1c7f874fdd5ebf730a69640d38"}, - {file = "yarl-1.12.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eee5ff934b0c9f4537ff9596169d56cab1890918004791a7a06b879b3ba2a7ef"}, - {file = "yarl-1.12.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4ea99e64b2ad2635e0f0597b63f5ea6c374791ff2fa81cdd4bad8ed9f047f56f"}, - {file = "yarl-1.12.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:5c667b383529520b8dd6bd496fc318678320cb2a6062fdfe6d3618da6b8790f6"}, - {file = "yarl-1.12.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d920401941cb898ef089422e889759dd403309eb370d0e54f1bdf6ca07fef603"}, - {file = "yarl-1.12.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:501a1576716032cc6d48c7c47bcdc42d682273415a8f2908e7e72cb4625801f3"}, - {file = "yarl-1.12.1-cp313-cp313-win32.whl", hash = "sha256:24416bb5e221e29ddf8aac5b97e94e635ca2c5be44a1617ad6fe32556df44294"}, - {file = "yarl-1.12.1-cp313-cp313-win_amd64.whl", hash = "sha256:71af3766bb46738d12cc288d9b8de7ef6f79c31fd62757e2b8a505fe3680b27f"}, - {file = "yarl-1.12.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c924deab8105f86980983eced740433fb7554a7f66db73991affa4eda99d5402"}, - {file = "yarl-1.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5fb475a4cdde582c9528bb412b98f899680492daaba318231e96f1a0a1bb0d53"}, - {file = "yarl-1.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:36ee0115b9edca904153a66bb74a9ff1ce38caff015de94eadfb9ba8e6ecd317"}, - {file = "yarl-1.12.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2631c9d7386bd2d4ce24ecc6ebf9ae90b3efd713d588d90504eaa77fec4dba01"}, - {file = "yarl-1.12.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2376d8cf506dffd0e5f2391025ae8675b09711016656590cb03b55894161fcfa"}, - {file = "yarl-1.12.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:24197ba3114cc85ddd4091e19b2ddc62650f2e4a899e51b074dfd52d56cf8c72"}, - {file = "yarl-1.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfdf419bf5d3644f94cd7052954fc233522f5a1b371fc0b00219ebd9c14d5798"}, - {file = 
"yarl-1.12.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8112f640a4f7e7bf59f7cabf0d47a29b8977528c521d73a64d5cc9e99e48a174"}, - {file = "yarl-1.12.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:607d12f0901f6419a8adceb139847c42c83864b85371f58270e42753f9780fa6"}, - {file = "yarl-1.12.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:664380c7ed524a280b6a2d5d9126389c3e96cd6e88986cdb42ca72baa27421d6"}, - {file = "yarl-1.12.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:0d0a5e87bc48d76dfcfc16295201e9812d5f33d55b4a0b7cad1025b92bf8b91b"}, - {file = "yarl-1.12.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:eff6bac402719c14e17efe845d6b98593c56c843aca6def72080fbede755fd1f"}, - {file = "yarl-1.12.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:22839d1d1eab9e4b427828a88a22beb86f67c14d8ff81175505f1cc8493f3500"}, - {file = "yarl-1.12.1-cp38-cp38-win32.whl", hash = "sha256:717f185086bb9d817d4537dd18d5df5d657598cd00e6fc22e4d54d84de266c1d"}, - {file = "yarl-1.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:71978ba778948760cff528235c951ea0ef7a4f9c84ac5a49975f8540f76c3f73"}, - {file = "yarl-1.12.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:30ffc046ebddccb3c4cac72c1a3e1bc343492336f3ca86d24672e90ccc5e788a"}, - {file = "yarl-1.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f10954b233d4df5cc3137ffa5ced97f8894152df817e5d149bf05a0ef2ab8134"}, - {file = "yarl-1.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2e912b282466444023610e4498e3795c10e7cfd641744524876239fcf01d538d"}, - {file = "yarl-1.12.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6af871f70cfd5b528bd322c65793b5fd5659858cdfaa35fbe563fb99b667ed1f"}, - {file = "yarl-1.12.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c3e4e1f7b08d1ec6b685ccd3e2d762219c550164fbf524498532e39f9413436e"}, - {file = "yarl-1.12.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9a7ee79183f0b17dcede8b6723e7da2ded529cf159a878214be9a5d3098f5b1e"}, - {file = "yarl-1.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96c8ff1e1dd680e38af0887927cab407a4e51d84a5f02ae3d6eb87233036c763"}, - {file = "yarl-1.12.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e9905fc2dc1319e4c39837b906a024cf71b1261cc66b0cd89678f779c0c61f5"}, - {file = "yarl-1.12.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:01549468858b87d36f967c97d02e6e54106f444aeb947ed76f8f71f85ed07cec"}, - {file = "yarl-1.12.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:96b34830bd6825ca0220bf005ea99ac83eb9ce51301ddb882dcf613ae6cd95fb"}, - {file = "yarl-1.12.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:2aee7594d2c2221c717a8e394bbed4740029df4c0211ceb0f04815686e99c795"}, - {file = "yarl-1.12.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:15871130439ad10abb25a4631120d60391aa762b85fcab971411e556247210a0"}, - {file = "yarl-1.12.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:838dde2cb570cfbb4cab8a876a0974e8b90973ea40b3ac27a79b8a74c8a2db15"}, - {file = "yarl-1.12.1-cp39-cp39-win32.whl", hash = "sha256:eacbcf30efaca7dc5cb264228ffecdb95fdb1e715b1ec937c0ce6b734161e0c8"}, - {file = "yarl-1.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:76a59d1b63de859398bc7764c860a769499511463c1232155061fe0147f13e01"}, - {file = "yarl-1.12.1-py3-none-any.whl", hash = "sha256:dc3192a81ecd5ff954cecd690327badd5a84d00b877e1573f7c9097ce13e5bfb"}, - {file = 
"yarl-1.12.1.tar.gz", hash = "sha256:5b860055199aec8d6fe4dcee3c5196ce506ca198a50aab0059ffd26e8e815828"}, + {file = "yarl-1.16.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:32468f41242d72b87ab793a86d92f885355bcf35b3355aa650bfa846a5c60058"}, + {file = "yarl-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:234f3a3032b505b90e65b5bc6652c2329ea7ea8855d8de61e1642b74b4ee65d2"}, + {file = "yarl-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a0296040e5cddf074c7f5af4a60f3fc42c0237440df7bcf5183be5f6c802ed5"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de6c14dd7c7c0badba48157474ea1f03ebee991530ba742d381b28d4f314d6f3"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b140e532fe0266003c936d017c1ac301e72ee4a3fd51784574c05f53718a55d8"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:019f5d58093402aa8f6661e60fd82a28746ad6d156f6c5336a70a39bd7b162b9"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c42998fd1cbeb53cd985bff0e4bc25fbe55fd6eb3a545a724c1012d69d5ec84"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c7c30fb38c300fe8140df30a046a01769105e4cf4282567a29b5cdb635b66c4"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e49e0fd86c295e743fd5be69b8b0712f70a686bc79a16e5268386c2defacaade"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:b9ca7b9147eb1365c8bab03c003baa1300599575effad765e0b07dd3501ea9af"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:27e11db3f1e6a51081a981509f75617b09810529de508a181319193d320bc5c7"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8994c42f4ca25df5380ddf59f315c518c81df6a68fed5bb0c159c6cb6b92f120"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:542fa8e09a581bcdcbb30607c7224beff3fdfb598c798ccd28a8184ffc18b7eb"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2bd6a51010c7284d191b79d3b56e51a87d8e1c03b0902362945f15c3d50ed46b"}, + {file = "yarl-1.16.0-cp310-cp310-win32.whl", hash = "sha256:178ccb856e265174a79f59721031060f885aca428983e75c06f78aa24b91d929"}, + {file = "yarl-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe8bba2545427418efc1929c5c42852bdb4143eb8d0a46b09de88d1fe99258e7"}, + {file = "yarl-1.16.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d8643975a0080f361639787415a038bfc32d29208a4bf6b783ab3075a20b1ef3"}, + {file = "yarl-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:676d96bafc8c2d0039cea0cd3fd44cee7aa88b8185551a2bb93354668e8315c2"}, + {file = "yarl-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d9525f03269e64310416dbe6c68d3b23e5d34aaa8f47193a1c45ac568cecbc49"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b37d5ec034e668b22cf0ce1074d6c21fd2a08b90d11b1b73139b750a8b0dd97"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f32c4cb7386b41936894685f6e093c8dfaf0960124d91fe0ec29fe439e201d0"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b8e265a0545637492a7e12fd7038370d66c9375a61d88c5567d0e044ded9202"}, + {file = 
"yarl-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:789a3423f28a5fff46fbd04e339863c169ece97c827b44de16e1a7a42bc915d2"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1d1f45e3e8d37c804dca99ab3cf4ab3ed2e7a62cd82542924b14c0a4f46d243"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:621280719c4c5dad4c1391160a9b88925bb8b0ff6a7d5af3224643024871675f"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ed097b26f18a1f5ff05f661dc36528c5f6735ba4ce8c9645e83b064665131349"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:2f1fe2b2e3ee418862f5ebc0c0083c97f6f6625781382f828f6d4e9b614eba9b"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:87dd10bc0618991c66cee0cc65fa74a45f4ecb13bceec3c62d78ad2e42b27a16"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:4199db024b58a8abb2cfcedac7b1292c3ad421684571aeb622a02f242280e8d6"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:99a9dcd4b71dd5f5f949737ab3f356cfc058c709b4f49833aeffedc2652dac56"}, + {file = "yarl-1.16.0-cp311-cp311-win32.whl", hash = "sha256:a9394c65ae0ed95679717d391c862dece9afacd8fa311683fc8b4362ce8a410c"}, + {file = "yarl-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:5b9101f528ae0f8f65ac9d64dda2bb0627de8a50344b2f582779f32fda747c1d"}, + {file = "yarl-1.16.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:4ffb7c129707dd76ced0a4a4128ff452cecf0b0e929f2668ea05a371d9e5c104"}, + {file = "yarl-1.16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1a5e9d8ce1185723419c487758d81ac2bde693711947032cce600ca7c9cda7d6"}, + {file = "yarl-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d743e3118b2640cef7768ea955378c3536482d95550222f908f392167fe62059"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26768342f256e6e3c37533bf9433f5f15f3e59e3c14b2409098291b3efaceacb"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1b0796168b953bca6600c5f97f5ed407479889a36ad7d17183366260f29a6b9"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:858728086914f3a407aa7979cab743bbda1fe2bdf39ffcd991469a370dd7414d"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5570e6d47bcb03215baf4c9ad7bf7c013e56285d9d35013541f9ac2b372593e7"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66ea8311422a7ba1fc79b4c42c2baa10566469fe5a78500d4e7754d6e6db8724"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:649bddcedee692ee8a9b7b6e38582cb4062dc4253de9711568e5620d8707c2a3"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:3a91654adb7643cb21b46f04244c5a315a440dcad63213033826549fa2435f71"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b439cae82034ade094526a8f692b9a2b5ee936452de5e4c5f0f6c48df23f8604"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:571f781ae8ac463ce30bacebfaef2c6581543776d5970b2372fbe31d7bf31a07"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:aa7943f04f36d6cafc0cf53ea89824ac2c37acbdb4b316a654176ab8ffd0f968"}, + {file = 
"yarl-1.16.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1a5cf32539373ff39d97723e39a9283a7277cbf1224f7aef0c56c9598b6486c3"}, + {file = "yarl-1.16.0-cp312-cp312-win32.whl", hash = "sha256:a5b6c09b9b4253d6a208b0f4a2f9206e511ec68dce9198e0fbec4f160137aa67"}, + {file = "yarl-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:1208ca14eed2fda324042adf8d6c0adf4a31522fa95e0929027cd487875f0240"}, + {file = "yarl-1.16.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5ace0177520bd4caa99295a9b6fb831d0e9a57d8e0501a22ffaa61b4c024283"}, + {file = "yarl-1.16.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7118bdb5e3ed81acaa2095cba7ec02a0fe74b52a16ab9f9ac8e28e53ee299732"}, + {file = "yarl-1.16.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38fec8a2a94c58bd47c9a50a45d321ab2285ad133adefbbadf3012c054b7e656"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8791d66d81ee45866a7bb15a517b01a2bcf583a18ebf5d72a84e6064c417e64b"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1cf936ba67bc6c734f3aa1c01391da74ab7fc046a9f8bbfa230b8393b90cf472"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1aab176dd55b59f77a63b27cffaca67d29987d91a5b615cbead41331e6b7428"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:995d0759004c08abd5d1b81300a91d18c8577c6389300bed1c7c11675105a44d"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1bc22e00edeb068f71967ab99081e9406cd56dbed864fc3a8259442999d71552"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:35b4f7842154176523e0a63c9b871168c69b98065d05a4f637fce342a6a2693a"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:7ace71c4b7a0c41f317ae24be62bb61e9d80838d38acb20e70697c625e71f120"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8f639e3f5795a6568aa4f7d2ac6057c757dcd187593679f035adbf12b892bb00"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e8be3aff14f0120ad049121322b107f8a759be76a6a62138322d4c8a337a9e2c"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:122d8e7986043d0549e9eb23c7fd23be078be4b70c9eb42a20052b3d3149c6f2"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0fd9c227990f609c165f56b46107d0bc34553fe0387818c42c02f77974402c36"}, + {file = "yarl-1.16.0-cp313-cp313-win32.whl", hash = "sha256:595ca5e943baed31d56b33b34736461a371c6ea0038d3baec399949dd628560b"}, + {file = "yarl-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:921b81b8d78f0e60242fb3db615ea3f368827a76af095d5a69f1c3366db3f596"}, + {file = "yarl-1.16.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab2b2ac232110a1fdb0d3ffcd087783edd3d4a6ced432a1bf75caf7b7be70916"}, + {file = "yarl-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7f8713717a09acbfee7c47bfc5777e685539fefdd34fa72faf504c8be2f3df4e"}, + {file = "yarl-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cdcffe1dbcb4477d2b4202f63cd972d5baa155ff5a3d9e35801c46a415b7f71a"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a91217208306d82357c67daeef5162a41a28c8352dab7e16daa82e3718852a7"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:3ab3ed42c78275477ea8e917491365e9a9b69bb615cb46169020bd0aa5e2d6d3"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:707ae579ccb3262dfaef093e202b4c3fb23c3810e8df544b1111bd2401fd7b09"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7a852d1cd0b8d8b37fc9d7f8581152add917a98cfe2ea6e241878795f917ae"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3f1cc3d3d4dc574bebc9b387f6875e228ace5748a7c24f49d8f01ac1bc6c31b"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5ff96da263740779b0893d02b718293cc03400c3a208fc8d8cd79d9b0993e532"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:3d375a19ba2bfe320b6d873f3fb165313b002cef8b7cc0a368ad8b8a57453837"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:62c7da0ad93a07da048b500514ca47b759459ec41924143e2ddb5d7e20fd3db5"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:147b0fcd0ee33b4b5f6edfea80452d80e419e51b9a3f7a96ce98eaee145c1581"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:504e1fe1cc4f170195320eb033d2b0ccf5c6114ce5bf2f617535c01699479bca"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:bdcf667a5dec12a48f669e485d70c54189f0639c2157b538a4cffd24a853624f"}, + {file = "yarl-1.16.0-cp39-cp39-win32.whl", hash = "sha256:e9951afe6557c75a71045148890052cb942689ee4c9ec29f5436240e1fcc73b7"}, + {file = "yarl-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:7d7aaa8ff95d0840e289423e7dc35696c2b058d635f945bf05b5cd633146b027"}, + {file = "yarl-1.16.0-py3-none-any.whl", hash = "sha256:e6980a558d8461230c457218bd6c92dfc1d10205548215c2c21d79dc8d0a96f3"}, + {file = "yarl-1.16.0.tar.gz", hash = "sha256:b6f687ced5510a9a2474bbae96a4352e5ace5fa34dc44a217b0537fec1db00b4"}, ] [package.dependencies] idna = ">=2.0" multidict = ">=4.0" +propcache = ">=0.2.0" [[package]] name = "zipp" @@ -3833,4 +3966,4 @@ torch = ["torch"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "be5965e4e13fed32347983dba1690661f3e59a50d59fa26d4f2a8345418dd5a1" +content-hash = "500fa44255e4a6c89a16314a931548447afe1ba71ea341a73cad6670e46ddac7" diff --git a/server/pyproject.toml b/server/pyproject.toml index 6ea4718d715..d08d0b8f488 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -41,10 +41,10 @@ py-cpuinfo = "^9.0.0" numpy = "^1.26" marlin-kernels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, + { url = 
"https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true }, ] moe-kernels = [ { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index 5de75b6b3e3..e3f6d20f8c7 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -1,5 +1,5 @@ certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13" +charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" @@ -10,7 +10,7 @@ googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.66.1 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.67.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" huggingface-hub==0.23.5 ; python_version >= "3.9" and python_version < "3.13" idna==3.10 ; python_version >= "3.9" and python_version < "3.13" @@ -38,14 +38,14 @@ pygments==2.18.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" -rich==13.8.1 ; python_version >= "3.9" and python_version < "3.13" +rich==13.9.3 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.1.0 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.0 ; python_version >= "3.9" and python_version < "3.13" +setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" +tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.45.0 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/requirements_intel.txt 
b/server/requirements_intel.txt index 5de75b6b3e3..e3f6d20f8c7 100644 --- a/server/requirements_intel.txt +++ b/server/requirements_intel.txt @@ -1,5 +1,5 @@ certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13" +charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" @@ -10,7 +10,7 @@ googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.66.1 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.67.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" huggingface-hub==0.23.5 ; python_version >= "3.9" and python_version < "3.13" idna==3.10 ; python_version >= "3.9" and python_version < "3.13" @@ -38,14 +38,14 @@ pygments==2.18.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" -rich==13.8.1 ; python_version >= "3.9" and python_version < "3.13" +rich==13.9.3 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.1.0 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.0 ; python_version >= "3.9" and python_version < "3.13" +setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" +tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.45.0 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt index 5de75b6b3e3..e3f6d20f8c7 100644 --- a/server/requirements_rocm.txt +++ b/server/requirements_rocm.txt @@ -1,5 +1,5 @@ certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13" +charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" @@ -10,7 +10,7 @@ googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < 
grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.66.1 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.67.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" huggingface-hub==0.23.5 ; python_version >= "3.9" and python_version < "3.13" idna==3.10 ; python_version >= "3.9" and python_version < "3.13" @@ -38,14 +38,14 @@ pygments==2.18.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" -rich==13.8.1 ; python_version >= "3.9" and python_version < "3.13" +rich==13.9.3 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.1.0 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.0 ; python_version >= "3.9" and python_version < "3.13" +setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" +tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.45.0 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/tests/conftest.py b/server/tests/conftest.py index d99771f8ad0..9822279214f 100644 --- a/server/tests/conftest.py +++ b/server/tests/conftest.py @@ -2,7 +2,7 @@ import os from text_generation_server.pb import generate_pb2 -os.environ["USE_PREFIX_CACHING"] = "1" +os.environ["PREFIX_CACHING"] = "1" os.environ["ATTENTION"] = "flashinfer" diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py index db390234e43..a363b33a89a 100644 --- a/server/text_generation_server/cli.py +++ b/server/text_generation_server/cli.py @@ -31,6 +31,7 @@ class Dtype(str, Enum): class KVCacheDtype(str, Enum): + fp8_e4m3fn = "fp8_e4m3fn" fp8_e5m2 = "fp8_e5m2" diff --git a/server/text_generation_server/interceptor.py b/server/text_generation_server/interceptor.py index 57df172575a..a5c023e4ecf 100644 --- a/server/text_generation_server/interceptor.py +++ b/server/text_generation_server/interceptor.py @@ -9,6 +9,9 @@ class ExceptionInterceptor(AsyncServerInterceptor): + def __init__(self, shutdown_callback): + self.shutdown_callback = shutdown_callback + async def intercept( self, method: Callable, @@ -25,7 +28,7 @@ async def intercept( # Runtime Error cannot be recovered from if isinstance(err, RuntimeError): - exit(1) + self.shutdown_callback() if torch.cuda.is_available(): torch.cuda.empty_cache() diff --git a/server/text_generation_server/layers/attention/__init__.py b/server/text_generation_server/layers/attention/__init__.py index cc7f0caaad9..ebe32042c46 100644 --- 
a/server/text_generation_server/layers/attention/__init__.py +++ b/server/text_generation_server/layers/attention/__init__.py @@ -8,39 +8,32 @@ raise ImportError("`USE_FLASH_ATTENTION` is false.") if SYSTEM == "cuda": from .cuda import ( - PREFILL_IN_KV_CACHE, SUPPORTS_WINDOWING, attention, paged_attention, - reshape_and_cache, ) elif SYSTEM == "rocm": from .rocm import ( - PREFILL_IN_KV_CACHE, SUPPORTS_WINDOWING, attention, paged_attention, - reshape_and_cache, ) elif SYSTEM == "ipex": from .ipex import ( - PREFILL_IN_KV_CACHE, SUPPORTS_WINDOWING, attention, paged_attention, - reshape_and_cache, ) else: raise ImportError(f"System {SYSTEM} doesn't support flash/paged attention") # KVCache needs `reshape_and_cache`, so ensure that it is defined already. -from .kv_cache import KVCache +from .kv_cache import KVCache, get_kv_scales __all__ = [ "attention", + "get_kv_scales", "paged_attention", - "reshape_and_cache", - "PREFILL_IN_KV_CACHE", "SUPPORTS_WINDOWING", "KVCache", "Seqlen", diff --git a/server/text_generation_server/layers/attention/common.py b/server/text_generation_server/layers/attention/common.py index c8ac0c2ab95..a3b919ee4fd 100644 --- a/server/text_generation_server/layers/attention/common.py +++ b/server/text_generation_server/layers/attention/common.py @@ -1,72 +1,52 @@ from dataclasses import dataclass -from text_generation_server.utils.import_utils import SYSTEM -from text_generation_server.models.globals import ATTENTION import torch from typing import Optional -if ATTENTION in {"flashinfer", "flashdecoding"}: - - @dataclass - class Seqlen: - input_lengths: torch.Tensor - prefix_lengths: torch.Tensor - cu_seqlen_q: Optional[torch.Tensor] - cu_seqlen_k: Optional[torch.Tensor] - max_q: int - max_k: int - - def __init__( - self, - input_lengths, - prefix_lengths, - cu_seqlen_q=None, - max_q=None, - max_k=None, - ): - self.input_lengths = input_lengths - self.prefix_lengths = prefix_lengths - device = self.input_lengths.device - shape = self.input_lengths.shape - if cu_seqlen_q is None: - cu_seqlen_q = torch.arange( - shape[0] + 1, - device=device, - dtype=torch.int32, - ) - max_q = 1 - else: - assert max_q is not None - assert max_k is not None - cu_seqlen_k = torch.zeros(shape[-1] + 1, device=device, dtype=torch.int32) - - # cuda graphs don't like this and this is necessary to clamp within mistral - # Although FA2 might not want the clamping - # cu_seqlen_k[0] = 0 - total = self.input_lengths + self.prefix_lengths - torch.cumsum(total, -1, out=cu_seqlen_k[1:]) - - self.cu_seqlen_q = cu_seqlen_q - self.cu_seqlen_k = cu_seqlen_k - self.max_q = max_q - self.max_k = max_k - - def clamp(self, max): - # Flash decoding doesn't need to clamp - return self - -else: - - @dataclass - class Seqlen: - input_lengths: torch.Tensor - prefix_lengths: torch.Tensor - cu_seqlen_q: torch.Tensor - max_q: int - max_k: int - - def clamp(self, max): - if SYSTEM == "rocm": - return self - self.input_lengths = torch.clamp(self.input_lengths, max=max) - return self +@dataclass +class Seqlen: + input_lengths: torch.Tensor + cache_lengths: torch.Tensor + cu_seqlen_q: Optional[torch.Tensor] + cu_seqlen_k: Optional[torch.Tensor] + max_q: int + max_k: int + + def __init__( + self, + input_lengths, + cache_lengths, + cu_seqlen_q=None, + max_q=None, + max_k=None, + ): + self.input_lengths = input_lengths + self.cache_lengths = cache_lengths + device = self.input_lengths.device + shape = self.input_lengths.shape + if cu_seqlen_q is None: + cu_seqlen_q = torch.arange( + shape[0] + 1, + device=device, + 
dtype=torch.int32, + ) + max_q = 1 + else: + assert max_q is not None + assert max_k is not None + cu_seqlen_k = torch.zeros(shape[-1] + 1, device=device, dtype=torch.int32) + + # cuda graphs don't like this and this is necessary to clamp within mistral + # Although FA2 might not want the clamping + # cu_seqlen_k[0] = 0 + total = self.input_lengths + self.cache_lengths + torch.cumsum(total, -1, out=cu_seqlen_k[1:]) + + self.cu_seqlen_q = cu_seqlen_q + self.cu_seqlen_k = cu_seqlen_k + self.max_q = max_q + self.max_k = max_k + + def clamp(self, max): + # Flash decoding doesn't need to clamp + return self diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py index cd3ea369b61..d705afb0bd2 100644 --- a/server/text_generation_server/layers/attention/cuda.py +++ b/server/text_generation_server/layers/attention/cuda.py @@ -1,4 +1,5 @@ import torch +from text_generation_server.layers.attention.kv_cache import KVCache, KVScales from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.models.globals import ( ATTENTION, @@ -7,44 +8,22 @@ from text_generation_server.layers.attention import Seqlen from typing import Optional + major, minor = torch.cuda.get_device_capability() is_sm75 = major == 7 and minor == 5 _PARTITION_SIZE = 512 -try: - from vllm._C import cache_ops -except Exception as e: - raise ImportError( - f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}" - ) - - -def reshape_and_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slots: torch.Tensor, -): - if ATTENTION in {"flashdecoding", "flashinfer"}: - shape = key_cache.shape - key_cache.view(-1, shape[-2], shape[-1])[slots] = key - value_cache.view(-1, shape[-2], shape[-1])[slots] = value - else: - cache_ops.reshape_and_cache( - key, value, key_cache, value_cache, slots, "auto", 1.0 - ) - def paged_attention( query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, + kv_cache: KVCache, kv_head_mapping: torch.Tensor, softmax_scale: float, block_tables: torch.Tensor, seqlen: Seqlen, max_s: int, + *, + kv_scales: KVScales, softcap: Optional[float] = None, ): # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py @@ -70,6 +49,8 @@ def paged_attention( num_seqs, num_heads, head_size = query.shape max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE + can_scale = kv_cache.can_scale(kv_scales) + # NOTE(woosuk): We use a simple heuristic to decide whether to use # PagedAttention V1 or V2. If the number of partitions is 1, we use # V1 to avoid the overhead of reduction. Also, if the number of @@ -79,10 +60,13 @@ def paged_attention( from text_generation_server.layers.attention.flashinfer import decode_state return decode_state.get().forward( + # TODO: remove `contiguous` call once https://github.com/flashinfer-ai/flashinfer/pull/553 is merged. 
query.contiguous(), - paged_kv_cache=(key_cache, value_cache), + paged_kv_cache=(kv_cache.key, kv_cache.value), logits_soft_cap=softcap, sm_scale=softmax_scale, + k_scale=kv_scales.key_scale_cpu if can_scale else 1.0, + v_scale=kv_scales.value_scale_cpu if can_scale else 1.0, ) elif ATTENTION == "flashdecoding": max_q = 1 @@ -98,8 +82,8 @@ def paged_attention( softcap = 0.0 out = flash_attn_2_cuda.varlen_fwd( query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, None, seqlen.cu_seqlen_q, seqlen.cu_seqlen_k, @@ -123,7 +107,7 @@ def paged_attention( else: if softcap is not None: raise RuntimeError("Paged attention doesn't support softcapping") - input_lengths = seqlen.input_lengths + input_lengths = seqlen.input_lengths + seqlen.cache_lengths from vllm._C import ops out = torch.empty_like(query) @@ -135,8 +119,8 @@ def paged_attention( ops.paged_attention_v1( out, query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, kv_head_mapping, softmax_scale, block_tables, @@ -168,8 +152,8 @@ def paged_attention( max_logits, tmp_output, query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, kv_head_mapping, softmax_scale, block_tables, @@ -216,60 +200,69 @@ def paged_attention( ) from e +if ATTENTION == "flashdecoding" and not V2: + raise ValueError("Flash decoding requires Flash Attention V2") + SUPPORTS_WINDOWING = V2 -if ATTENTION == "flashinfer": - - def attention( - q: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale, - window_size_left=-1, - causal=True, - softcap=0.0, - ): + +def attention( + *, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: KVCache, + kv_scales: KVScales, + seqlen: Seqlen, + block_tables: torch.Tensor, + softmax_scale: float, + window_size_left: int = -1, + causal: bool = True, + softcap: Optional[float] = None, +): + can_scale = kv_cache.can_scale(kv_scales) + + if ATTENTION == "flashinfer": from text_generation_server.layers.attention.flashinfer import ( prefill_with_paged_kv_state, ) + if softcap is None: + softcap = 0.0 + return prefill_with_paged_kv_state.get().forward( - q.contiguous(), + # TODO: remove `contiguous` call once https://github.com/flashinfer-ai/flashinfer/pull/553 is merged. + query.contiguous(), causal=causal, - paged_kv_cache=(key_cache, value_cache), + paged_kv_cache=(kv_cache.key, kv_cache.value), logits_soft_cap=softcap, sm_scale=softmax_scale, window_left=window_size_left, + k_scale=kv_scales.key_scale_cpu if can_scale else 1.0, + v_scale=kv_scales.value_scale_cpu if can_scale else 1.0, ) -elif V2: - - def attention( - q, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale, - window_size_left=-1, - causal=True, - softcap=0.0, - ): - out = torch.empty_like(q) + # If we are using flashdecoding or paged, we always use flash-attn for + # the prefill. We have to branch on whether we use flash-attn v1 or v2. + elif V2: + out = torch.empty_like(query) if window_size_left <= 0 and window_size_left != -1: raise ValueError("`window_size_left` must be > 0 or -1") + + if softcap is None: + softcap = 0.0 + return flash_attn_2_cuda.varlen_fwd( - q, - key_cache, - value_cache, + query, + # flashdecoding: pass the KV caches, paged: pass the KV. 
+ kv_cache.key if ATTENTION == "flashdecoding" else key, + kv_cache.value if ATTENTION == "flashdecoding" else value, out, seqlen.cu_seqlen_q, seqlen.cu_seqlen_k, None, None, - block_tables, + block_tables if ATTENTION == "flashdecoding" else None, None, seqlen.max_q, seqlen.max_k, @@ -284,57 +277,45 @@ def attention( None, )[0] -else: - - def attention( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale: float, - window_size_left: int = -1, - causal: bool = True, - softcap=None, - ): + else: if window_size_left != -1: raise NotImplementedError( "window_size_left is only available with flash attn v2" ) if softcap is not None: - raise NotImplementedError("softcap is only available with flash attn v2") + raise NotImplementedError("softcap is not available in flash attn v1") # Flash attention v1 requires q, k and v to have the same number of heads - if k.shape[1] != q.shape[1]: + if key.shape[1] != query.shape[1]: # MQA expand - if k.shape[1] == 1: - k = k.expand(-1, q.shape[1], -1) + if key.shape[1] == 1: + key = key.expand(-1, query.shape[1], -1) # Grouped attention reshape else: - original_shape = k.shape - k = ( - k.unsqueeze(2) - .expand(-1, -1, q.shape[1] // k.shape[1], -1) + original_shape = key.shape + key = ( + key.unsqueeze(2) + .expand(-1, -1, query.shape[1] // key.shape[1], -1) .reshape(original_shape[0], -1, original_shape[2]) ) - if v.shape[1] != q.shape[1]: + if value.shape[1] != query.shape[1]: # MQA expand - if v.shape[1] == 1: - v = v.expand(-1, q.shape[1], -1) + if value.shape[1] == 1: + value = value.expand(-1, query.shape[1], -1) # Grouped attention reshape else: - original_shape = v.shape - v = ( - v.unsqueeze(2) - .expand(-1, -1, q.shape[1] // v.shape[1], -1) + original_shape = value.shape + value = ( + value.unsqueeze(2) + .expand(-1, -1, query.shape[1] // value.shape[1], -1) .reshape(original_shape[0], -1, original_shape[2]) ) - out = torch.empty_like(q) + out = torch.empty_like(query) flash_attn_cuda.fwd( - q, - k, - v, + query, + key, + value, out, seqlen.cu_seqlen_q, seqlen.cu_seqlen_q, @@ -351,15 +332,8 @@ def attention( return out -# Prefill in the cache with every kind of attention, unless we -# have a configuration that requires flash-attention v1, which -# does not support block tables. 
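Note on the calling convention introduced above: `attention` and `paged_attention` now take the layer's `KVCache` plus keyword-only `KVScales` instead of raw key/value cache tensors, and `Seqlen` tracks `cache_lengths` alongside `input_lengths`, building `cu_seqlen_k` as the cumulative sum of their total. A minimal sketch of a call under the new signature; the shapes and lengths are invented, and `kv_cache`/`block_tables` are assumed to have been built elsewhere, so this shows the call shape rather than a runnable end-to-end path:

    import torch
    from text_generation_server.layers.attention import Seqlen, attention
    from text_generation_server.layers.attention.kv_cache import KVScales

    # Two sequences with 5 and 3 new tokens (8 total), 32 query heads,
    # 8 KV heads, head size 128 -- all made-up shapes.
    query = torch.randn(8, 32, 128, device="cuda", dtype=torch.float16)
    key = torch.randn(8, 8, 128, device="cuda", dtype=torch.float16)
    value = torch.randn(8, 8, 128, device="cuda", dtype=torch.float16)

    # Seqlen carries both the new tokens and the tokens already in the cache;
    # cu_seqlen_k is computed internally as cumsum(input_lengths + cache_lengths) -> [0, 15, 18].
    seqlen = Seqlen(
        input_lengths=torch.tensor([5, 3], device="cuda", dtype=torch.int32),
        cache_lengths=torch.tensor([10, 0], device="cuda", dtype=torch.int32),
        cu_seqlen_q=torch.tensor([0, 5, 8], device="cuda", dtype=torch.int32),
        max_q=5,
        max_k=15,
    )

    # Unit scales mean "unscaled"; KVCache.can_scale() then returns False and the
    # FP8 scaling branches are skipped.
    kv_scales = KVScales(
        key_scale=torch.tensor(1.0, device="cuda"),
        value_scale=torch.tensor(1.0, device="cuda"),
    )

    attn_output = attention(
        query=query,
        key=key,
        value=value,
        kv_cache=kv_cache,          # KVCache constructed for this layer elsewhere (assumption)
        kv_scales=kv_scales,
        seqlen=seqlen,
        block_tables=block_tables,  # assumed to exist for the flashdecoding/paged layouts
        softmax_scale=128 ** -0.5,
    )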
-PREFILL_IN_KV_CACHE = ATTENTION != "paged" or V2 - __all__ = [ - "PREFILL_IN_KV_CACHE", "SUPPORTS_WINDOWING", "attention", "paged_attention", - "reshape_and_cache", ] diff --git a/server/text_generation_server/layers/attention/flash_attn_triton.py b/server/text_generation_server/layers/attention/flash_attn_triton.py index 3a6f9a730ce..fd180f0f19a 100644 --- a/server/text_generation_server/layers/attention/flash_attn_triton.py +++ b/server/text_generation_server/layers/attention/flash_attn_triton.py @@ -699,7 +699,6 @@ def check_args( class _attention(torch.autograd.Function): - @staticmethod def forward( ctx, diff --git a/server/text_generation_server/layers/attention/flashinfer.py b/server/text_generation_server/layers/attention/flashinfer.py index d603c6f5f03..26a72d9be71 100644 --- a/server/text_generation_server/layers/attention/flashinfer.py +++ b/server/text_generation_server/layers/attention/flashinfer.py @@ -204,6 +204,7 @@ def use_decode_state( num_kv_heads: int, head_size: int, page_size: int, + kv_cache_dtype: torch.dtype, dtype: torch.dtype, window_left: int, ): @@ -240,7 +241,7 @@ def use_decode_state( num_kv_heads=num_kv_heads, head_dim=head_size, page_size=page_size, - data_type=dtype, + data_type=kv_cache_dtype, q_data_type=dtype, window_left=window_left, ) diff --git a/server/text_generation_server/layers/attention/ipex.py b/server/text_generation_server/layers/attention/ipex.py index 131c9bb0644..677f3f5647d 100644 --- a/server/text_generation_server/layers/attention/ipex.py +++ b/server/text_generation_server/layers/attention/ipex.py @@ -1,31 +1,37 @@ import intel_extension_for_pytorch as ipex import torch +from text_generation_server.layers.attention.kv_cache import KVCache, KVScales from text_generation_server.models.flash_causal_lm import BLOCK_SIZE from text_generation_server.layers.attention import Seqlen from typing import Optional SUPPORTS_WINDOWING = False -PREFILL_IN_KV_CACHE = False def attention( - q: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, + *, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: KVCache, + kv_scales: KVScales, seqlen: Seqlen, block_tables: torch.Tensor, - softmax_scale, - window_size_left=-1, - causal=True, + softmax_scale: float, + window_size_left: int = -1, + causal: bool = True, softcap: Optional[float] = None, ): - out = torch.empty_like(q) + if softcap is not None: + raise NotImplementedError("softcap is not available in IPEX") + + out = torch.empty_like(query) # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. 
ipex.llm.functional.varlen_attention( - q.contiguous() if q.device.type == "xpu" else q, - key_cache.contiguous() if key_cache.device.type == "xpu" else key_cache, - value_cache.contiguous() if value_cache.device.type == "xpu" else value_cache, + query.contiguous() if query.device.type == "xpu" else query, + key.contiguous() if key.device.type == "xpu" else key, + value.contiguous() if value.device.type == "xpu" else value, out, seqlen.cu_seqlen_q, seqlen.cu_seqlen_q, @@ -42,39 +48,32 @@ def attention( return out -def reshape_and_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slots: torch.Tensor, -): - ipex.llm.modules.PagedAttention.reshape_and_cache( - key, value, key_cache, value_cache, slots - ) - - def paged_attention( query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, + kv_cache: KVCache, kv_head_mapping: torch.Tensor, softmax_scale: float, block_tables: torch.Tensor, seqlen: Seqlen, max_s: int, + *, + kv_scales: KVScales, softcap: Optional[float] = None, ): + if softcap is not None: + raise NotImplementedError("softcap is not available in IPEX") + out = torch.empty_like(query) + input_lengths = seqlen.input_lengths + seqlen.cache_lengths ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( out, query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, kv_head_mapping, softmax_scale, block_tables, - seqlen.input_lengths, + input_lengths, BLOCK_SIZE, max_s, None, @@ -83,9 +82,7 @@ def paged_attention( __all__ = [ - "PREFILL_IN_KV_CACHE", "SUPPORTS_WINDOWING", "attention", "paged_attention", - "reshape_and_cache", ] diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py index 3960c954985..9d739da5ee1 100644 --- a/server/text_generation_server/layers/attention/kv_cache.py +++ b/server/text_generation_server/layers/attention/kv_cache.py @@ -1,9 +1,38 @@ from typing import Tuple +from dataclasses import dataclass, field +from loguru import logger import torch + +from text_generation_server.layers.fp8 import fp8_quantize from text_generation_server.models.globals import ATTENTION, BLOCK_SIZE from text_generation_server.utils.import_utils import SYSTEM -from text_generation_server.layers.attention import reshape_and_cache +from text_generation_server.utils.log import log_once +from text_generation_server.utils.weights import Weights + + +@dataclass +class KVScales: + """ + Key-value scales for FP8 KV cache. + + This data class stores key and value scales both as a GPU tensor and + as a GPU float. This inconvenience is necessary because some functions + (e.g. scaling kernels) take scales as a GPU tensor, whereas others + (e.g. flashinfer) take scales as a CPU scalar. 
+ """ + + key_scale: torch.Tensor + value_scale: torch.Tensor + key_scale_cpu: float = field(init=False) + value_scale_cpu: float = field(init=False) + + def __post_init__(self): + if self.key_scale.numel() != 1 or self.value_scale.numel() != 1: + raise ValueError("Key and value scales must be scalar tensors.") + + self.key_scale_cpu = self.key_scale.item() + self.value_scale_cpu = self.value_scale.item() class KVCache: @@ -24,11 +53,11 @@ def __init__( ): """Construct the key-value cache for a layer.""" - if dtype == torch.float8_e5m2 and ( + if dtype in {torch.float8_e5m2, torch.float8_e4m3fn} and ( ATTENTION != "flashinfer" or SYSTEM != "cuda" ): raise ValueError( - "float8_e5m2 KV cache is currently only supported for flashinfer on CUDA" + "FP8 KV cache is currently only supported for flashinfer on CUDA" ) element_size = torch.tensor([], dtype=dtype).element_size() @@ -77,6 +106,33 @@ def __init__( ), ) + def can_scale(self, kv_scales: KVScales) -> bool: + """Check if the cache can be scaled by the given scales.""" + if kv_scales.key_scale_cpu == 1.0 and kv_scales.value_scale_cpu == 1.0: + return False + elif ( + self.dtype == torch.float8_e4m3fn + and ATTENTION == "flashinfer" + and SYSTEM == "cuda" + ): + log_once( + logger.info, + "Using FP8 KV cache scales", + ) + return True + else: + # We have scales, but not the correct FP8 cache type, so warn once. + log_once( + logger.info, + "Ignoring FP8 KV cache scales, only float8_e4m3fn KV cache on flashinfer is supported", + ) + return False + + @property + def dtype(self): + """Get the data type of the cache.""" + return self.kv_cache[0].dtype + @property def key(self): """Get the key cache.""" @@ -95,18 +151,34 @@ def store( key: torch.Tensor, value: torch.Tensor, slots: torch.Tensor, + kv_scales: KVScales, ): """Store the key and value at the given slots.""" key_cache = self.kv_cache[0] value_cache = self.kv_cache[1] + if self.can_scale(kv_scales): + if kv_scales.key_scale_cpu != 1.0: + key = fp8_quantize( + key.float(), + scale=kv_scales.key_scale, + qdtype=self.dtype, + scalar=True, + )[0] + if kv_scales.value_scale_cpu != 1.0: + value = fp8_quantize( + value.float(), + scale=kv_scales.value_scale, + qdtype=self.dtype, + scalar=True, + )[0] + if ATTENTION in {"flashdecoding", "flashinfer"}: - # TODO: add scale key = key.to(key_cache.dtype) value = value.to(value_cache.dtype) - if key_cache.dtype == torch.float8_e5m2: - # Torch index_put does not support float8_e5m2 yet, so + if key_cache.dtype in {torch.float8_e4m3fn, torch.float8_e5m2}: + # Torch index_put does not support float8_{e5m2,e4m3fn} yet, so # put as raw data instead. key_cache = key_cache.view(torch.uint8) value_cache = value_cache.view(torch.uint8) @@ -116,4 +188,59 @@ def store( key_cache.view(-1, shape[-2], shape[-1])[slots] = key value_cache.view(-1, shape[-2], shape[-1])[slots] = value else: - reshape_and_cache(key, value, key_cache, value_cache, slots) + paged_reshape_and_cache(key, value, key_cache, value_cache, slots) + + +def paged_reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slots: torch.Tensor, +): + if SYSTEM == "cuda": + try: + from vllm._C import cache_ops + except Exception as e: + raise ImportError( + f"Could not import vllm paged attention. Make sure your installation is correct. 
Complete error: {e}" + ) + cache_ops.reshape_and_cache( + key, value, key_cache, value_cache, slots, "auto", 1.0 + ) + elif SYSTEM == "rocm": + try: + import vllm._custom_ops as ops + except Exception as e: + raise ImportError( + f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}" + ) + ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0) + elif SYSTEM == "ipex": + import intel_extension_for_pytorch as ipex + + ipex.llm.modules.PagedAttention.reshape_and_cache( + key, value, key_cache, value_cache, slots + ) + else: + raise NotImplementedError( + f"Cannot reshape and cache for paged attention, system '{SYSTEM}' not supported" + ) + + +def get_kv_scales(weights: Weights, prefix: str) -> KVScales: + """Load KV cache scales.""" + + key_scale = torch.tensor(1.0, dtype=torch.float32, device=weights.device) + value_scale = key_scale + if weights.has_tensor(f"{prefix}.k_scale") and weights.has_tensor( + f"{prefix}.v_scale" + ): + key_scale = weights.get_tensor(f"{prefix}.k_scale", to_dtype=False).float() + value_scale = weights.get_tensor(f"{prefix}.v_scale", to_dtype=False).float() + elif weights.has_tensor(f"{prefix}.kv_scale"): + # Fall back to older more coarse-grained scale when available. + key_scale = weights.get_tensor(f"{prefix}.kv_scale").float() + value_scale = key_scale + + return KVScales(key_scale=key_scale, value_scale=value_scale) diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py index 01d4685a7ad..ea11c2c2615 100644 --- a/server/text_generation_server/layers/attention/rocm.py +++ b/server/text_generation_server/layers/attention/rocm.py @@ -1,8 +1,8 @@ import os from typing import Optional import torch +from text_generation_server.layers.attention.kv_cache import KVCache, KVScales from text_generation_server.utils.import_utils import SYSTEM -from text_generation_server.models.globals import ATTENTION from text_generation_server.layers.attention import Seqlen from text_generation_server.utils.log import log_master from loguru import logger @@ -16,8 +16,6 @@ use_triton = os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() in {"true", "1"} ENGINE = "triton" if use_triton else "ck" -PREFILL_IN_KV_CACHE = False - use_rocm_custom_paged_attn = os.getenv("ROCM_USE_CUSTOM_PAGED_ATTN", "1") != "0" try: if use_rocm_custom_paged_attn: @@ -29,38 +27,17 @@ ) use_rocm_custom_paged_attn = False -try: - import vllm._custom_ops as ops -except Exception as e: - raise ImportError( - f"Could not import vllm paged attention. Make sure your installation is correct. 
Complete error: {e}" - ) - - -def reshape_and_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slots: torch.Tensor, -): - if ATTENTION == "flashdecoding": - shape = key_cache.shape - key_cache.view(-1, shape[-2], shape[-1])[slots] = key - value_cache.view(-1, shape[-2], shape[-1])[slots] = value - else: - ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0) - def paged_attention( query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, + kv_cache: KVCache, kv_head_mapping: torch.Tensor, softmax_scale: float, block_tables: torch.Tensor, seqlen: Seqlen, max_s: int, + *, + kv_scales: KVScales, softcap: Optional[float] = None, ): # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py @@ -84,10 +61,10 @@ def paged_attention( raise RuntimeError("Paged attention doesn't support softcapping") # value_cache => [num_blocks, num_heads, head_size, block_size] - block_size = value_cache.shape[3] + block_size = kv_cache.value.shape[3] num_seqs, num_heads, head_size = query.shape - num_kv_heads = key_cache.shape[1] + num_kv_heads = kv_cache.key.shape[1] gqa_ratio = num_heads // num_kv_heads use_custom = ( use_rocm_custom_paged_attn @@ -104,7 +81,7 @@ def paged_attention( _PARTITION_SIZE = _PARTITION_SIZE_CUSTOM max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE - input_lengths = seqlen.input_lengths + input_lengths = seqlen.input_lengths + seqlen.cache_lengths out = torch.empty_like(query) @@ -124,8 +101,8 @@ def paged_attention( ops.paged_attention_v1( out, query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, kv_head_mapping, softmax_scale, block_tables, @@ -158,8 +135,8 @@ def paged_attention( max_logits, tmp_output, query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, kv_head_mapping, softmax_scale, block_tables, @@ -177,8 +154,8 @@ def paged_attention( max_logits, tmp_output, query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, num_kv_heads, softmax_scale, block_tables, @@ -227,29 +204,36 @@ def paged_attention( SUPPORTS_WINDOWING = False -if ENGINE == "ck": - - def attention( - q, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale: float, - window_size_left: int = -1, - causal: bool = True, - softcap: float = 0.0, - ): + + +def attention( + *, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: KVCache, + kv_scales: KVScales, + seqlen: Seqlen, + block_tables: torch.Tensor, + softmax_scale: float, + window_size_left: int = -1, + causal: bool = True, + softcap: Optional[float] = None, +): + if ENGINE == "ck": if window_size_left <= 0 and window_size_left != -1: raise ValueError("`window_size_left` must be > 0 or -1") - out = torch.empty_like(q) + out = torch.empty_like(query) + + if softcap is None: + softcap = 0.0 # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. 
return flash_attn_2_cuda.varlen_fwd( - q, - key_cache, - value_cache, + query, + key, + value, out, seqlen.cu_seqlen_q, seqlen.cu_seqlen_q, @@ -270,30 +254,19 @@ def attention( None, )[0] -elif ENGINE == "triton": - from .flash_attn_triton import triton_attention - - def attention( - q, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale: float, - window_size_left: int = -1, - causal: bool = True, - softcap: Optional[float] = None, - ): + elif ENGINE == "triton": + from .flash_attn_triton import triton_attention + if softcap is not None: raise NotImplementedError("softcap is only available with CK flash attn") - out = torch.empty_like(q) + out = torch.empty_like(query) # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. output, _ = triton_attention( - q, - key_cache, - value_cache, + query, + key, + value, out, seqlen.cu_seqlen_q, seqlen.cu_seqlen_q, @@ -304,13 +277,12 @@ def attention( ) return output -else: - raise RuntimeError(f"Unknown attention engine {ENGINE}") + else: + raise RuntimeError(f"Unknown attention engine {ENGINE}") + __all__ = [ - "PREFILL_IN_KV_CACHE", "SUPPORTS_WINDOWING", "attention", "paged_attention", - "reshape_and_cache", ] diff --git a/server/text_generation_server/layers/awq/quantize/__init__.py b/server/text_generation_server/layers/awq/quantize/__init__.py new file mode 100644 index 00000000000..3e72881bb2b --- /dev/null +++ b/server/text_generation_server/layers/awq/quantize/__init__.py @@ -0,0 +1,8 @@ +from text_generation_server.utils.import_utils import SYSTEM + +if SYSTEM == "ipex": + from .ipex import WQLinear +elif SYSTEM == "cuda": + from .cuda import WQLinear + +__all__ = ["WQLinear"] diff --git a/server/text_generation_server/layers/awq/quantize/qmodule.py b/server/text_generation_server/layers/awq/quantize/cuda.py similarity index 100% rename from server/text_generation_server/layers/awq/quantize/qmodule.py rename to server/text_generation_server/layers/awq/quantize/cuda.py diff --git a/server/text_generation_server/layers/awq/quantize/ipex.py b/server/text_generation_server/layers/awq/quantize/ipex.py new file mode 100644 index 00000000000..84cd7a2190d --- /dev/null +++ b/server/text_generation_server/layers/awq/quantize/ipex.py @@ -0,0 +1,48 @@ +from typing import Optional +import torch +import torch.nn as nn +import intel_extension_for_pytorch as ipex + + +class WQLinear(nn.Module): + def __init__( + self, w_bit, group_size, qweight, qzeros, scales, bias: Optional[torch.Tensor] + ): + super().__init__() + + if w_bit not in [4]: + raise NotImplementedError("Only 4-bit are supported for now.") + + self.in_features = qweight.shape[0] + self.out_features = qweight.shape[1] * 32 // w_bit + + self.w_bit = w_bit + self.group_size = group_size if group_size != -1 else self.in_features + # quick sanity check (make sure aligment) + assert self.in_features % self.group_size == 0 + assert self.out_features % (32 // self.w_bit) == 0 + + self.qweight = qweight + self.qzeros = qzeros + self.scales = scales + self.bias = bias + self.woq_linear = ( + ipex.llm.quantization.IPEXWeightOnlyQuantizedLinear.from_weight( + self.qweight, + self.scales, + self.qzeros, + self.in_features, + self.out_features, + bias=self.bias, + group_size=self.group_size, + quant_method=ipex.llm.quantization.QuantMethod.AWQ_GEMM, + dtype=ipex.llm.quantization.QuantDtype.INT4, + ) + ) + + @torch.no_grad() + def forward(self, x): + out_shape = x.shape[:-1] + 
(self.out_features,) + out = self.woq_linear(x.reshape(-1, x.shape[-1])) + out = out + self.bias if self.bias is not None else out + return out.reshape(out_shape) diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py index 61dd5115157..a58c7f7b223 100644 --- a/server/text_generation_server/layers/fp8.py +++ b/server/text_generation_server/layers/fp8.py @@ -1,7 +1,7 @@ import torch from dataclasses import dataclass -from typing import Optional, Union, List +from typing import Optional, Tuple, Union, List from loguru import logger from text_generation_server.utils.import_utils import SYSTEM @@ -26,6 +26,12 @@ def is_fbgemm_gpu_available(): return False +try: + import marlin_kernels +except ImportError: + marlin_kernels = None + + if is_fbgemm_gpu_available(): if SYSTEM == "cuda": major, _ = torch.cuda.get_device_capability() @@ -51,27 +57,82 @@ def get_fp8_linear() -> torch.nn.Module: return Fp8Linear +def normalize_e4m3fn_to_e4m3fnuz( + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_scale: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + assert weight.dtype == torch.float8_e4m3fn + # The bits pattern 10000000(-128) represents zero in e4m3fn + # but NaN in e4m3fnuz. So here we set it to 0. + # https://onnx.ai/onnx/technical/float8.html + weight_as_int8 = weight.view(torch.int8) + ROCM_FP8_NAN_AS_INT = -128 + weight_as_int8[weight_as_int8 == ROCM_FP8_NAN_AS_INT] = 0 + weight = weight_as_int8.view(torch.float8_e4m3fnuz) + + # For the same bits representation, e4m3fnuz value is half of + # the e4m3fn value, so we should double the scaling factor to + # get the same dequantized value. + # https://onnx.ai/onnx/technical/float8.html + weight_scale = weight_scale * 2.0 + if input_scale is not None: + input_scale = input_scale * 2.0 + return weight, weight_scale, input_scale + + def fp8_quantize( - weight, scale_upper_bound=None, qdtype=torch.float8_e4m3fn, scalar=False + weight: torch.Tensor, + scale: Optional[torch.Tensor] = None, + scale_upper_bound: Optional[torch.Tensor] = None, + qdtype: torch.dtype = torch.float8_e4m3fn, + scalar: bool = False, ): - if FBGEMM_DYN_AVAILABLE and not scalar: + """ + This function returns a reciprocal of the scale, so that a tensor can be unscaled + by multiplying it with the returned scale. If a scale is given through the `scale` + argument, it must also be a reciprocal (so that scales from an FP8 checkpoint can + be used without modification). 
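A tiny numeric illustration of the reciprocal-scale contract spelled out in the `fp8_quantize` docstring above, following the pure-PyTorch fallback path (values invented for clarity):

    import torch

    finfo = torch.finfo(torch.float8_e4m3fn)                # finfo.max == 448.0
    weight = torch.tensor([[0.5, -2.0, 4.0]])
    multiplier = finfo.max / weight.abs().max()             # 448 / 4 = 112
    qweight = (weight * multiplier).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    scale = multiplier.reciprocal()                         # this reciprocal is what fp8_quantize returns
    dequantized = qweight.to(torch.float32) * scale         # back to [[0.5, -2.0, 4.0]]
    # Because the returned scale is already the reciprocal, a scale read from an FP8
    # checkpoint can be passed back in via `fp8_quantize(weight, scale=scale)` unchanged.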
+ """ + if FBGEMM_DYN_AVAILABLE and not scalar and not scale: qweight, scale = torch.ops.fbgemm.quantize_fp8_per_row( weight, bs=None, scale_ub=scale_upper_bound, output_dtype=qdtype ) return qweight, scale + if marlin_kernels is not None: + shape = weight.shape + qweight, scale = marlin_kernels.scaled_fp8_quant( + weight.reshape(-1, shape[-1]), + dtype=qdtype, + scale=scale, + scale_ub=scale_upper_bound, + ) + + return qweight.reshape(shape), scale + # weight, scale = quant_weights(weight, torch.int8, False) finfo = torch.finfo(qdtype) - # Calculate the scale as dtype max divided by absmax - scale = finfo.max / weight.abs().max().clamp(min=1e-12, max=scale_upper_bound) - # scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) + + if scale is None: + # Calculate the scale as dtype max divided by absmax + scale = finfo.max / weight.abs().max().clamp(min=1e-12, max=scale_upper_bound) + # scale and clamp the tensor to bring it to + # the representative range of float8 data type + # (as default cast is unsaturated) + qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) + scale = scale.float().reciprocal() + else: + # Use reciprocal to avoid more expensive division. + qweight = (weight * scale.reciprocal()).clamp(min=finfo.min, max=finfo.max) + # Return both float8 data and the inverse scale (as float), # as both required as inputs to torch._scaled_mm qweight = qweight.to(qdtype) - scale = scale.float().reciprocal() + + if SYSTEM == "rocm": + qweight, scale, _ = normalize_e4m3fn_to_e4m3fnuz(qweight, scale) + return qweight, scale @@ -92,9 +153,17 @@ def get_weights(self, weights: "Weights", prefix: str): .reshape(-1) .expand(w.shape[0]) ) + + input_scale = None + if weights.has_tensor(f"{prefix}.input_scale"): + input_scale = weights.get_tensor( + f"{prefix}.input_scale", to_dtype=False + ).reshape(-1) + return Fp8Weight( weight=w, weight_scale=scale, + input_scale=input_scale, activation_scale_ub=self.activation_scale_ub, dtype=weights.dtype, ) @@ -125,9 +194,24 @@ def get_weights_col_packed( ) scale = scale.reshape(-1).expand(w.shape[0]) + input_scale = None + if weights.has_tensor(f"{prefix}.input_scale"): + input_scale = weights.get_tensor( + f"{prefix}.input_scale", to_dtype=False + ) + if input_scale.numel() > 1: + input_scale = weights.get_packed_sharded( + f"{prefix}.input_scale", + dim=0, + block_sizes=block_sizes, + to_dtype=False, + ) + input_scale = input_scale.reshape(-1).max() + return Fp8Weight( weight=w, weight_scale=scale, + input_scale=input_scale, activation_scale_ub=self.activation_scale_ub, dtype=weights.dtype, ) @@ -154,9 +238,22 @@ def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: in ] scale = torch.cat(scale, dim=0).reshape(-1) + input_scale = [ + _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape) + for p, shape in zip(prefixes, shapes) + if weights.has_tensor(f"{p}.input_scale") + ] + assert len(input_scale) == 0 or len(input_scale) == len(prefixes) + input_scale = ( + torch.cat(input_scale, dim=0).reshape(-1).max() + if len(input_scale) != 0 + else None + ) + return Fp8Weight( weight=w, weight_scale=scale, + input_scale=input_scale, activation_scale_ub=self.activation_scale_ub, dtype=weights.dtype, ) @@ -174,9 +271,16 @@ def get_weights_row(self, weights: "Weights", prefix: str): .reshape(-1) .expand(w.shape[0]) ) + input_scale = None + if 
weights.has_tensor(f"{prefix}.input_scale"): + input_scale = weights.get_tensor( + f"{prefix}.input_scale", to_dtype=False + ).reshape(-1) + return Fp8Weight( weight=w, weight_scale=scale, + input_scale=input_scale, activation_scale_ub=self.activation_scale_ub, dtype=weights.dtype, ) @@ -191,6 +295,7 @@ class Fp8Weight(Weight): weight: torch.Tensor dtype: torch.dtype weight_scale: Optional[torch.Tensor] = None + input_scale: Optional[torch.Tensor] = None activation_scale_ub: Optional[float] = None def get_linear(self, bias: torch.Tensor): @@ -200,33 +305,50 @@ def get_linear(self, bias: torch.Tensor): # memory. Can be non-contiguous when we e.g. expand from scalars. self.weight_scale = self.weight_scale.contiguous() return get_fp8_linear().from_fp8( - self.weight, self.weight_scale, self.activation_scale_ub, bias, self.dtype + weight=self.weight, + scale=self.weight_scale, + dtype=self.dtype, + bias=bias, + input_scale=self.input_scale, + scale_upper_bound=self.activation_scale_ub, ) class Fp8Linear(torch.nn.Module): + _device_identity_cache = {} + def __init__( self, - qweight, - scale, - scale_upper_bound, - bias, - dtype, + qweight: torch.Tensor, + scale: torch.Tensor, + dtype: torch.dtype, + bias: Optional[torch.Tensor] = None, + input_scale: Optional[torch.Tensor] = None, + scale_upper_bound: Optional[float] = None, ) -> None: super().__init__() if FBGEMM_MM_AVAILABLE: log_once(logger.info, "Using FBGEMM fp8 optimized kernels") + if SYSTEM == "rocm" and qweight.dtype == torch.float8_e4m3fn: + qweight, scale, _ = normalize_e4m3fn_to_e4m3fnuz( + weight=qweight, weight_scale=scale + ) self.dtype = dtype self.qweight = qweight - self.scale = scale - self.scale_upper_bound = ( - torch.tensor( - [scale_upper_bound], dtype=torch.float32, device=qweight.device + self.scale = scale.float() + self.input_scale = input_scale.float() if input_scale is not None else None + + if FBGEMM_MM_AVAILABLE: + self.scale_upper_bound = ( + torch.tensor( + [scale_upper_bound], dtype=torch.float32, device=qweight.device + ) + if scale_upper_bound is not None + else None ) - if scale_upper_bound is not None - else None - ) + else: + self.scale_upper_bound = scale_upper_bound self.bias = bias if bias is not None else None @@ -234,22 +356,46 @@ def __init__( def from_unquant(cls, weight, bias, dtype): qweight, scale = fp8_quantize(weight, scalar=not FBGEMM_MM_AVAILABLE) return cls( - qweight=qweight, scale=scale, scale_upper_bound=None, bias=bias, dtype=dtype + qweight=qweight, + scale=scale, + dtype=dtype, + bias=bias, + input_scale=None, + scale_upper_bound=None, ) @classmethod - def from_fp8(cls, weight, scale, input_scale, bias, dtype): + def from_fp8( + cls, + weight: torch.Tensor, + scale: torch.Tensor, + dtype: torch.dtype, + bias: Optional[torch.Tensor] = None, + **kwargs, + ) -> "Fp8Linear": + input_scale = kwargs.get("input_scale", None) + scale_upper_bound = kwargs.get("scale_upper_bound", None) + if FBGEMM_DYN_AVAILABLE: # fbgemm needs float32 scales. scale = scale.float() return cls( qweight=weight, scale=scale, - scale_upper_bound=input_scale, + input_scale=input_scale, + scale_upper_bound=scale_upper_bound, bias=bias, dtype=dtype, ) + @classmethod + def get_shared_device_identity(cls, device): + # Input scaling factors are no longer optional in _scaled_mm starting + # from pytorch 2.5. 
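The ROCm handling above leans on a numeric property of the two FP8 formats; a small check of the claim behind `normalize_e4m3fn_to_e4m3fnuz` (illustrative, and assumes a PyTorch build that ships both float8 dtypes):

import torch

# Same bit pattern, half the value: e4m3fnuz uses a larger exponent bias than
# e4m3fn, so reinterpreting the bits halves the magnitude, and the (inverse)
# scale must be doubled to compensate, as normalize_e4m3fn_to_e4m3fnuz does.
x = torch.tensor([1.0, -2.0, 0.5], dtype=torch.float32).to(torch.float8_e4m3fn)
y = x.view(torch.float8_e4m3fnuz)

print(x.float())   # ->  1.0, -2.0,  0.5
print(y.float())   # ->  0.5, -1.0,  0.25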
Allocating a dummy tensor to pass as input_scale + if device not in cls._device_identity_cache: + cls._device_identity_cache[device] = torch.ones(1, device=device) + return cls._device_identity_cache[device] + def forward(self, input: torch.Tensor) -> torch.Tensor: if FBGEMM_MM_AVAILABLE: qinput, scale = fp8_quantize( @@ -266,15 +412,49 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: ) return y.to(self.dtype) - qinput, scale = fp8_quantize(input, scalar=True) - output, _ = torch._scaled_mm( - qinput, - self.qweight.t(), - out_dtype=self.dtype, - scale_a=scale, - scale_b=self.scale, - bias=self.bias, + qinput, scale = fp8_quantize( + input, + self.input_scale, + scale_upper_bound=self.scale_upper_bound, + scalar=True, ) + + per_tensor_weights = self.scale.numel() == 1 + per_tensor_activations = scale.numel() == 1 + + if SYSTEM != "rocm" or (per_tensor_weights and per_tensor_activations): + output = torch._scaled_mm( + qinput, + self.qweight.t(), + out_dtype=self.dtype, + scale_a=scale, + scale_b=self.scale, + bias=self.bias, + ) + + if isinstance(output, tuple) and len(output) == 2: + output = output[0] + else: + device_identity = None + if SYSTEM == "rocm": + device_identity = self.get_shared_device_identity(self.qweight.device) + + output = torch._scaled_mm( + qinput, + self.qweight.t(), + scale_a=device_identity, + scale_b=device_identity, + out_dtype=torch.float32, + ) + if isinstance(output, tuple) and len(output) == 2: + output = output[0] + + output = output * scale * self.scale.t() + if self.bias is not None: + output = output + self.bias + + output = output.to(dtype=self.dtype) + return output diff --git a/server/text_generation_server/layers/gptq/__init__.py b/server/text_generation_server/layers/gptq/__init__.py index 505caa59a0d..7e838035214 100644 --- a/server/text_generation_server/layers/gptq/__init__.py +++ b/server/text_generation_server/layers/gptq/__init__.py @@ -8,6 +8,11 @@ from text_generation_server.utils.log import log_once from text_generation_server.utils.weights import Weight, Weights, WeightsLoader +if SYSTEM == "ipex": + from .ipex import QuantLinear +elif SYSTEM in {"cuda", "rocm"}: + from .triton import QuantLinear + @dataclass class GPTQWeight(Weight): @@ -36,7 +41,7 @@ def get_linear(self, bias: torch.Tensor): "to use Exllama/GPTQ kernels for AWQ inference." ) try: - from text_generation_server.layers.awq.quantize.qmodule import WQLinear + from text_generation_server.layers.awq.quantize import WQLinear return WQLinear( w_bit=self.bits, @@ -60,8 +65,6 @@ def get_linear(self, bias: torch.Tensor): return ExllamaQuantLinear(self, bias) else: - from text_generation_server.layers.gptq.quant_linear import QuantLinear - return QuantLinear( self.qweight, self.qzeros, @@ -298,6 +301,7 @@ def get_weights_row(self, weights: Weights, prefix: str): self._get_gptq_params(weights) use_exllama = True + desc_act = self.desc_act if self.bits != 4: use_exllama = False @@ -321,7 +325,8 @@ def get_weights_row(self, weights: Weights, prefix: str): if g_idx is not None: if ( not torch.equal( - g_idx.cpu(), + # Remove g_idx[0] to adapt the check with TP>1. 
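Why rebasing by `g_idx[0]` makes the act-order check shard-safe under tensor parallelism — a toy illustration with made-up sizes:

import torch

# With TP > 1 each rank loads a slice of g_idx along dim 0. For a checkpoint
# without act-order the slice is still monotonically increasing but starts at
# an arbitrary group index, so comparing against [i // groupsize ...] only
# works after subtracting the first element.
groupsize = 128
in_features = 1024
full_g_idx = torch.tensor([i // groupsize for i in range(in_features)], dtype=torch.int32)

rank, world_size = 1, 2
shard = full_g_idx.chunk(world_size)[rank]        # this rank's slice
expected = torch.tensor([i // groupsize for i in range(shard.shape[0])], dtype=torch.int32)

assert not torch.equal(shard, expected)           # raw slice starts at group 4
assert torch.equal(shard - shard[0], expected)    # rebased slice passes the check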
+ (g_idx - g_idx[0]).cpu(), torch.tensor( [i // self.groupsize for i in range(g_idx.shape[0])], dtype=torch.int32, @@ -332,6 +337,7 @@ def get_weights_row(self, weights: Weights, prefix: str): # Exllama implementation does not support row tensor parallelism with act-order, as # it would require to reorder input activations that are split unto several GPUs use_exllama = False + desc_act = True from text_generation_server.layers.gptq import ( CAN_EXLLAMA, @@ -350,16 +356,16 @@ def get_weights_row(self, weights: Weights, prefix: str): else: log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}") - if use_exllama and self.groupsize != -1: + if not desc_act and self.groupsize != -1: qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0) scales = weights.get_sharded(f"{prefix}.scales", dim=0) + if g_idx is not None: + # qzeros, scales sharded, and g_idx must be adjusted accordingly + g_idx = g_idx - g_idx[0] else: qzeros = weights.get_tensor(f"{prefix}.qzeros") scales = weights.get_tensor(f"{prefix}.scales") - if use_exllama and g_idx is not None: - g_idx = g_idx - g_idx[0] - if self.quantize == "gptq" and self.quant_method == "awq": log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." @@ -392,7 +398,7 @@ def get_weights_row(self, weights: Weights, prefix: str): ) def _get_gptq_params(self, weights: Weights): - if weights._has_tensor("gptq_bits") and weights._has_tensor("gptq_groupsize"): + if weights.has_tensor("gptq_bits") and weights.has_tensor("gptq_groupsize"): self.bits = weights.get_tensor("gptq_bits").item() self.groupsize = weights.get_tensor("gptq_groupsize").item() self.desc_act = False @@ -400,7 +406,7 @@ def _get_gptq_params(self, weights: Weights): # before the `gptq_sym` setting tensor was added. self.sym = ( weights.get_tensor("gptq_sym").item() - if weights._has_tensor("gptq_sym") + if weights.has_tensor("gptq_sym") else False ) self.quant_method = "gptq" diff --git a/server/text_generation_server/layers/gptq/ipex.py b/server/text_generation_server/layers/gptq/ipex.py new file mode 100644 index 00000000000..ab9c9e24752 --- /dev/null +++ b/server/text_generation_server/layers/gptq/ipex.py @@ -0,0 +1,126 @@ +import math +import numpy as np +import torch +import torch.nn as nn + +import intel_extension_for_pytorch as ipex + + +class QuantLinear(nn.Module): + def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize): + super().__init__() + self.register_buffer("qweight", qweight) + self.register_buffer("qzeros", qzeros) + self.register_buffer("scales", scales) + self.register_buffer("g_idx", g_idx) + if bias is not None: + self.register_buffer("bias", bias) + else: + self.bias = None + if bits not in [4]: + raise NotImplementedError("Only 4 bits are supported.") + self.bits = bits + self.maxq = 2**self.bits - 1 + self.groupsize = groupsize + + self.outfeatures = qweight.shape[1] + self.infeatures = qweight.shape[0] * 32 // bits + self.woq_linear = ( + ipex.llm.quantization.IPEXWeightOnlyQuantizedLinear.from_weight( + self.qweight, + self.scales, + self.qzeros, + self.infeatures, + self.outfeatures, + bias=self.bias, + group_size=self.groupsize, + g_idx=g_idx, + quant_method=ipex.llm.quantization.QuantMethod.GPTQ_GEMM, + dtype=ipex.llm.quantization.QuantDtype.INT4, + ) + ) + + @classmethod + def new(cls, bits, groupsize, infeatures, outfeatures, bias): + if bits not in [4]: + raise NotImplementedError("Only 4 bits are supported.") + + qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32) + qzeros = 
torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures // 32 * bits), + dtype=torch.int32, + ) + scales = torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16 + ) + g_idx = torch.tensor( + [i // groupsize for i in range(infeatures)], dtype=torch.int32 + ) + if bias: + bias = torch.zeros((outfeatures), dtype=torch.float16) + else: + bias = None + return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize) + + def pack(self, linear, scales, zeros, g_idx=None): + self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx + + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + self.scales = scales.clone().half() + if linear.bias is not None: + self.bias = linear.bias.clone().half() + + intweight = [] + for idx in range(self.infeatures): + intweight.append( + torch.round( + (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]]) + / self.scales[self.g_idx[idx]] + ).to(torch.int)[:, None] + ) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros( + (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 + ) + i = 0 + row = 0 + while row < qweight.shape[0]: + if self.bits in [4]: + for j in range(i, i + (32 // self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - i)) + i += 32 // self.bits + row += 1 + else: + raise NotImplementedError("Only 4 bits are supported.") + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + zeros -= 1 + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros( + (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 + ) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [4]: + for j in range(i, i + (32 // self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += 32 // self.bits + col += 1 + else: + raise NotImplementedError("Only 4 bits are supported.") + + qzeros = qzeros.astype(np.int32) + self.qzeros = torch.from_numpy(qzeros) + + def forward(self, x): + out_shape = x.shape[:-1] + (self.outfeatures,) + out = self.woq_linear(x.reshape(-1, x.shape[-1])) + out = out + self.bias if self.bias is not None else out + return out.reshape(out_shape) diff --git a/server/text_generation_server/layers/gptq/quantize.py b/server/text_generation_server/layers/gptq/quantize.py index b0086ea08b3..66fc15ec0e3 100644 --- a/server/text_generation_server/layers/gptq/quantize.py +++ b/server/text_generation_server/layers/gptq/quantize.py @@ -12,7 +12,7 @@ from accelerate import init_empty_weights from text_generation_server.utils import initialize_torch_distributed, Weights from text_generation_server.utils.hub import weight_files -from text_generation_server.layers.gptq.quant_linear import QuantLinear +from text_generation_server.layers.gptq import QuantLinear from loguru import logger from typing import Optional from text_generation_server.layers.gptq.utils import torch_snr_error diff --git a/server/text_generation_server/layers/gptq/quant_linear.py b/server/text_generation_server/layers/gptq/triton.py similarity index 100% rename from server/text_generation_server/layers/gptq/quant_linear.py rename to server/text_generation_server/layers/gptq/triton.py diff --git a/server/text_generation_server/layers/marlin/fp8.py b/server/text_generation_server/layers/marlin/fp8.py index fe55a58a33e..49f5c480f3f 100644 --- a/server/text_generation_server/layers/marlin/fp8.py +++ 
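A toy version of the nibble packing done in `QuantLinear.pack` above (pure Python, values are made up):

# With bits = 4, eight quantized values are OR-ed into one 32-bit word, lowest
# index in the lowest bits, matching `qweight[row] |= intweight[j] << (bits * (j - i))`.
bits = 4
values = [1, 2, 3, 4, 5, 6, 7, 8]          # already-quantized 4-bit integers
word = 0
for j, v in enumerate(values):
    word |= v << (bits * j)
print(hex(word))                            # 0x87654321

# Unpacking recovers the values in order:
print([(word >> (bits * j)) & 0xF for j in range(8)])   # [1, 2, 3, 4, 5, 6, 7, 8]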
b/server/text_generation_server/layers/marlin/fp8.py @@ -62,7 +62,14 @@ def from_unquant(cls, weight, bias, dtype): return cls(qweight=qweight, scales=scales.to(dtype), bias=bias) @classmethod - def from_fp8(cls, weight, scale, _input_scale, bias, dtype): + def from_fp8( + cls, + weight: torch.Tensor, + scale: torch.Tensor, + bias: torch.Tensor, + dtype: torch.dtype, + **kwargs, + ): return cls(qweight=weight, scales=scale.to(dtype), bias=bias) def forward(self, A: torch.Tensor) -> torch.Tensor: diff --git a/server/text_generation_server/layers/marlin/gptq.py b/server/text_generation_server/layers/marlin/gptq.py index 7245431f3ab..47341c0f0eb 100644 --- a/server/text_generation_server/layers/marlin/gptq.py +++ b/server/text_generation_server/layers/marlin/gptq.py @@ -231,7 +231,7 @@ def get_weights_row(self, weights: Weights, prefix: str): ) def _get_gptq_params(self, weights: Weights): - if weights._has_tensor("gptq_bits") and weights._has_tensor("gptq_groupsize"): + if weights.has_tensor("gptq_bits") and weights.has_tensor("gptq_groupsize"): self.bits = weights.get_tensor("gptq_bits").item() self.groupsize = weights.get_tensor("gptq_groupsize").item() self.desc_act = False @@ -239,7 +239,7 @@ def _get_gptq_params(self, weights: Weights): # before the `gptq_sym` setting tensor was added. self.sym = ( weights.get_tensor("gptq_sym").item() - if weights._has_tensor("gptq_sym") + if weights.has_tensor("gptq_sym") else False ) self.quant_method = "gptq" diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 17eed976467..d30154083f5 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -195,6 +195,11 @@ class ModelType(enum.Enum): "name": "Phi 3", "url": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", } + GRANITE = { + "type": "granite", + "name": "Granite", + "url": "https://huggingface.co/ibm-granite/granite-3.0-8b-instruct", + } GEMMA = { "type": "gemma", "name": "Gemma", @@ -357,17 +362,32 @@ def get_model( compression_config = config_dict.get("compression_config", None) if quantization_config is not None and quantize is None: method = quantization_config.get("quant_method", None) + config_groups = quantization_config.get("config_groups", None) if method in {"gptq", "awq", "exl2"}: log_master(logger.info, f"Auto selecting quantization method {method}") quantize = method - elif method == "fbgemm_fp8": + elif method == "fbgemm_fp8" or method == "fp8": log_master(logger.info, "Auto selecting quantization method fp8") quantize = "fp8" + elif config_groups is not None: + # TODO: at some point we should probably fully parse the compression + # configuration to know which parameters are compressed. + for _, group in config_groups.items(): + weights_config = group.get("weights") + if weights_config is not None: + if ( + weights_config["type"] == "float" + and weights_config["num_bits"] == 8 + ): + log_master( + logger.info, "Auto selecting quantization method fp8" + ) + quantize = "fp8" + break else: log_master(logger.warning, f"Unknown quantization method {method}") elif compression_config is not None: - # TODO: at some point we should probably fully parse the compression - # configuration to know which parameters are compressed. + # `compression_config` renamed to `quantization_config`; support retained for backward compatibility. 
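For reference, a hedged sketch of the kind of `quantization_config` the new `config_groups` branch above matches; the dict below is illustrative, not copied from a specific checkpoint:

# Any quantization_config carrying config_groups with an 8-bit float weights
# entry now auto-selects quantize="fp8" in get_model.
quantization_config = {
    "config_groups": {
        "group_0": {
            "weights": {"type": "float", "num_bits": 8},
        },
    },
}

quantize = None
for _, group in quantization_config["config_groups"].items():
    weights_config = group.get("weights")
    if weights_config is not None:
        if weights_config["type"] == "float" and weights_config["num_bits"] == 8:
            quantize = "fp8"
            break
assert quantize == "fp8"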
config_groups = compression_config.get("config_groups") if config_groups is not None: for _, group in config_groups.items(): @@ -385,8 +405,11 @@ def get_model( if dtype is None: if quantize in ["awq", "exl2", "gptq", "marlin"]: - # These quantizers only work with float16 params. - dtype = torch.float16 + if SYSTEM == "ipex" and not hasattr(torch, "xpu"): + dtype = torch.bfloat16 + else: + # These quantizers only work with float16 params. + dtype = torch.float16 elif quantize == "fp8": from text_generation_server.layers.fp8 import FBGEMM_DYN_AVAILABLE @@ -406,6 +429,8 @@ def get_model( if kv_cache_dtype is None: kv_cache_dtype = dtype + elif kv_cache_dtype == "fp8_e4m3fn": + kv_cache_dtype = torch.float8_e4m3fn elif kv_cache_dtype == "fp8_e5m2": kv_cache_dtype = torch.float8_e5m2 else: @@ -842,7 +867,12 @@ def get_model( trust_remote_code=trust_remote_code, ) - elif model_type == LLAMA or model_type == BAICHUAN or model_type == PHI3: + elif ( + model_type == LLAMA + or model_type == BAICHUAN + or model_type == PHI3 + or model_type == GRANITE + ): if FLASH_ATTENTION: return FlashCausalLM( model_id=model_id, @@ -856,7 +886,9 @@ def get_model( lora_adapter_ids=lora_adapter_ids, ) elif sharded: - raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Llama")) + raise NotImplementedError( + FLASH_ATT_ERROR_MESSAGE.format(f"Sharded {model_type}") + ) else: return CausalLM.fallback( model_id, diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index de2c065160b..bd8176be9ed 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -76,6 +76,7 @@ def to_pb(self) -> generate_pb2.CachedBatch: request_ids=[r.id for r in self.requests], size=len(self), max_tokens=self.max_tokens, + current_tokens=len(self.input_ids), ) @classmethod @@ -619,18 +620,11 @@ def fallback( model_id, revision=revision, torch_dtype=dtype, - device_map=( - "auto" - if device_count > 1 - else None - ), + device_map=("auto" if device_count > 1 else None), load_in_8bit=quantize == "bitsandbytes", trust_remote_code=trust_remote_code, ) - if ( - device_count == 1 - and quantize != "bitsandbytes" - ): + if device_count == 1 and quantize != "bitsandbytes": model = model.to(device) if tokenizer.pad_token_id is None: diff --git a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py index d0425fec11f..68719106fca 100644 --- a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py @@ -30,6 +30,7 @@ attention, Seqlen, ) +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.layers import ( TensorParallelRowLinear, @@ -38,7 +39,6 @@ SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE from text_generation_server.layers.layernorm import ( FastLayerNorm, ) @@ -228,6 +228,7 @@ def __init__( ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.use_qk_norm = config.use_qk_norm if self.use_qk_norm: @@ -290,30 +291,37 @@ def forward( self.rotary_emb(query, key, cos, sin) - kv_cache.store(key=key, value=value, slots=slots) + kv_cache.store( + key=key, + value=value, + 
slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else key, - kv_cache.value if PREFILL_IN_KV_CACHE else value, - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj( diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py index b2b0cecbcca..f70bff4f881 100644 --- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py @@ -20,6 +20,7 @@ from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple, Any +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.utils.import_utils import SYSTEM if SYSTEM != "ipex": @@ -29,7 +30,6 @@ paged_attention, attention, Seqlen, - PREFILL_IN_KV_CACHE, ) from text_generation_server.layers import ( FastLinear, @@ -289,6 +289,7 @@ def __init__( ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = TensorParallelRowLinear.load( config, @@ -329,30 +330,37 @@ def forward( self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - kv_cache.store(key=kv[:, 0], value=kv[:, 1], slots=slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py index af77af8e139..906a83a4151 100644 --- a/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py @@ -34,7 +34,7 @@ attention, paged_attention, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import KVCache, get_kv_scales from text_generation_server.layers.layernorm import FastRMSNorm from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer from text_generation_server.layers.rotary import PositionRotaryEmbedding, get_mscale @@ -231,6 +231,8 @@ def 
__init__( ), ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") + self.kv_a_layernorm = FastRMSNorm.load( prefix=f"{prefix}.kv_a_layernorm", weights=weights, eps=config.rms_norm_eps ) @@ -259,7 +261,7 @@ def forward( cos: torch.Tensor, sin: torch.Tensor, cu_seqlen_prefill: torch.Tensor, - kv_cache: Tuple[torch.Tensor, torch.Tensor], + kv_cache: KVCache, block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, @@ -320,30 +322,37 @@ def forward( value, (0, self.head_pad_size - self.value_head_size), value=0 ) - kv_cache.store(key=key, value=value, slots=slots) + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else key, - kv_cache.value if PREFILL_IN_KV_CACHE else value, - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) # Remove padding. diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py index 03b9b2a0201..ebf1b80eb03 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py @@ -39,7 +39,7 @@ TensorParallelMultiAdapterLinear, TensorParallelAdapterRowLinear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, @@ -207,6 +207,7 @@ def __init__( ], process_group=weights.process_group, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") o_proj = TensorParallelRowLinear.load( config, @@ -252,19 +253,25 @@ def forward( self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - kv_cache.store(key=kv[:, 0], value=kv[:, 1], slots=slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, - causal=self.causal, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, window_size_left=self.window_size, softcap=self.softcap, ) @@ -272,14 +279,14 @@ def forward( else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, softcap=self.softcap, + kv_scales=self.kv_scales, ) return self.o_proj( diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py index f3c469012d5..ad3be80e51d 100644 --- 
a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py @@ -29,7 +29,6 @@ paged_attention, attention, Seqlen, - PREFILL_IN_KV_CACHE, ) from text_generation_server.layers import ( TensorParallelRowLinear, @@ -38,6 +37,7 @@ SpeculativeHead, get_linear, ) +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, @@ -186,6 +186,7 @@ def __init__(self, prefix: str, config, weights, causal: bool): ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = TensorParallelRowLinear.load( config, @@ -223,31 +224,38 @@ def forward( self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - kv_cache.store(key=kv[:, 0], value=kv[:, 1], slots=slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, causal=self.causal, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py index 94a8898d0a8..906b34c12b2 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py @@ -24,7 +24,6 @@ from torch import nn from transformers.activations import ACT2FN from typing import Optional, List, Tuple -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE from text_generation_server.layers.attention import ( paged_attention, attention, @@ -37,6 +36,7 @@ SpeculativeHead, get_linear, ) +from text_generation_server.layers.attention.kv_cache import get_kv_scales def load_qkv(config, prefix: str, weights, head_size, num_heads): @@ -194,6 +194,7 @@ def __init__( head_size=self.head_size, num_heads=self.num_heads, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = load_row( config, @@ -223,30 +224,37 @@ def forward( key = key.view(-1, self.num_heads, self.head_size) value = value.view(-1, self.num_heads, self.head_size) - kv_cache.store(key=key, value=value, slots=slots) + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else key, - kv_cache.value if PREFILL_IN_KV_CACHE else value, - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: 
attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py index f0a1270e0e9..692f8ca31be 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py @@ -24,6 +24,7 @@ from torch import nn from transformers.activations import ACT2FN from typing import Optional, List, Tuple +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.layers.attention import ( paged_attention, @@ -37,7 +38,6 @@ SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE from text_generation_server.layers.rotary import ( PositionRotaryEmbedding, ) @@ -139,6 +139,7 @@ def __init__( prefix=prefix, weights=weights, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = load_row( config, @@ -185,30 +186,37 @@ def forward( else: self.rotary_emb(query, key, cos, sin) - kv_cache.store(key=key, value=value, slots=slots) + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else key, - kv_cache.value if PREFILL_IN_KV_CACHE else value, - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index fbe45d79873..b26dd484942 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -27,7 +27,10 @@ from torch import nn from transformers.activations import ACT2FN -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE, KVCache +from text_generation_server.layers.attention import ( + KVCache, + get_kv_scales, +) from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.layers.attention import ( @@ -156,7 +159,10 @@ def __init__( device=weights.device, ) - self.softmax_scale = self.head_size**-0.5 + # `config.attention_multiplier` is used in Granite + self.softmax_scale = getattr( + config, "attention_multiplier", self.head_size**-0.5 + ) if self.num_heads % weights.process_group.size() != 0: raise ValueError( @@ -176,11 +182,13 @@ def __init__( self.query_key_value = load_attention(config, prefix, weights, index) self.index = index + self.kv_scales = get_kv_scales(weights, 
f"{prefix}") + o_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.o_proj", weights=weights, - bias=False, + bias=getattr(config, "attention_bias", False), ) self.o_proj = TensorParallelAdapterRowLinear.load( @@ -221,30 +229,37 @@ def forward( self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - kv_cache.store(key=kv[:, 0], value=kv[:, 1], slots=slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_scales=self.kv_scales, + kv_cache=kv_cache, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj( @@ -436,6 +451,11 @@ def __init__(self, index, prefix, config, weights): eps=config.rms_norm_eps, ) + # Used in Granite + # This could eventually be baked into the weights like we do for the embeddings/lm_head + # but this would mean modifying the lora code + self.residual_multiplier = getattr(config, "residual_multiplier", None) + def forward( self, hidden_states, @@ -466,13 +486,16 @@ def forward( max_s, adapter_data, ) + if self.residual_multiplier is not None: + attn_output *= self.residual_multiplier - # faster post attention rms norm normed_attn_res_output, attn_res = self.post_attention_layernorm( attn_output, res ) mlp_output = self.dense(normed_attn_res_output, adapter_data) + if self.residual_multiplier is not None: + mlp_output *= self.residual_multiplier return mlp_output, attn_res @@ -624,6 +647,11 @@ def __init__(self, prefix: str, config, weights): else: suffix = "lm_head" + # Used in Granite + embedding_multiplier = getattr(config, "embedding_multiplier", None) + if embedding_multiplier is not None: + self.embed_tokens.weight.data *= embedding_multiplier + with no_fp8(weights): self.lm_head = SpeculativeHead.load( config, @@ -631,6 +659,16 @@ def __init__(self, prefix: str, config, weights): weights=weights, ) + # Used in Granite + self.logits_scaling = getattr(config, "logits_scaling", None) + if self.logits_scaling is not None and self.lm_head.head is not None: + try: + # Scale the weights directly + self.lm_head.head.linear.weight.data /= self.logits_scaling + self.logits_scaled = True + except Exception: + self.logits_scaled = False + def forward( self, input_ids: torch.Tensor, @@ -664,4 +702,11 @@ def forward( if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] logits, speculative_logits = self.lm_head(hidden_states) + + # Used in Granite + if self.logits_scaling is not None and not self.logits_scaled: + logits /= self.logits_scaling + if speculative_logits is not None: + speculative_logits /= self.logits_scaling + return logits, speculative_logits diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py index 8974035eafc..c66c732f21d 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py @@ 
-26,6 +26,7 @@ from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.layers.attention import ( paged_attention, @@ -40,7 +41,6 @@ TensorParallelMultiAdapterLinear, TensorParallelAdapterRowLinear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, @@ -159,6 +159,7 @@ def __init__(self, prefix: str, config, weights, layer_id): ], process_group=weights.process_group, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") o_proj = TensorParallelRowLinear.load( config, @@ -209,31 +210,38 @@ def forward( else: kv_to_cache = kv - kv_cache.store(key=kv_to_cache[:, 0], value=kv_to_cache[:, 1], slots=slots) + kv_cache.store( + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, window_size_left=self.max_past, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj( diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py index e7bc83208b0..a45dd1e615e 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py @@ -38,7 +38,7 @@ attention, paged_attention, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import FastRMSNorm from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer from text_generation_server.layers.rotary import PositionRotaryEmbedding @@ -214,6 +214,7 @@ def __init__( ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = TensorParallelRowLinear.load( config, @@ -257,31 +258,38 @@ def forward( else: kv_to_cache = kv - kv_cache.store(key=kv_to_cache[:, 0], value=kv_to_cache[:, 1], slots=slots) + kv_cache.store( + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, 
window_size_left=self.max_past, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index bcbea442683..2301b63cff2 100644 --- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -38,7 +38,7 @@ SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import ( FastLayerNorm, ) @@ -131,6 +131,7 @@ def __init__(self, config, prefix, weights): head_size=self.head_size, hidden_size=self.hidden_size, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.dense = load_row( config, prefix=f"{prefix}.dense", weights=weights, bias=True ) @@ -164,30 +165,37 @@ def forward( qkv[:, 0] = torch.cat((query_rot, query_pass), dim=-1) qkv[:, 1] = torch.cat((key_rot, key_pass), dim=-1) - kv_cache.store(key=qkv[:, 1], value=qkv[:, 2], slots=slots) + kv_cache.store( + key=qkv[:, 1], + value=qkv[:, 2], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - qkv[:, 0], - kv_cache.key if PREFILL_IN_KV_CACHE else qkv[:, 1], - kv_cache.value if PREFILL_IN_KV_CACHE else qkv[:, 2], - seqlen, - block_tables, - self.softmax_scale, + query=qkv[:, 0], + key=qkv[:, 1], + value=qkv[:, 2], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( qkv[:, 0], - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.dense(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py index cb7b6ee2e18..7382a7cb9fc 100644 --- a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py @@ -18,7 +18,7 @@ SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import ( FastLayerNorm, ) @@ -138,6 +138,7 @@ def __init__( ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") # in llama the dense layer is called "o_proj" and has bias=False self.dense = TensorParallelRowLinear.load( @@ -187,29 +188,36 @@ def forward( ) # Reshape key and value and cache - kv_cache.store(key=kv[:, 0], value=kv[:, 1], slots=slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, 1], - 
seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_scales=self.kv_scales, + kv_cache=kv_cache, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.dense(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index 8185885fe7d..ab2a177db6a 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -16,7 +16,7 @@ TensorParallelEmbedding, SpeculativeHead, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, @@ -85,6 +85,8 @@ def __init__( self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") + self.o_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.o_proj", @@ -127,31 +129,38 @@ def forward( else: kv_to_cache = kv - kv_cache.store(key=kv_to_cache[:, 0], value=kv_to_cache[:, 1], slots=slots) + kv_cache.store( + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, window_size_left=self.max_past, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py index dac8ecf9507..2dcd1bf3033 100644 --- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py @@ -12,7 +12,7 @@ TensorParallelRowLinear, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import FastLayerNorm from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.attention import ( @@ -159,6 +159,7 @@ def __init__( weights=weights, bias=config.bias, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.dense = load_row( config, prefix=f"{prefix}.dense", weights=weights, bias=config.bias ) @@ -199,30 +200,37 @@ def forward( # Inplace rotary self.rotary_emb(query, 
torch.select(kv, dim=1, index=0), cos, sin) - kv_cache.store(key=kv[:, 0], value=kv[:, 1], slots=slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.dense(attn_output.view(-1, self.num_heads * self.head_size)) @@ -277,6 +285,7 @@ def __init__( weights=weights, bias=config.bias, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.dense = load_row( config, prefix=f"{prefix}.dense", weights=weights, bias=config.bias ) @@ -312,31 +321,36 @@ def forward( self.rotary_emb(query, torch.select(kv, dim=2, index=0), cos, sin) kv_cache.store( - key=kv[:, :, 0].contiguous(), value=kv[:, :, 1].contiguous(), slots=slots + key=kv[:, :, 0].contiguous(), + value=kv[:, :, 1].contiguous(), + slots=slots, + kv_scales=self.kv_scales, ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, :, 0].contiguous(), - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, :, 1].contiguous(), - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, :, 0], + value=kv[:, :, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.dense( diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index 5972d436bd3..ed053eb661c 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -17,7 +17,7 @@ TensorParallelEmbedding, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.gptq import GPTQWeightsLoader from text_generation_server.layers.layernorm import ( FastLayerNorm, @@ -258,6 +258,7 @@ def __init__(self, prefix, config, weights): self.c_proj = load_row( config, prefix=f"{prefix}.c_proj", weights=weights, bias=True ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.kv_head_mapping = torch.zeros( self.num_heads, dtype=torch.int32, device=weights.device ) @@ -283,30 +284,37 @@ def forward( query = query.view(-1, self.num_heads, self.head_size) key_value = key_value.view(-1, 2, 1, self.head_size) - kv_cache.store(key=key_value[:, 0], value=key_value[:, 1], slots=slots) + kv_cache.store( + key=key_value[:, 0], + value=key_value[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention 
attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else key_value[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else key_value[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key_value[:, 0], + value=key_value[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.c_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py index 037238b8a20..c793982d828 100644 --- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py @@ -38,7 +38,7 @@ SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import ( FastLayerNorm, FastRMSNorm, @@ -189,6 +189,7 @@ def __init__( ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = TensorParallelRowLinear.load( config, @@ -232,31 +233,38 @@ def forward( else: kv_to_cache = kv - kv_cache.store(key=kv_to_cache[:, 0], value=kv_to_cache[:, 1], slots=slots) + kv_cache.store( + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, window_size_left=self.max_past, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/mllama.py b/server/text_generation_server/models/custom_modeling/mllama.py index 6e091a74fe8..be0a4b5d7c3 100644 --- a/server/text_generation_server/models/custom_modeling/mllama.py +++ b/server/text_generation_server/models/custom_modeling/mllama.py @@ -493,9 +493,14 @@ def forward( aspect_ratio_ids: torch.Tensor, attention_mask: torch.Tensor, ) -> torch.Tensor: - batch_size, num_concurrent_media, num_tiles, num_channels, height, width = ( - pixel_values.shape - ) + ( + batch_size, + num_concurrent_media, + num_tiles, + num_channels, + height, + width, + ) = pixel_values.shape pixel_values = pixel_values.reshape( batch_size * num_concurrent_media * num_tiles, num_channels, height, width diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 33fe30a87cd..b931671cc0c 100644 --- 
a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -16,7 +16,17 @@ AutoTokenizer, GenerationConfig, ) -from typing import Any, ContextManager, Iterable, Optional, Tuple, List, Type, Dict +from typing import ( + Any, + ContextManager, + Iterable, + Optional, + Tuple, + List, + Type, + Dict, + Union, +) from text_generation_server.adapters import AdapterBatchData, AdapterBatchMetadata from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE @@ -24,6 +34,10 @@ from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.models import Model from text_generation_server.utils.log import log_master +from text_generation_server.utils.prefill_chunking import ( + get_support_chunking, + get_max_prefill_tokens, +) from text_generation_server.utils.tokens import batch_top_tokens from text_generation_server.utils.speculate import get_speculate from text_generation_server.utils import ( @@ -60,7 +74,6 @@ tracer = trace.get_tracer(__name__) - # Will be set in init SLIDING_WINDOW: Optional[int] = None @@ -117,45 +130,48 @@ class FlashCausalLMBatch(Batch): requests_idx_mapping: Dict[int, int] # Decoder values - input_ids: torch.Tensor - position_ids: torch.Tensor + # Can be a list for easy filtering + # If `input_ids` is a list, it needs to be materialized to a tensor first + input_ids: Union[torch.Tensor, List[List[int]]] + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + position_ids: Optional[torch.Tensor] speculative_ids: Optional[torch.Tensor] - # Flash Attention values - - # tensor of length b containing the cumulative sequence lengths of the sequences in the batch, only used in prefill - cu_seqlen_prefill: Optional[torch.Tensor] - # Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers - # as we only keep SLIDING_WINDOW values instead of the whole tensor - prefill_cache_indices: Optional[torch.Tensor] - - # Paged Attention values - # Set when creating the batch - # CPU tensor of length b indicating the start of each sequence in slots - start_slots: torch.Tensor # tensor of indices of the currently used slots, length = \sum_{i=0}^{b} s_i in prefill, length = b in decode - slot_indices: torch.Tensor + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + slot_indices: Optional[torch.Tensor] # list of length b of list of length s_i // block_size block_tables: List[List[int]] # tensor of size [b, max_total_seqlen // block_size] holding the paged attention block tables for all sequences block_tables_tensor: torch.Tensor # tensor of length \sum_{i=0}^{b} max_s_i holding the paged attention slots for all sequences - slots: torch.Tensor - # size [b], containing the number of blocks that can be retrieved from the cache - prefix_lens: List[int] - prefix_lens_tensor: torch.Tensor + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + slots: Optional[torch.Tensor] - max_seqlen: int + max_input_length: int + max_current_length: int + + # Whether this batch contains at least one request that is prefilling + prefilling: bool + # Whether each request is prefilling + prefilling_mask: List[bool] # Prefill metadata tensors to efficiently compute logprobs + # tensor of length b containing the cumulative sequence lengths of the sequences in the batch, only used in prefill + cu_seqlen_prefill: Optional[torch.Tensor] + 
# Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers + # as we only keep SLIDING_WINDOW values instead of the whole tensor + prefill_cache_indices: Optional[torch.Tensor] + # Will be set by `generate_token` and reset after each prefill forward prefill_head_indices: Optional[torch.Tensor] + # Will be set by `generate_token` and reset after each prefill forward prefill_next_token_indices: Optional[torch.tensor] + # Will be set by `generate_token` and reset after each prefill forward prefill_cu_outlens: Optional[List[int]] - - # Prefixes - prefix_ids: List[List[int]] + # Will be set by `generate_token` and reset after each prefill forward + prefill_logprob_tokens: List[Optional[Tokens]] # All tokens all_input_ids: List[List[int]] @@ -163,7 +179,14 @@ class FlashCausalLMBatch(Batch): # Lengths of all generations present in the batch input_lengths: List[int] - input_lengths_tensor: torch.Tensor + # size [b], containing the number of blocks that can be retrieved from the cache + cache_lengths: List[int] + prompt_lengths: List[int] + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + input_lengths_tensor: Optional[torch.Tensor] + cache_lengths_tensor: Optional[torch.Tensor] + prompt_lengths_tensor: torch.Tensor + prefix_offsets: List[Optional[int]] read_offsets: List[Optional[int]] @@ -174,7 +197,8 @@ class FlashCausalLMBatch(Batch): top_n_tokens_tensor: torch.Tensor # Adapter metadata for each request - adapter_meta: AdapterBatchMetadata + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + adapter_meta: Optional[AdapterBatchMetadata] # Number of blocks in this batch num_blocks: int @@ -187,6 +211,11 @@ def to_pb(self) -> generate_pb2.CachedBatch: request_ids=[r.id for r in self.requests], size=len(self), max_tokens=self.num_blocks * BLOCK_SIZE, + current_tokens=( + sum([len(i) for i in self.input_ids]) + if isinstance(self.input_ids, list) + else len(self.input_ids) + ), ) @classmethod @@ -218,46 +247,28 @@ def from_tokenized( dtype: torch.dtype, device: torch.device, ) -> "FlashCausalLMBatch": - sliding_window = get_sliding_windows() - position_ids = [] - cu_seqlen_prefill = [0] - start_slots = [] - slot_indices = [] - prefill_cache_indices = [] + speculate = get_speculate() + cache_lengths = [] input_lengths = [] + prompt_lengths = [] prefix_offsets = [] read_offsets = [] all_input_ids = [] - prefix_ids = [] + all_postfix_ids = [] requests_idx_mapping = {} - all_prefill_logprobs = True - no_prefill_logprobs = True - prefill_head_indices = [] - prefill_next_token_indices = [] - prefill_cu_outlens = [0] - next_token_chooser_parameters = [] stopping_criterias = [] top_n_tokens = [] - adapter_indices_list = [] - adapter_set = set() - - # Cumulative length - cumulative_length = 0 - cumulative_slot_tokens = 0 - prefill_out_cumulative_length = 0 - num_blocks = 0 - max_seqlen = 0 + max_input_length = 0 + max_current_length = 0 max_length = 0 max_blocks = 0 block_tables = [] - slots = [] - prefix_lens = [] # Parse batch for i, (r, tokenized_input) in enumerate( @@ -266,38 +277,47 @@ def from_tokenized( # request id -> idx in list mapping requests_idx_mapping[r.id] = i - orig_input_length = len(tokenized_input) + prompt_length = len(tokenized_input) + prompt_lengths.append(prompt_length) + + cache_length = r.cache_len - prefix_len = r.prefix_len assert ( - prefix_len <= orig_input_length - ), f"Prefix {prefix_len} vs input {orig_input_length}" - 
if prefix_len == orig_input_length: - assert prefix_len > 0 - prefix_len -= 1 - - # Commented as it's costly. - # log_master(logger.debug, "Tokenized input ids {tokenized_input}") - prefix_ids.append(tokenized_input[:prefix_len]) - tokenized_input = tokenized_input[prefix_len:] - - input_length = len(tokenized_input) + cache_length <= prompt_length ), f"Prefix {cache_length} vs input {prompt_length}" + if cache_length == prompt_length: + assert False, "unreachable" + + # `chunk_len` is an optional field in the protobuf + # It is only set if the model supports chunking + if r.HasField("chunk_len"): + input_length = r.chunk_len + + if cache_length + input_length < prompt_length: + # FIXME: speculate is not supported for context chunking at the moment + assert speculate == 0 + assert get_support_chunking() + assert input_length > 0 + + postfix_ids = tokenized_input[ cache_length : cache_length + input_length ] + assert ( + len(postfix_ids) == input_length + ), "Rust and Python tokenizers are not aligned" + else: + # Use all the remaining ids + postfix_ids = tokenized_input[cache_length:] + input_length = len(postfix_ids) + input_lengths.append(input_length) - prefix_offsets.append(input_length - 5) - read_offsets.append(input_length) + prefix_offsets.append(prompt_length - 5) + read_offsets.append(prompt_length) + all_postfix_ids.append(postfix_ids) all_input_ids.append(tokenized_input) - # Position ids - request_position_ids = torch.arange( - prefix_len, orig_input_length, dtype=torch.int32 - ) - position_ids.append(request_position_ids) - - # Add cumulative lengths of all previous inputs - cu_seqlen_prefill.append(cumulative_length + input_length) - next_token_chooser_parameters.append(r.parameters) stopping_criteria = StoppingCriteria.from_pb( @@ -307,22 +327,13 @@ def from_tokenized( stopping_criterias.append(stopping_criteria) top_n_tokens.append(r.top_n_tokens) - ADAPTER_TO_INDEX = get_adapter_to_index() - adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0) - adapter_indices_list.append(torch.full((input_length,), adapter_index)) - adapter_set.add(adapter_index) - # Paged attention # Remove one as the first token does not have a past speculative_length = get_speculate() speculative_length = 0 if speculative_length is None else speculative_length # Tokens that need to be mapped to blocks. - block_tokens = orig_input_length + max_new_tokens - 1 + speculative_length - - # Tokens that need to be mapped to slots. We don't need slots for the - # cached prefix (if present).
- slot_tokens = input_length + max_new_tokens - 1 + speculative_length + block_tokens = prompt_length + max_new_tokens - 1 + speculative_length # blocks and slots can be empty (for example in warmup) if not r.blocks: @@ -330,77 +341,26 @@ def from_tokenized( request_blocks = [ b for b in range(num_blocks, num_blocks + needed_blocks) ] - request_slots = [ - s - for b in request_blocks - for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE) - ] else: request_blocks = r.blocks - request_slots = r.slots[ - prefix_len: #: orig_input_length + max_new_tokens + speculative_length - ] block_tables.append(request_blocks) - slots.extend(request_slots) - prefix_lens.append(prefix_len) + cache_lengths.append(cache_length) num_blocks += len(request_blocks) - start_slots.append(cumulative_slot_tokens) - - request_slot_indices = torch.arange( - cumulative_slot_tokens, - cumulative_slot_tokens + input_length, - dtype=torch.int64, - ) - slot_indices.append(request_slot_indices) - - # Create tensor to slice into the kv tensor in prefill - if sliding_window is not None: - request_prefill_cache_indices = torch.arange( - cumulative_length + max(0, input_length - sliding_window), - cumulative_length + input_length, - dtype=torch.int64, - ) - prefill_cache_indices.append(request_prefill_cache_indices) - - all_prefill_logprobs = all_prefill_logprobs and r.prefill_logprobs - no_prefill_logprobs = no_prefill_logprobs and not r.prefill_logprobs - - if r.prefill_logprobs: - prefill_head_indices.append(request_position_ids + cumulative_length) - prefill_next_token_indices.append( - prefill_out_cumulative_length + input_length - 1 - ) - prefill_cu_outlens.append(prefill_out_cumulative_length + input_length) - prefill_out_cumulative_length += input_length - else: - prefill_head_indices.append( - torch.tensor( - [cumulative_length + input_length - 1], dtype=torch.int32 - ) - ) - prefill_next_token_indices.append(prefill_out_cumulative_length) - prefill_cu_outlens.append(prefill_out_cumulative_length + 1) - prefill_out_cumulative_length += 1 # Update - cumulative_length += input_length - cumulative_slot_tokens += slot_tokens - max_seqlen = max(max_seqlen, input_length) max_blocks = max(max_blocks, len(request_blocks)) + max_input_length = max(max_input_length, input_length) + max_current_length = max(max_current_length, cache_length + input_length) max_length = max( - max_length, input_length + max_new_tokens + speculative_length + max_length, + prompt_length + max_new_tokens + speculative_length, ) - adapter_indices = torch.cat(adapter_indices_list).to( - dtype=torch.int64, device=device - ) - next_token_chooser = HeterogeneousNextTokenChooser.from_pb( next_token_chooser_parameters, dtype, device, tokenizer ) - start_slots = torch.tensor(start_slots, dtype=torch.int64) # Padded all_input_ids_tensor all_input_ids_tensor = np.zeros( @@ -414,103 +374,59 @@ def from_tokenized( all_input_ids_tensor, dtype=torch.int64, device=device ) - if len(pb.requests) > 1: - input_ids = np.concatenate(all_input_ids, dtype=np.int64) - position_ids = torch.cat(position_ids) - slot_indices = torch.cat(slot_indices) - if sliding_window is not None: - prefill_cache_indices = torch.cat(prefill_cache_indices) - else: - input_ids = all_input_ids[0] - position_ids = position_ids[0] - slot_indices = slot_indices[0] - if sliding_window is not None: - prefill_cache_indices = prefill_cache_indices[0] - - cu_seqlen_prefill = torch.tensor( - cu_seqlen_prefill, device=device, dtype=torch.int32 - ) - position_ids = position_ids.to(device) - slot_indices = 
slot_indices.to(device) - prefill_cache_indices = ( - prefill_cache_indices.to(device) if sliding_window is not None else None - ) - input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) - input_lengths_tensor = torch.tensor( - input_lengths, dtype=torch.int32, device=device - ) - - adapter_segments, adapter_segment_indices = find_segments(adapter_indices) - adapter_segments = torch.tensor( - adapter_segments, dtype=torch.int32, device=device - ) - - if all_prefill_logprobs: - prefill_head_indices = None - prefill_next_token_indices = cu_seqlen_prefill[1:] - 1 - elif no_prefill_logprobs: - prefill_head_indices = cu_seqlen_prefill[1:] - 1 - prefill_next_token_indices = None - else: - prefill_head_indices = torch.tensor( - torch.cat(prefill_head_indices), dtype=torch.int64, device=device - ) - prefill_next_token_indices = torch.tensor( - prefill_next_token_indices, dtype=torch.int64, device=device - ) top_n_tokens_tensor = torch.tensor( top_n_tokens, device=device, dtype=torch.int64 ) - slots = torch.tensor(slots, dtype=torch.int64, device=device) - block_tables_tensor = torch.zeros( (len(block_tables), max_blocks), dtype=torch.int32, device="cpu" ) for i, request_blocks in enumerate(block_tables): block_tables_tensor[i, : len(request_blocks)] = torch.tensor(request_blocks) block_tables_tensor = block_tables_tensor.to(device) - prefix_lens_tensor = torch.tensor(prefix_lens, dtype=torch.int32, device=device) + prompt_lengths_tensor = torch.tensor( + prompt_lengths, dtype=torch.int32, device=device + ) return cls( batch_id=pb.id, requests=pb.requests, requests_idx_mapping=requests_idx_mapping, - input_ids=input_ids, - position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - prefill_cache_indices=prefill_cache_indices, - start_slots=start_slots, - slot_indices=slot_indices, + input_ids=all_postfix_ids, block_tables=block_tables, block_tables_tensor=block_tables_tensor, - slots=slots, - prefix_lens=prefix_lens, - prefix_lens_tensor=prefix_lens_tensor, - max_seqlen=max_seqlen, - prefill_head_indices=prefill_head_indices, - prefill_next_token_indices=prefill_next_token_indices, - prefill_cu_outlens=prefill_cu_outlens, + cache_lengths=cache_lengths, + max_input_length=max_input_length, + max_current_length=max_current_length, + prefilling=True, + prefilling_mask=[True] * len(pb.requests), + prefill_logprob_tokens=[None] * len(pb.requests), input_lengths=input_lengths, - input_lengths_tensor=input_lengths_tensor, + prompt_lengths=prompt_lengths, prefix_offsets=prefix_offsets, read_offsets=read_offsets, all_input_ids=all_input_ids, all_input_ids_tensor=all_input_ids_tensor, - prefix_ids=prefix_ids, next_token_chooser=next_token_chooser, stopping_criterias=stopping_criterias, top_n_tokens=top_n_tokens, top_n_tokens_tensor=top_n_tokens_tensor, num_blocks=num_blocks, max_blocks=max_blocks, - adapter_meta=AdapterBatchMetadata( - adapter_indices=adapter_indices, - adapter_set=adapter_set, - adapter_segments=adapter_segments, - segment_indices=adapter_segment_indices, - ), speculative_ids=None, + prompt_lengths_tensor=prompt_lengths_tensor, + # These values will be set by `FlashCausalLMBatch.prepare_for_prefill` + position_ids=None, + cu_seqlen_prefill=None, + prefill_cache_indices=None, + slot_indices=None, + slots=None, + prefill_head_indices=None, + prefill_next_token_indices=None, + prefill_cu_outlens=None, + cache_lengths_tensor=None, + input_lengths_tensor=None, + adapter_meta=None, ) @classmethod @@ -533,7 +449,7 @@ def filter(self, request_ids: List[int]) -> 
"FlashCausalLMBatch": if len(request_ids) == len(self): return self - device = self.input_ids.device + device = self.block_tables_tensor.device # New values after filtering requests_idx_mapping = {} @@ -548,19 +464,23 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": # Create on CPU to only move to GPU once instead of at every copy slot_indices = torch.empty(len(request_ids), dtype=torch.int64) - max_seqlen = 0 + max_input_length = 0 + max_current_length = 0 requests = [] - start_slots = [] block_tables = [] all_input_ids = [] - prefix_ids = [] + input_ids = [] + prompt_lengths = [] input_lengths = [] - prefix_lens = [] + cache_lengths = [] prefix_offsets = [] read_offsets = [] + prefilling_mask = [] + prefill_logprob_tokens = [] + stopping_criterias = [] top_n_tokens = [] adapter_set = set() @@ -577,16 +497,23 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": requests.append(self.requests[idx]) + # Prefilling + request_prefilling = self.prefilling_mask[idx] + prefilling_mask.append(request_prefilling) + # Get length request_input_length = self.input_lengths[idx] - prefix_len = self.prefix_lens[idx] - max_seqlen = max(max_seqlen, request_input_length) + request_cache_length = self.cache_lengths[idx] + max_input_length = max(max_input_length, request_input_length) + max_current_length = max( + max_current_length, request_cache_length + request_input_length + ) all_input_ids.append(self.all_input_ids[idx]) - prefix_ids.append(self.prefix_ids[idx]) + prompt_lengths.append(self.prompt_lengths[idx]) input_lengths.append(request_input_length) - prefix_lens.append(prefix_len) + cache_lengths.append(request_cache_length) prefix_offsets.append(self.prefix_offsets[idx]) read_offsets.append(self.read_offsets[idx]) @@ -594,60 +521,79 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": stopping_criterias.append(stopping_criteria) top_n_tokens.append(self.top_n_tokens[idx]) + prefill_logprob_tokens.append(self.prefill_logprob_tokens[idx]) ADAPTER_TO_INDEX = get_adapter_to_index() adapter_index = ADAPTER_TO_INDEX.get(self.requests[idx].adapter_id, 0) adapter_set.add(adapter_index) - remaining_tokens = ( - stopping_criteria.max_new_tokens - stopping_criteria.current_tokens - ) - request_block_table = self.block_tables[idx] num_blocks += len(request_block_table) block_tables.append(request_block_table) - start_slots.append(cumulative_max_length) - # Copy to tensor (CPU) - slot_indices[i] = cumulative_max_length + request_input_length - 1 + # Input ids if the request was part of a prefilling batch + # If the batch was decoding we can index into the tensor directly later + if self.prefilling: + input_ids.append(self.input_ids[idx]) + else: + # Copy to tensor (CPU) + slot_indices[i] = cumulative_max_length + + remaining_tokens = ( + stopping_criteria.max_new_tokens - stopping_criteria.current_tokens + ) - # Set slice - slot_filtering_indices[ - self.start_slots[idx] : self.start_slots[idx] - + request_input_length - + remaining_tokens - - 1 - ] = True + # Set slice + slot_filtering_indices[ + self.slot_indices[idx] : self.slot_indices[idx] + + request_input_length + + remaining_tokens + - 1 + ] = True - cumulative_max_length += request_input_length + remaining_tokens - 1 + cumulative_max_length += request_input_length + remaining_tokens - 1 max_blocks = max(max_blocks, len(request_block_table)) - # Index into tensors - input_ids = self.input_ids[indices] - position_ids = self.position_ids[indices] - adapter_indices = 
self.adapter_meta.adapter_indices[indices] all_input_ids_tensor = self.all_input_ids_tensor[indices] block_tables_tensor = self.block_tables_tensor[indices] - input_lengths_tensor = self.input_lengths_tensor[indices] - slots = self.slots[slot_filtering_indices] - prefix_lens_tensor = self.prefix_lens_tensor[indices] next_token_chooser = self.next_token_chooser.filter(indices) top_n_tokens_tensor = self.top_n_tokens_tensor[indices] speculative_ids = ( self.speculative_ids[indices] if self.speculative_ids is not None else None ) - - start_slots = torch.tensor(start_slots, dtype=torch.int64) - - # Move to GPU now that we have the whole tensor - slot_indices = slot_indices.to(device) - - adapter_segments, adapter_segment_indices = find_segments(adapter_indices) - adapter_segments = torch.tensor( - adapter_segments, dtype=torch.int32, device=device - ) - # assert sum(len(b) for b in block_tables) == (block_tables_tensor != 0).sum() + prompt_lengths_tensor = self.prompt_lengths_tensor[indices] + + if self.prefilling: + # These values will be set by `FlashCausalLMBatch.prepare_for_prefill` + position_ids = None + slot_indices = None + slots = None + cache_lengths_tensor = None + input_lengths_tensor = None + adapter_meta = None + else: + # Index into tensors + input_ids = self.input_ids[indices] + position_ids = self.position_ids[indices] + adapter_indices = self.adapter_meta.adapter_indices[indices] + input_lengths_tensor = self.input_lengths_tensor[indices] + slots = self.slots[slot_filtering_indices] + cache_lengths_tensor = self.cache_lengths_tensor[indices] + + # Move to GPU now that we have the whole tensor + slot_indices = slot_indices.to(device) + + adapter_segments, adapter_segment_indices = find_segments(adapter_indices) + adapter_segments = torch.tensor( + adapter_segments, dtype=torch.int32, device=device + ) + adapter_meta = AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_segment_indices, + ) return type(self)( batch_id=self.batch_id, @@ -657,24 +603,28 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": position_ids=position_ids, cu_seqlen_prefill=None, prefill_cache_indices=None, - start_slots=start_slots, slot_indices=slot_indices, block_tables=block_tables, block_tables_tensor=block_tables_tensor, slots=slots, - max_seqlen=max_seqlen, + max_input_length=max_input_length, + max_current_length=max_current_length, + prefilling=self.prefilling, + prefilling_mask=prefilling_mask, prefill_head_indices=None, prefill_next_token_indices=None, prefill_cu_outlens=None, + prefill_logprob_tokens=prefill_logprob_tokens, + prompt_lengths=prompt_lengths, + prompt_lengths_tensor=prompt_lengths_tensor, input_lengths=input_lengths, input_lengths_tensor=input_lengths_tensor, - prefix_lens=prefix_lens, - prefix_lens_tensor=prefix_lens_tensor, + cache_lengths=cache_lengths, + cache_lengths_tensor=cache_lengths_tensor, prefix_offsets=prefix_offsets, read_offsets=read_offsets, all_input_ids=all_input_ids, all_input_ids_tensor=all_input_ids_tensor, - prefix_ids=prefix_ids, next_token_chooser=next_token_chooser, stopping_criterias=stopping_criterias, top_n_tokens=top_n_tokens, @@ -682,12 +632,7 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": num_blocks=num_blocks, max_blocks=max_blocks, speculative_ids=speculative_ids, - adapter_meta=AdapterBatchMetadata( - adapter_indices=adapter_indices, - adapter_set=adapter_set, - adapter_segments=adapter_segments, - 
segment_indices=adapter_segment_indices, - ), + adapter_meta=adapter_meta, ) @classmethod @@ -697,74 +642,98 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch requests = [] requests_idx_mapping = {} + prefilling = False num_blocks = 0 total_batch_size = 0 total_slots = 0 max_blocks = 0 max_length = 0 - max_seqlen = 0 + max_input_length = 0 + max_current_length = 0 for b in batches: total_batch_size += len(b) - total_slots += len(b.slots) + max_blocks = max(max_blocks, b.max_blocks) + # If `b` is prefilling and was just filtered, `b.slots` is None + # `total_slots` is not used if any of the batches is prefilling + total_slots += len(b.slots) if not b.prefilling else 0 num_blocks += b.num_blocks speculative_length = ( b.speculative_ids.shape[1] if b.speculative_ids is not None else 0 ) - max_blocks = max(max_blocks, b.max_blocks) - max_seqlen = max(max_seqlen, b.max_seqlen) + max_input_length = max(max_input_length, b.max_input_length) + max_current_length = max(max_current_length, b.max_current_length) max_length = max( max_length, max( - input_length + prompt_length + stopping_criteria.max_new_tokens + speculative_length - - stopping_criteria.current_tokens - for input_length, stopping_criteria in zip( - b.input_lengths, b.stopping_criterias + for prompt_length, stopping_criteria in zip( + b.prompt_lengths, b.stopping_criterias ) ), ) + prefilling = prefilling or b.prefilling + + if prefilling: + input_ids = [] + # These values will be set by `FlashCausalLMBatch.prepare_for_prefill` + position_ids = None + slots = None + slot_indices = None + cache_lengths_tensor = None + input_lengths_tensor = None + adapter_meta = None + adapter_segment_builder = None + else: + input_ids = batches[0].input_ids.new_empty(total_batch_size) + position_ids = batches[0].position_ids.new_empty(total_batch_size) + slots = batches[0].slots.new_empty(total_slots) + slot_indices = batches[0].slot_indices.new_empty(total_batch_size) + input_lengths_tensor = batches[0].input_lengths_tensor.new_empty( + total_batch_size + ) + cache_lengths_tensor = batches[0].cache_lengths_tensor.new_empty( + total_batch_size + ) + total_indices_size = sum( + b.adapter_meta.adapter_indices.shape[0] for b in batches + ) + adapter_indices = batches[0].adapter_meta.adapter_indices.new_empty( + total_indices_size + ) + adapter_segment_builder = SegmentConcatBuilder() + adapter_set = set() - input_ids = batches[0].input_ids.new_empty(total_batch_size) - position_ids = batches[0].position_ids.new_empty(total_batch_size) - slots = batches[0].slots.new_empty(total_slots) - slot_indices = batches[0].slot_indices.new_empty(total_batch_size) - input_lengths_tensor = batches[0].input_lengths_tensor.new_empty( + prompt_lengths_tensor = batches[0].prompt_lengths_tensor.new_empty( total_batch_size ) block_tables_tensor = batches[0].block_tables_tensor.new_zeros( (total_batch_size, max_blocks) ) - prefix_lens_tensor = batches[0].prefix_lens_tensor.new_empty(total_batch_size) all_input_ids_tensor = batches[0].all_input_ids_tensor.new_zeros( (total_batch_size, max_length) ) top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros( total_batch_size, ) - total_indices_size = sum( - b.adapter_meta.adapter_indices.shape[0] for b in batches - ) - adapter_indices = batches[0].adapter_meta.adapter_indices.new_empty( - total_indices_size - ) - adapter_set = set() - adapter_segment_builder = SegmentConcatBuilder() - start_slots = [] block_tables = [] - prefix_lens = [] + cache_lengths = [] all_input_ids = [] - prefix_ids 
= [] + prompt_lengths = [] input_lengths = [] prefix_offsets = [] read_offsets = [] + prefill_logprob_tokens = [] + next_token_chooser_parameters = [] fsm_grammar_states = [] stopping_criterias = [] top_n_tokens = [] + prefilling_mask = [] # Cumulative length cumulative_batch_size = 0 @@ -783,32 +752,9 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch start_index = cumulative_batch_size end_index = cumulative_batch_size + len(batch) - slots_start_index = cumulative_slots - slots_end_index = cumulative_slots + len(batch.slots) # Copy tensors (GPU) - input_ids[start_index:end_index] = batch.input_ids - position_ids[start_index:end_index] = batch.position_ids - slot_indices[start_index:end_index] = batch.slot_indices + cumulative_slots - input_lengths_tensor[start_index:end_index] = batch.input_lengths_tensor top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor - slots[slots_start_index:slots_end_index] = batch.slots - - # Copy over adapter indices - adapter_start_index = cumulative_adapter_indices_size - adapter_end_index = ( - cumulative_adapter_indices_size - + batch.adapter_meta.adapter_indices.shape[0] - ) - adapter_indices[adapter_start_index:adapter_end_index] = ( - batch.adapter_meta.adapter_indices - ) - cumulative_adapter_indices_size = adapter_end_index - adapter_set.update(batch.adapter_meta.adapter_set) - adapter_segment_builder.concat( - batch.adapter_meta.adapter_segments, batch.adapter_meta.segment_indices - ) - all_input_ids_tensor[ start_index:end_index, : batch.all_input_ids_tensor.shape[1] ] = batch.all_input_ids_tensor[:, :max_length] @@ -816,20 +762,56 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch block_tables_tensor[ start_index:end_index, : batch.block_tables_tensor.shape[1] ] = batch.block_tables_tensor[:, :max_blocks] + prompt_lengths_tensor[start_index:end_index] = batch.prompt_lengths_tensor - prefix_lens_tensor[start_index:end_index] = batch.prefix_lens_tensor + if not prefilling: + slots_start_index = cumulative_slots + slots_end_index = cumulative_slots + len(batch.slots) - start_slots.append(batch.start_slots + cumulative_slots) + input_ids[start_index:end_index] = batch.input_ids + position_ids[start_index:end_index] = batch.position_ids + slots[slots_start_index:slots_end_index] = batch.slots + slot_indices[start_index:end_index] = ( + batch.slot_indices + cumulative_slots + ) + input_lengths_tensor[start_index:end_index] = batch.input_lengths_tensor + cache_lengths_tensor[start_index:end_index] = batch.cache_lengths_tensor + + # Copy over adapter indices + adapter_start_index = cumulative_adapter_indices_size + adapter_end_index = ( + cumulative_adapter_indices_size + + batch.adapter_meta.adapter_indices.shape[0] + ) + adapter_indices[adapter_start_index:adapter_end_index] = ( + batch.adapter_meta.adapter_indices + ) + cumulative_adapter_indices_size = adapter_end_index + adapter_set.update(batch.adapter_meta.adapter_set) + adapter_segment_builder.concat( + batch.adapter_meta.adapter_segments, + batch.adapter_meta.segment_indices, + ) + # Update + cumulative_slots += len(batch.slots) + else: + if isinstance(batch.input_ids, torch.Tensor): + batch.input_ids = batch.input_ids.view(-1, 1).tolist() + input_ids.extend(batch.input_ids) + + prefilling_mask.extend(batch.prefilling_mask) block_tables.extend(batch.block_tables) - prefix_lens.extend(batch.prefix_lens) + cache_lengths.extend(batch.cache_lengths) all_input_ids.extend(batch.all_input_ids) - 
prefix_ids.extend(batch.prefix_ids) + prompt_lengths.extend(batch.prompt_lengths) input_lengths.extend(batch.input_lengths) prefix_offsets.extend(batch.prefix_offsets) read_offsets.extend(batch.read_offsets) + prefill_logprob_tokens.extend(batch.prefill_logprob_tokens) + next_token_chooser_parameters.extend([r.parameters for r in batch.requests]) fsm_grammar_states.extend(batch.next_token_chooser.fsm_grammar_states) stopping_criterias.extend(batch.stopping_criterias) @@ -838,11 +820,6 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch # Update cumulative_batch_size += len(batch) - cumulative_slots += len(batch.slots) - - start_slots = torch.concat(start_slots) - - # assert sum(len(b) for b in block_tables) == (block_tables_tensor != 0).sum() next_token_chooser = HeterogeneousNextTokenChooser.from_pb( next_token_chooser_parameters, @@ -858,7 +835,14 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch else None ) - adapter_segments, adapter_segment_indices = adapter_segment_builder.build() + if adapter_segment_builder is not None: + adapter_segments, adapter_segment_indices = adapter_segment_builder.build() + adapter_meta = AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_segment_indices, + ) return cls( batch_id=batches[0].batch_id, @@ -868,24 +852,28 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch position_ids=position_ids, cu_seqlen_prefill=None, prefill_cache_indices=None, - start_slots=start_slots, slot_indices=slot_indices, block_tables=block_tables, block_tables_tensor=block_tables_tensor, - prefix_lens=prefix_lens, - prefix_lens_tensor=prefix_lens_tensor, + cache_lengths=cache_lengths, + cache_lengths_tensor=cache_lengths_tensor, slots=slots, - max_seqlen=max_seqlen, + max_input_length=max_input_length, + max_current_length=max_current_length, + prefilling=prefilling, + prefilling_mask=prefilling_mask, prefill_head_indices=None, prefill_next_token_indices=None, prefill_cu_outlens=None, + prefill_logprob_tokens=prefill_logprob_tokens, + prompt_lengths=prompt_lengths, + prompt_lengths_tensor=prompt_lengths_tensor, input_lengths=input_lengths, input_lengths_tensor=input_lengths_tensor, prefix_offsets=prefix_offsets, read_offsets=read_offsets, all_input_ids=all_input_ids, all_input_ids_tensor=all_input_ids_tensor, - prefix_ids=prefix_ids, next_token_chooser=next_token_chooser, stopping_criterias=stopping_criterias, top_n_tokens=top_n_tokens, @@ -893,12 +881,195 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch num_blocks=num_blocks, max_blocks=max_blocks, speculative_ids=speculative_ids, - adapter_meta=AdapterBatchMetadata( - adapter_indices=adapter_indices, - adapter_set=adapter_set, - adapter_segments=adapter_segments, - segment_indices=adapter_segment_indices, - ), + adapter_meta=adapter_meta, + ) + + def prepare_for_prefill(self): + # Prepare values if we need to continue prefilling + # Speculation must be ignored while we prefill even with chunking + # it simplifies everything + assert self.speculative_ids is None + + sliding_window = get_sliding_windows() + position_ids = [] + cu_seqlen_prefill = [0] + slot_indices = [] + prefill_cache_indices = [] + all_prefill_logprobs = True + no_prefill_logprobs = True + prefill_head_indices = [] + prefill_next_token_indices = [] + prefill_cu_outlens = [0] + + # Cumulative length + cumulative_length = 0 + 
cumulative_slot_tokens = 0 + prefill_out_cumulative_length = 0 + + slots = [] + adapter_indices_list = [] + adapter_set = set() + + for i, ( + r, + cache_length, + input_length, + prompt_length, + request_prefilling, + blocks, + ) in enumerate( + zip( + self.requests, + self.cache_lengths, + self.input_lengths, + self.prompt_lengths, + self.prefilling_mask, + self.block_tables, + ) + ): + next_chunk_length = input_length + # Position ids + request_position_ids = torch.arange( + cache_length, cache_length + input_length, dtype=torch.int32 + ) + position_ids.append(request_position_ids) + + # Add cumulative lengths of all previous inputs + cu_seqlen_prefill.append(cumulative_length + input_length) + + if not r.slots: + request_slots = [ + s + for b in blocks + for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE) + ] + else: + request_slots = r.slots + + request_slots = request_slots[cache_length:] + request_slot_indices = torch.arange( + cumulative_slot_tokens, + cumulative_slot_tokens + input_length, + dtype=torch.int64, + ) + + # Create tensor to slice into the kv tensor in prefill + if sliding_window is not None: + request_prefill_cache_indices = torch.arange( + cumulative_length + max(0, input_length - sliding_window), + cumulative_length + input_length, + dtype=torch.int64, + ) + + # Prefill logprobs is ignored if the request is done prefilling + prefill_logprobs = r.prefill_logprobs and request_prefilling + + all_prefill_logprobs = all_prefill_logprobs and prefill_logprobs + no_prefill_logprobs = no_prefill_logprobs and not prefill_logprobs + + if prefill_logprobs: + prefill_head_indices.append( + torch.arange( + cumulative_length, + cumulative_length + input_length, + dtype=torch.int64, + ) + ) + prefill_next_token_indices.append( + prefill_out_cumulative_length + input_length - 1 + ) + prefill_cu_outlens.append(prefill_out_cumulative_length + input_length) + prefill_out_cumulative_length += input_length + else: + prefill_head_indices.append( + torch.tensor( + [cumulative_length + input_length - 1], + dtype=torch.int64, + ) + ) + prefill_next_token_indices.append(prefill_out_cumulative_length) + prefill_cu_outlens.append(prefill_out_cumulative_length + 1) + prefill_out_cumulative_length += 1 + + slots.extend(request_slots) + slot_indices.append(request_slot_indices) + + if sliding_window is not None: + prefill_cache_indices.append(request_prefill_cache_indices) + + ADAPTER_TO_INDEX = get_adapter_to_index() + adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0) + adapter_indices_list.append(torch.full((next_chunk_length,), adapter_index)) + adapter_set.add(adapter_index) + + # Update + cumulative_length += next_chunk_length + cumulative_slot_tokens += len(request_slots) + + device = self.block_tables_tensor.device + + if isinstance(self.input_ids, list): + if len(self) > 1: + input_ids = np.concatenate(self.input_ids, dtype=np.int64) + else: + input_ids = self.input_ids[0] + self.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) + + if len(self) > 1: + position_ids = torch.cat(position_ids) + slot_indices = torch.cat(slot_indices) + if sliding_window is not None: + prefill_cache_indices = torch.cat(prefill_cache_indices) + else: + position_ids = position_ids[0] + slot_indices = slot_indices[0] + if sliding_window is not None: + prefill_cache_indices = prefill_cache_indices[0] + + self.prefill_cu_outlens = prefill_cu_outlens + cu_seqlen_prefill = torch.tensor( + cu_seqlen_prefill, device=device, dtype=torch.int32 + ) + self.cu_seqlen_prefill = cu_seqlen_prefill + 
self.position_ids = position_ids.to(device) + self.slot_indices = slot_indices.to(device) + self.prefill_cache_indices = ( + prefill_cache_indices.to(device) if sliding_window is not None else None + ) + self.input_lengths_tensor = torch.tensor( + self.input_lengths, dtype=torch.int32, device=device + ) + + if all_prefill_logprobs: + prefill_head_indices = None + prefill_next_token_indices = cu_seqlen_prefill[1:] - 1 + elif no_prefill_logprobs: + prefill_head_indices = cu_seqlen_prefill[1:] - 1 + prefill_next_token_indices = None + else: + prefill_head_indices = torch.cat(prefill_head_indices).to(device) + prefill_next_token_indices = torch.tensor( + prefill_next_token_indices, dtype=torch.int64, device=device + ) + + self.prefill_head_indices = prefill_head_indices + self.prefill_next_token_indices = prefill_next_token_indices + self.slots = torch.tensor(slots, dtype=torch.int64, device=device) + self.cache_lengths_tensor = torch.tensor( + self.cache_lengths, dtype=torch.int32, device=device + ) + adapter_indices = torch.cat(adapter_indices_list).to( + dtype=torch.int64, device=device + ) + adapter_segments, adapter_segment_indices = find_segments(adapter_indices) + adapter_segments = torch.tensor( + adapter_segments, dtype=torch.int32, device=device + ) + self.adapter_meta = AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_segment_indices, ) def __len__(self): @@ -938,6 +1109,7 @@ def __init__( head_size: Optional[int] = None, skip_special_tokens: bool = True, kv_cache_dtype: Optional[torch.dtype] = None, + support_chunking: bool = True, ): self.quantize = quantize self.process_group, rank, world_size = initialize_torch_distributed() @@ -950,7 +1122,6 @@ def __init__( dtype = default_dtype if dtype is None else dtype else: device = torch.device("cpu") - # Float16 doesn't exist on target. 
dtype = torch.bfloat16 if dtype is None else dtype init_cpu_threads_env(rank_id=rank, world_size=world_size) else: @@ -1065,6 +1236,7 @@ def __init__( rank=rank, world_size=world_size, sliding_window=config.sliding_window, + support_chunking=support_chunking, ) @property @@ -1101,11 +1273,11 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) slots = torch.arange(bs, dtype=torch.int64, device=self.device) input_lengths = [max_s] * bs - prefix_lengths = [0] * bs + cache_lengths = [0] * bs input_lengths_tensor = ( torch.ones(bs, dtype=torch.int32, device=self.device) * max_s ) - prefix_lengths_tensor = torch.zeros(bs, dtype=torch.int32, device=self.device) + cache_lengths_tensor = torch.zeros(bs, dtype=torch.int32, device=self.device) block_tables = torch.arange( max_bt, dtype=torch.int32, device=self.device ).repeat(bs) @@ -1115,7 +1287,7 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=input_lengths, - prefix_lens=prefix_lengths, + cache_lengths=cache_lengths, ) from text_generation_server.layers.attention.flashinfer import ( create_decode_state_cuda_graphs, @@ -1144,7 +1316,7 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): "block_tables": block_tables, "slots": slots, "input_lengths": input_lengths_tensor, - "prefix_lengths": prefix_lengths_tensor, + "cache_lengths": cache_lengths_tensor, "state": state, "graph": graph, } @@ -1156,11 +1328,11 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): cu_seqlen_prefill=None, input_lengths_tensor=input_lengths_tensor, state=state, - prefix_lens_tensor=prefix_lengths_tensor, + cache_lengths_tensor=cache_lengths_tensor, ): seqlen = Seqlen( input_lengths=input_lengths_tensor, - prefix_lengths=prefix_lengths_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=None, max_q=1, max_k=max_s, @@ -1184,7 +1356,7 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): with torch.cuda.graph(graph, pool=MEM_POOL): seqlen = Seqlen( input_lengths=input_lengths_tensor, - prefix_lengths=prefix_lengths_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=None, max_q=1, max_k=max_s, @@ -1207,6 +1379,7 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): def warmup(self, batch: FlashCausalLMBatch): # The warmup batch is the biggest batch we could ever receive + self.kv_cache = [] empty_cache() try: @@ -1226,7 +1399,7 @@ def warmup(self, batch: FlashCausalLMBatch): _, batch, _ = self.generate_token(batch) except torch.cuda.OutOfMemoryError as e: raise RuntimeError( - f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. " + f"Not enough memory to handle {batch.to_pb().current_tokens} prefill tokens. " f"You need to decrease `--max-batch-prefill-tokens`" ) from e @@ -1341,14 +1514,16 @@ def tunableop_warmup(self, seqlen: int): # Dummy value, some models (starcoder2) don't accept `None`. 
input_lengths = torch.ones(seqlen, dtype=torch.int32, device=self.device) - prefix_lens_tensor = torch.zeros(seqlen, dtype=torch.int32, device=self.device) + cache_lengths_tensor = torch.zeros( + seqlen, dtype=torch.int32, device=self.device + ) cu_seqlen_prefill = torch.tensor( [0, seqlen], device=self.device, dtype=torch.int32 ) max_s = seqlen seqlen = Seqlen( input_lengths=input_lengths, - prefix_lengths=prefix_lens_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=cu_seqlen_prefill, max_q=1, max_k=seqlen, @@ -1380,7 +1555,7 @@ def forward( block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - max_s = batch.max_seqlen + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices speculative_ids = batch.speculative_ids @@ -1399,8 +1574,8 @@ def forward( input_lengths = ( input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int ).view(-1) - prefix_lens_tensor = ( - batch.prefix_lens_tensor.unsqueeze(-1).expand(B, new_length) + cache_lengths_tensor = ( + batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length) ).reshape(-1) # Add Copy the block tables for all members @@ -1422,8 +1597,8 @@ def forward( block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - prefix_lens_tensor = batch.prefix_lens_tensor - max_s = batch.max_seqlen + cache_lengths_tensor = batch.cache_lengths_tensor + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices if cu_seqlen_prefill is None and self.max_past() is not None: @@ -1445,21 +1620,20 @@ def forward( block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + cache_lengths=batch.cache_lengths, ) with self._forward_context( block_tables=block_tables, cu_seqlen_prefill=cu_seqlen_prefill, input_lengths_tensor=input_lengths, - prefix_lens_tensor=prefix_lens_tensor, + cache_lengths_tensor=cache_lengths_tensor, ): - max_k = (input_lengths + prefix_lens_tensor).max().item() seqlen = Seqlen( input_lengths=input_lengths, - prefix_lengths=prefix_lens_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=cu_seqlen_prefill, - max_q=max_s, - max_k=max_k, + max_q=batch.max_input_length, + max_k=batch.max_current_length, ) logits, speculative_logits = self.model.forward( input_ids=input_ids, @@ -1486,7 +1660,7 @@ def forward( block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + cache_lengths=batch.cache_lengths, ) # assert block_tables.shape[0] >= slots.shape[0] cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables @@ -1501,14 +1675,16 @@ def forward( cuda_graph["slots"][: slots.shape[0]] = slots cuda_graph["input_lengths"].zero_() cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths - cuda_graph["prefix_lengths"].zero_() - cuda_graph["prefix_lengths"][: prefix_lens_tensor.shape[0]] = prefix_lens_tensor + cuda_graph["cache_lengths"].zero_() + cuda_graph["cache_lengths"][ + : cache_lengths_tensor.shape[0] + ] = cache_lengths_tensor with self._forward_context( block_tables=cuda_graph["block_tables"], cu_seqlen_prefill=None, input_lengths_tensor=cuda_graph["input_lengths"], - prefix_lens_tensor=cuda_graph["prefix_lengths"], + cache_lengths_tensor=cuda_graph["cache_lengths"], state=cuda_graph["state"], ): # Replay the graph @@ -1528,7 +1704,10 @@ def generate_token( self, batch: 
FlashCausalLMBatch ) -> Tuple[List[Generation], Optional[FlashCausalLMBatch], Tuple[int, int]]: start = time.time_ns() - prefill = batch.cu_seqlen_prefill is not None + prefill = batch.prefilling + if prefill: + batch.prepare_for_prefill() + prefill_logprobs = batch.prefill_next_token_indices is not None # Update adapter indices for speculative tokens (if present) @@ -1570,14 +1749,62 @@ def generate_token( if prefill_logprobs else speculative_logits ) - next_adapter_indices = batch.adapter_meta.adapter_indices.new_empty( - len(batch) - ) - + if len(batch) > 1 and prefill_logprobs: + # We create the prefill_tokens_indices tensor that will be used to gather prefill logprobs + # When batch == 1, we will just use the batch.input_ids values directly + prefill_tokens_indices = batch.input_ids.new_zeros(len(out)) else: + prefill_logprobs = None next_token_logits = out next_adapter_indices = batch.adapter_meta.adapter_indices + finished_prefilling = True + next_chunk_lengths = [] + current_prefilling_mask = batch.prefilling_mask + if prefill: + if get_support_chunking(): + next_prefilling_mask = [] + # Budget in tokens for the next batch + # We remove (len(batch) - 1) to always have enough space for at least a single decode + # for the remaining requests -1 because the first request does not need to be removed from the budget + # (ex: you have one request in the batch, you want it to take the full budget not budget -1) + batch_budget = get_max_prefill_tokens() - (len(batch) - 1) + # We reverse to prioritize older requests + # zip() is not reversible so reverse the underlying lists instead + for cache_length, input_length, prompt_length in zip( + reversed(batch.cache_lengths), + reversed(batch.input_lengths), + reversed(batch.prompt_lengths), + ): + remaining_prefill_tokens = max( + prompt_length - cache_length - input_length, 0 + ) + if remaining_prefill_tokens > 0: + next_chunk_length = max( + min(remaining_prefill_tokens, batch_budget), 1 + ) + batch_budget -= next_chunk_length + finished_prefilling = False + next_prefilling_mask.append(True) + else: + # FIXME: use true number of accepted tokens instead of 1 + # Since speculation will be turned off, this is always true + next_chunk_length = 1 + next_prefilling_mask.append(False) + next_chunk_lengths.append(next_chunk_length) + + # Reverse back the obtained values + next_chunk_lengths.reverse() + next_prefilling_mask.reverse() + else: + # The model does not support chunking + # We know we only do a single prefill + finished_prefilling = True + next_prefilling_mask = [False] * len(batch) + + batch.prefilling = not finished_prefilling + batch.prefilling_mask = next_prefilling_mask + speculate = get_speculate() ( next_input_ids, @@ -1586,7 +1813,7 @@ def generate_token( accepted_ids, speculative_ids, ) = batch.next_token_chooser( - batch.all_input_ids_tensor[:, : batch.max_seqlen], + batch.all_input_ids_tensor[:, : batch.max_current_length], next_token_logits, speculate, batch.speculative_ids, @@ -1597,29 +1824,28 @@ def generate_token( batch.top_n_tokens, batch.top_n_tokens_tensor, logprobs, accepted_ids ) - if prefill: - if len(batch) > 1 and prefill_logprobs: - # We create the prefill_tokens_indices tensor that will be used to gather prefill logprobs - # When batch == 1, we will just use the batch.input_ids values directly - prefill_tokens_indices = batch.input_ids.new_zeros(len(out)) - + # Since we are done prefilling, all the tensors that were concatenating values for all the requests + # instantly become of shape [BATCH_SIZE] + if prefill
and finished_prefilling: next_position_ids = batch.position_ids.new_empty(len(batch)) batch.slot_indices = batch.slot_indices[batch.cu_seqlen_prefill[1:] - 1] - # We do not need cu_seqlen_prefill anymore - batch.cu_seqlen_prefill = None - else: - prefill_logprobs = None + next_adapter_indices = batch.adapter_meta.adapter_indices.new_empty( + len(batch) + ) + elif not prefill: next_position_ids = batch.position_ids - # Cumulative length - cumulative_length = 0 - - # Results - generations: List[Generation] = [] - stopped = True - # Zipped iterator - iterator = zip(batch.input_lengths, batch.all_input_ids, accepted_ids) + iterator = zip( + batch.requests, + batch.prompt_lengths, + batch.cache_lengths, + batch.input_lengths, + batch.all_input_ids, + accepted_ids, + current_prefilling_mask, + batch.prefilling_mask, + ) # We do two for loops as the first one can run completely asynchronously from the GPU while for the second # one, we need to first do a GPU <-> CPU sync @@ -1627,16 +1853,22 @@ def generate_token( # For each member of the batch index = 0 - for i, (input_length, all_input_ids, n_accepted_ids) in enumerate(iterator): - # Indexing metadata - start_index = cumulative_length - end_index = cumulative_length + input_length - - if prefill: + # Cumulative length + cumulative_length = 0 + for i, ( + request, + prompt_length, + cache_length, + input_length, + all_input_ids, + n_accepted_ids, + request_was_prefilling, + request_is_prefilling, + ) in enumerate(iterator): + if prefill and finished_prefilling: # Indexing metadata - out_start_index = batch.prefill_cu_outlens[i] - out_end_index = batch.prefill_cu_outlens[i + 1] - out_length = out_end_index - out_start_index + _start_index = cumulative_length + end_index = cumulative_length + input_length # Initialize position_ids # In decode, we do not need this as we can just increment position ids @@ -1648,34 +1880,56 @@ def generate_token( end_index - 1 ] - # Used to gather prefill logprobs - # Copy batch.input_ids to prefill_token_indices - if prefill_logprobs: - if len(batch) > 1: - prefill_tokens_indices[out_start_index : out_end_index - 1] = ( - batch.input_ids[start_index + 1 : start_index + out_length] - ) - else: - # Set prefill_tokens_indices to the correct slice - prefill_tokens_indices = batch.input_ids[ - start_index + 1 : start_index + out_length - ] - - for j in range(n_accepted_ids): - batch.all_input_ids_tensor[i, input_length + j] = next_input_ids[index] - index += 1 + # Used to gather prefill logprobs + # Copy batch.all_input_ids_tensor to prefill_token_indices + if request.prefill_logprobs and request_was_prefilling: + # Indexing metadata + out_start_index = batch.prefill_cu_outlens[i] + out_end_index = batch.prefill_cu_outlens[i + 1] + # Logprobs generated by the model are for the next token + # So we need to translate the id tensor by 1 + ids = batch.all_input_ids_tensor[ + i, cache_length + 1 : cache_length + input_length + 1 + ] + if len(batch) > 1: + prefill_tokens_indices[out_start_index:out_end_index] = ids + else: + # Set prefill_tokens_indices to the correct slice + prefill_tokens_indices = ids + + if not request_is_prefilling: + # Only save tokens if we are done prefilling for this request + for j in range(n_accepted_ids): + batch.all_input_ids_tensor[i, cache_length + input_length + j] = ( + next_input_ids[index + j] + ) + index += n_accepted_ids cumulative_length += input_length # Update values - batch.input_ids = next_input_ids[accepted_ids.cumsum(dim=-1) - 1] - batch.speculative_ids = speculative_ids - 
batch.position_ids = next_position_ids + accepted_ids - batch.input_lengths_tensor += accepted_ids - batch.slot_indices += accepted_ids - batch.adapter_meta.adapter_indices = next_adapter_indices + # These values can be updated without a GPU -> CPU sync + if not prefill or (prefill and finished_prefilling): + batch.input_ids = next_input_ids[accepted_ids.cumsum(dim=-1) - 1] + batch.speculative_ids = speculative_ids + batch.position_ids = next_position_ids + accepted_ids + batch.cache_lengths_tensor += batch.input_lengths_tensor + batch.input_lengths_tensor = accepted_ids.to(dtype=torch.int32) + batch.slot_indices += accepted_ids + batch.adapter_meta.adapter_indices = next_adapter_indices - if prefill: + if prefill and prefill_logprobs: + # Get prefill logprobs with inplace softmax (avoid copying the `out` tensor (max_batch_prefill_tokens * vocab_size)) + torch.log_softmax(out, -1, out=out) + prefill_logprobs_tensor = out + prefill_logprobs = torch.gather( + prefill_logprobs_tensor, 1, prefill_tokens_indices.view(-1, 1) + ) + # GPU <-> CPU sync + prefill_logprobs = prefill_logprobs.view(-1).tolist() + + # Does a GPU <-> CPU sync internally + if prefill and finished_prefilling: # adjust segment lengths to account for all request lengths being 1 during decoding adapter_segments, _ = find_segments(batch.adapter_meta.adapter_indices) batch.adapter_meta.adapter_segments = torch.tensor( @@ -1684,192 +1938,282 @@ def generate_token( device=batch.adapter_meta.adapter_segments.device, ) - if prefill and prefill_logprobs: - # Get prefill logprobs - prefill_logprobs_tensor = torch.log_softmax(out, -1) - prefill_logprobs = torch.gather( - prefill_logprobs_tensor, 1, prefill_tokens_indices.view(-1, 1) - ) - # GPU <-> CPU sync - prefill_logprobs = prefill_logprobs.view(-1).tolist() - # GPU <-> CPU sync next_token_logprobs = next_token_logprobs.tolist() next_token_ids = next_input_ids.tolist() accepted_ids = accepted_ids.tolist() + + # Update values if we need to continue prefilling + # This represents the `else` case of the `Update values` if above + # but since this requires the `next_token_ids` to be on CPU, it is better to do it here + if prefill and not finished_prefilling: + # Speculation must be ignored while we prefill even with chunking + # it simplifies everything + assert batch.speculative_ids is None + + all_postfix_ids = [] + for i, ( + request_prefilling, + next_token_id, + all_input_ids, + cache_length, + input_length, + next_chunk_length, + ) in enumerate( + zip( + batch.prefilling_mask, + next_token_ids, + batch.all_input_ids, + batch.cache_lengths, + batch.input_lengths, + next_chunk_lengths, + ) + ): + if request_prefilling: + next_cache_length = cache_length + input_length + # Get new prompt IDs to prefill + postfix_ids = all_input_ids[ + next_cache_length : next_cache_length + next_chunk_length + ] + else: + # This request is done prefilling, the new id is the one selected by the sampling method + postfix_ids = [next_token_id] + + all_postfix_ids.append(postfix_ids) + + batch.input_ids = all_postfix_ids + start_decode = time.time_ns() + # Results + generations: List[Generation] = [] + stopped = True + # Zipped iterator iterator = zip( batch.requests, + batch.prompt_lengths, + batch.cache_lengths, batch.input_lengths, batch.prefix_offsets, batch.read_offsets, batch.stopping_criterias, batch.all_input_ids, - batch.prefix_ids, batch.next_token_chooser.do_sample, batch.next_token_chooser.seeds, batch.top_n_tokens, + current_prefilling_mask, + batch.prefilling_mask, accepted_ids,
batch_top_token_ids, batch_top_token_logprobs, ) + # Reset max_input_length + batch.max_input_length = 0 # For each member of the batch index = 0 for i, ( request, + prompt_length, + cache_length, input_length, prefix_offset, read_offset, stopping_criteria, all_input_ids, - prefix_ids, do_sample, seed, top_n_tokens, + request_was_prefilling, + request_is_prefilling, n_accepted_ids, top_token_ids, top_token_logprobs, ) in enumerate(iterator): - # Append next token to all tokens - next_token_texts = [] - left = 0 - - if n_accepted_ids > 1: - log_master(logger.debug, f"speculated ids {n_accepted_ids - 1}") - - current_stopped = False - for j in range(index, index + n_accepted_ids): - # Generated token - next_token_id = next_token_ids[j] - all_input_ids.append(next_token_id) - next_token_text, prefix_offset, read_offset = self.decode_token( - all_input_ids, - prefix_offset, - read_offset, - ) - next_token_texts.append(next_token_text) - - stop, reason = stopping_criteria( - next_token_id, - next_token_text, - ) - - if stop: - left = index + n_accepted_ids - j - 1 - current_stopped = True - break - else: - current_stopped = False - stopped = stopped and current_stopped - - _next_token_ids = next_token_ids[index : index + n_accepted_ids - left] - _next_token_logprobs = next_token_logprobs[ - index : index + n_accepted_ids - left - ] - index += n_accepted_ids - - # Shard generations - # All generations will be appended in the rust sharded client - if i % self.world_size == self.rank: - if stop: - # Decode generated tokens - output_text, _, _ = self.decode_token( - all_input_ids, - prefix_offset=len(all_input_ids) - - stopping_criteria.current_tokens - - 1, - read_offset=len(all_input_ids) - - stopping_criteria.current_tokens, - skip_special_tokens=True, - ) - generated_text = GeneratedText( - output_text, - stopping_criteria.current_tokens, - reason, - seed if do_sample else None, - ) - else: - generated_text = None - + # Compute logprobs first as, even though we might skip the token, + # it can still be required to compute the logprobs + # modulo on request.id as it is robust to batch.filter whereas the index in the batch is not and we need + # this state to be stable + if request.id % self.world_size == self.rank: # Prefill - if prefill and request.prefill_logprobs: + if request_was_prefilling and request.prefill_logprobs: out_start_index = batch.prefill_cu_outlens[i] out_end_index = batch.prefill_cu_outlens[i + 1] + if not request_is_prefilling: + # The request is dones prefilling, meaning that we started generating new tokens + # The last logprob is a logprob for a generated token that was not part of the prompt + # We need to remove it + out_end_index -= 1 + + request_prefill_logprobs = prefill_logprobs[ + out_start_index:out_end_index + ] + # Logprobs generated by the model are for the next token + # So we need to translate the id tensor by 1 + prefill_token_ids = all_input_ids[ + cache_length + 1 : cache_length + input_length + 1 + ] + + past_prefill_logprob_tokens = batch.prefill_logprob_tokens[i] + + if past_prefill_logprob_tokens is None: + # add nan for cached prompt tokens/first token + request_prefill_logprobs = [float("nan")] * ( + cache_length + 1 + ) + request_prefill_logprobs + prefill_token_ids = ( + all_input_ids[: cache_length + 1] + prefill_token_ids + ) - # Remove generated token to only have prefill and add nan for first prompt token - request_prefill_logprobs = ( - [float("nan")] * (len(prefix_ids) + 1) - ) + prefill_logprobs[out_start_index : out_end_index - 1] - 
prefill_token_ids = all_input_ids[:-1] prefill_texts = self.tokenizer.batch_decode( - prefix_ids + prefill_token_ids, + prefill_token_ids, clean_up_tokenization_spaces=False, skip_special_tokens=False, ) - prefill_tokens = Tokens( - prefix_ids + prefill_token_ids, + prefill_logprob_tokens = Tokens( + prefill_token_ids, request_prefill_logprobs, prefill_texts, is_special=[], ) + if past_prefill_logprob_tokens is not None: + prefill_logprob_tokens = ( + past_prefill_logprob_tokens + prefill_logprob_tokens + ) + + batch.prefill_logprob_tokens[i] = prefill_logprob_tokens else: - prefill_tokens = None - - if top_n_tokens > 0: - all_top_tokens = [] - for top_token_ids, top_token_logprobs in zip( - top_token_ids, top_token_logprobs - ): - toptoken_texts = self.tokenizer.batch_decode( - top_token_ids, - clean_up_tokenization_spaces=False, - skip_special_tokens=False, + batch.prefill_logprob_tokens[i] = None + + # If it is, the tokens we decoded should be ignored + if request_is_prefilling: + # Make sure that we do not stop as even though this request did not create a token, it is still + # processing + stopped = False + new_input_length = next_chunk_lengths[i] + else: + new_input_length = n_accepted_ids + # Append next token to all tokens + next_token_texts = [] + left = 0 + + if n_accepted_ids > 1: + log_master(logger.debug, f"speculated ids {n_accepted_ids - 1}") + + current_stopped = False + for j in range(index, index + n_accepted_ids): + # Generated token + next_token_id = next_token_ids[j] + all_input_ids.append(next_token_id) + next_token_text, prefix_offset, read_offset = self.decode_token( + all_input_ids, + prefix_offset, + read_offset, + ) + next_token_texts.append(next_token_text) + + stop, reason = stopping_criteria( + next_token_id, + next_token_text, + ) + + if stop: + left = index + n_accepted_ids - j - 1 + current_stopped = True + break + else: + current_stopped = False + stopped = stopped and current_stopped + + _next_token_ids = next_token_ids[index : index + n_accepted_ids - left] + _next_token_logprobs = next_token_logprobs[ + index : index + n_accepted_ids - left + ] + + # Shard generations + # All generations will be appended in the rust sharded client + if request.id % self.world_size == self.rank: + if stop: + # Decode generated tokens + output_text, _, _ = self.decode_token( + all_input_ids, + prefix_offset=len(all_input_ids) + - stopping_criteria.current_tokens + - 1, + read_offset=len(all_input_ids) + - stopping_criteria.current_tokens, + skip_special_tokens=True, ) - special_toptokens = [ - token_id in self.all_special_ids - for token_id in top_token_ids - ] - top_tokens = Tokens( - top_token_ids, - top_token_logprobs, - toptoken_texts, - special_toptokens, + generated_text = GeneratedText( + output_text, + stopping_criteria.current_tokens, + reason, + seed if do_sample else None, ) - all_top_tokens.append(top_tokens) - top_tokens = all_top_tokens - else: - top_tokens = None - - generation = Generation( - request.id, - prefill_tokens, - Tokens( - _next_token_ids, - _next_token_logprobs, - next_token_texts, - [nid in self.all_special_ids for nid in _next_token_ids], - ), - generated_text, - top_tokens, - ) + else: + generated_text = None + + if top_n_tokens > 0: + all_top_tokens = [] + for top_token_ids, top_token_logprobs in zip( + top_token_ids, top_token_logprobs + ): + toptoken_texts = self.tokenizer.batch_decode( + top_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + special_toptokens = [ + token_id in self.all_special_ids + for 
token_id in top_token_ids + ] + top_tokens = Tokens( + top_token_ids, + top_token_logprobs, + toptoken_texts, + special_toptokens, + ) + all_top_tokens.append(top_tokens) + top_tokens = all_top_tokens + else: + top_tokens = None + + generation = Generation( + request.id, + batch.prefill_logprob_tokens[i], + Tokens( + _next_token_ids, + _next_token_logprobs, + next_token_texts, + [nid in self.all_special_ids for nid in _next_token_ids], + ), + generated_text, + top_tokens, + ) - generations.append(generation) + generations.append(generation) - # accept each new token for this specific request since we may - # have more than one new token per request with speculative decoding - for next_token_id in _next_token_ids: - batch.next_token_chooser = ( - batch.next_token_chooser.advance_grammar_single(i, next_token_id) - ) + # accept each new token for this specific request since we may + # have more than one new token per request with speculative decoding + for next_token_id in _next_token_ids: + batch.next_token_chooser = ( + batch.next_token_chooser.advance_grammar_single( + i, next_token_id + ) + ) # Update values - batch.input_lengths[i] = input_length + n_accepted_ids - if batch.input_lengths[i] > batch.max_seqlen: - batch.max_seqlen = batch.input_lengths[i] + index += n_accepted_ids + current_cache_length = cache_length + input_length + batch.cache_lengths[i] = current_cache_length + current_input_length = new_input_length + batch.max_input_length = max(batch.max_input_length, current_input_length) + batch.input_lengths[i] = current_input_length + current_length = current_cache_length + current_input_length + batch.max_current_length = max(batch.max_current_length, current_length) + batch.prefix_offsets[i] = prefix_offset batch.read_offsets[i] = read_offset batch.all_input_ids[i] = all_input_ids @@ -1880,9 +2224,13 @@ def generate_token( decode_ns = time.time_ns() - start_decode return generations, None, (forward_ns, decode_ns) - batch.prefill_cu_outlens = None - batch.prefill_head_indices = None - batch.prefill_next_token_indices = None + if prefill and finished_prefilling: + # We do not need prefill tensors anymore + batch.cu_seqlen_prefill = None + batch.prefill_cache_indices = None + batch.prefill_cu_outlens = None + batch.prefill_head_indices = None + batch.prefill_next_token_indices = None forward_ns = start_decode - start decode_ns = time.time_ns() - start_decode @@ -1894,7 +2242,7 @@ def _forward_context( block_tables: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], input_lengths_tensor: torch.Tensor, - prefix_lens_tensor: torch.Tensor, + cache_lengths_tensor: torch.Tensor, state: Optional[Any] = None, ) -> ContextManager: if ATTENTION != "flashinfer": @@ -1905,8 +2253,6 @@ def _forward_context( use_prefill_with_paged_kv_state, ) - # has_prefix_lens = any(prefix_len > 0 for prefix_len in prefix_lens) - if cu_seqlen_prefill is not None: return use_prefill_with_paged_kv_state( state=( @@ -1915,11 +2261,11 @@ def _forward_context( # block_tables=block_tables_to_ragged( # block_tables=block_tables, # input_lengths=input_lengths, - # prefix_lens=prefix_lens, + # cache_lengths=cache_lengths, # ), block_tables=block_tables, cu_seqlens=cu_seqlen_prefill, - input_lengths=input_lengths_tensor + prefix_lens_tensor, + input_lengths=input_lengths_tensor + cache_lengths_tensor, num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, head_size=self.head_size, @@ -1931,31 +2277,32 @@ def _forward_context( assert input_lengths_tensor is not None return use_decode_state( state=state if 
state is not None else self.decode_state,
-                input_lengths=input_lengths_tensor + prefix_lens_tensor,
+                input_lengths=input_lengths_tensor + cache_lengths_tensor,
                 block_tables=block_tables,
                 num_heads=self.num_heads,
                 num_kv_heads=self.num_kv_heads,
                 head_size=self.head_size,
                 page_size=BLOCK_SIZE,
+                kv_cache_dtype=self.kv_cache_dtype,
                 dtype=self.dtype,
                 window_left=self.sliding_window,
             )


 def block_tables_to_ragged(
-    *, block_tables: torch.Tensor, input_lengths: List[int], prefix_lens: List[int]
+    *, block_tables: torch.Tensor, input_lengths: List[int], cache_lengths: List[int]
 ) -> torch.Tensor:
     """Convert block table to ragged format compatible with FlashInfer."""
-    assert len(input_lengths) == len(prefix_lens)
+    assert len(input_lengths) == len(cache_lengths)

-    total_len = sum(input_lengths) + sum(prefix_lens)
+    total_len = sum(input_lengths) + sum(cache_lengths)
     block_tables_ragged = torch.empty(
         total_len, dtype=torch.int32, device=block_tables.device
     )

     offset = 0
-    for i, (input_length, prefix_len) in enumerate(zip(input_lengths, prefix_lens)):
-        seq_len = prefix_len + input_length
+    for i, (input_length, cache_length) in enumerate(zip(input_lengths, cache_lengths)):
+        seq_len = cache_length + input_length
         block_tables_ragged[offset : offset + seq_len] = block_tables[i][:seq_len]
         offset += seq_len

diff --git a/server/text_generation_server/models/globals.py b/server/text_generation_server/models/globals.py
index 6c518c2caa5..4ac6a6b499f 100644
--- a/server/text_generation_server/models/globals.py
+++ b/server/text_generation_server/models/globals.py
@@ -5,9 +5,14 @@
 from text_generation_server.utils.log import log_master

-PREFIX_CACHING = os.getenv("USE_PREFIX_CACHING").lower() in {"1", "true"}
+ATTENTION = os.environ["ATTENTION"]
+# default_prefix_caching = "1" if ATTENTION in {"flashinfer", "flashdecoding"} else "0"
+PREFIX_CACHING = os.environ["PREFIX_CACHING"].lower() in {
+    "1",
+    "true",
+}
+PREFILL_CHUNKING = os.getenv("PREFILL_CHUNKING", "0").lower() in {"1", "true"}
 log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
-ATTENTION = os.getenv("ATTENTION")
 _expected = {"paged", "flashdecoding", "flashinfer"}
 assert (
     ATTENTION in _expected
@@ -18,7 +23,7 @@
     raise RuntimeError("Prefix caching is only supported with flashinfer")

 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
-TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.95"))
+TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
 assert TGI_WIGGLE_ROOM > 0
 assert TGI_WIGGLE_ROOM < 1

diff --git a/server/text_generation_server/models/idefics_causal_lm.py b/server/text_generation_server/models/idefics_causal_lm.py
index 9a7a6fe156a..34b74ba8856 100644
--- a/server/text_generation_server/models/idefics_causal_lm.py
+++ b/server/text_generation_server/models/idefics_causal_lm.py
@@ -83,6 +83,7 @@ def to_pb(self) -> generate_pb2.CachedBatch:
             request_ids=[r.id for r in self.requests],
             size=len(self),
             max_tokens=self.max_tokens,
+            current_tokens=len(self),
         )

     @classmethod
diff --git a/server/text_generation_server/models/mamba.py b/server/text_generation_server/models/mamba.py
index f6dcde68ab6..dfc61fb8875 100644
--- a/server/text_generation_server/models/mamba.py
+++ b/server/text_generation_server/models/mamba.py
@@ -116,6 +116,7 @@ def to_pb(self) -> generate_pb2.CachedBatch:
             request_ids=[r.id for r in self.requests],
             size=len(self),
             max_tokens=self.max_tokens,
+            current_tokens=len(self),
         )

     @classmethod
diff --git a/server/text_generation_server/models/mllama_causal_lm.py
b/server/text_generation_server/models/mllama_causal_lm.py index 9e19e171503..6399f92c14c 100644 --- a/server/text_generation_server/models/mllama_causal_lm.py +++ b/server/text_generation_server/models/mllama_causal_lm.py @@ -1,14 +1,17 @@ -from io import BytesIO -from PIL import Image import torch + +import numpy as np + from typing import Iterable, Optional, Tuple, List, Dict from text_generation_server.pb.generate_pb2 import Request - +from io import BytesIO +from PIL import Image from dataclasses import dataclass from opentelemetry import trace from transformers import ( PreTrainedTokenizerBase, ) + from text_generation_server.models.vlm_causal_lm import VlmCausalLMBatch, VlmCausalLM from text_generation_server.pb import generate_pb2 from text_generation_server.models.flash_causal_lm import ( @@ -167,6 +170,13 @@ def from_pb_processor( batch.all_input_ids_tensor = batch.all_input_ids_tensor.clamp( max=config.text_config.vocab_size - 1 ) + if isinstance(batch.input_ids, list): + if len(batch) > 1: + input_ids = np.concatenate(batch.input_ids, dtype=np.int64) + else: + input_ids = batch.input_ids[0] + batch.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) + batch.input_ids = batch.input_ids.clamp(max=config.text_config.vocab_size - 1) if image_inputs is not None: @@ -190,7 +200,7 @@ def from_pb_processor( class MllamaCausalLM(VlmCausalLM): def forward( self, - batch: VlmCausalLMBatch, + batch: MllamaCausalLMBatch, adapter_data: Optional[Dict[str, torch.Tensor]] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: # Model Forward @@ -202,7 +212,7 @@ def forward( block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - max_s = batch.max_seqlen + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices speculative_ids = batch.speculative_ids @@ -221,8 +231,8 @@ def forward( input_lengths = ( input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int ).view(-1) - prefix_lens_tensor = ( - batch.prefix_lens_tensor.unsqueeze(-1).expand(B, new_length) + cache_lengths_tensor = ( + batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length) ).reshape(-1) # Add Copy the block tables for all members @@ -244,8 +254,8 @@ def forward( block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - prefix_lens_tensor = batch.prefix_lens_tensor - max_s = batch.max_seqlen + cache_lengths_tensor = batch.cache_lengths_tensor + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices if cu_seqlen_prefill is None and self.max_past() is not None: @@ -254,7 +264,6 @@ def forward( # This makes sure the max_s for the decode pass is correct. max_s = min(self.max_past(), max_s) - bs = input_ids.shape[0] # Try to find an associated cuda graph bs = input_ids.shape[0] sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) @@ -269,26 +278,24 @@ def forward( # Only run cuda graphs when there's no images. 
or batch.cross_attention_states is not None ): - input_lengths = input_lengths + prefix_lens_tensor if PREFIX_CACHING: block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + cache_lengths=batch.cache_lengths, ) with self._forward_context( block_tables=block_tables, cu_seqlen_prefill=cu_seqlen_prefill, input_lengths_tensor=input_lengths, - prefix_lens_tensor=prefix_lens_tensor, + cache_lengths_tensor=cache_lengths_tensor, ): - max_k = (input_lengths + prefix_lens_tensor).max().item() seqlen = Seqlen( input_lengths=input_lengths, - prefix_lengths=prefix_lens_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=cu_seqlen_prefill, - max_q=max_s, - max_k=max_k, + max_q=batch.max_input_length, + max_k=batch.max_current_length, ) if batch.pixel_values is not None: @@ -330,22 +337,34 @@ def forward( block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + cache_lengths=batch.cache_lengths, ) cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables else: cuda_graph["block_tables"][ : block_tables.shape[0], : block_tables.shape[1] ] = block_tables + + # XXX: This is working only because block 0 is reserved for the healthcheck + # so it doesn't matter if we override it with bogus values. cuda_graph["slots"].fill_(0) cuda_graph["slots"][: slots.shape[0]] = slots cuda_graph["input_lengths"].zero_() - cuda_graph["input_lengths"][: input_lengths.shape[0]] = ( - input_lengths + prefix_lens_tensor - ) + cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths + cuda_graph["cache_lengths"].zero_() + cuda_graph["cache_lengths"][ + : cache_lengths_tensor.shape[0] + ] = cache_lengths_tensor - # Replay the graph - cuda_graph["graph"].replay() + with self._forward_context( + block_tables=cuda_graph["block_tables"], + cu_seqlen_prefill=None, + input_lengths_tensor=cuda_graph["input_lengths"], + cache_lengths_tensor=cuda_graph["cache_lengths"], + state=cuda_graph["state"], + ): + # Replay the graph + cuda_graph["graph"].replay() # Slice output to the correct shape speculative_logits = ( diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py index 20402e07afa..b3630013568 100644 --- a/server/text_generation_server/models/model.py +++ b/server/text_generation_server/models/model.py @@ -5,8 +5,17 @@ from typing import List, Tuple, Optional, TypeVar, Type, Dict from collections import defaultdict from transformers import PreTrainedTokenizerBase - +from loguru import logger + +from text_generation_server.models.globals import ( + ATTENTION, + PREFIX_CACHING, + BLOCK_SIZE, + PREFILL_CHUNKING, +) from text_generation_server.models.types import Batch, Generation +from text_generation_server.utils.log import log_master +from text_generation_server.utils.prefill_chunking import set_support_chunking from text_generation_server.utils.speculate import get_speculate from text_generation_server.pb.generate_pb2 import InfoResponse from text_generation_server.adapters.weights import LayerAdapterWeights @@ -31,6 +40,7 @@ def __init__( sliding_window: Optional[int] = None, speculate: Optional[int] = None, adapter_id: str = BASE_MODEL_ADAPTER_ID, + support_chunking: bool = False, ): self.model_id = model_id self.model = model.eval() @@ -60,6 +70,29 @@ def __init__( speculate = get_speculate() self.speculate = speculate + support_chunking = support_chunking and PREFILL_CHUNKING + + if speculate != 0 
and support_chunking: + log_master( + logger.warning, + "Prefill chunking does not support speculation yet. " + "Prefill chunking will be turned off", + ) + support_chunking = False + if ATTENTION not in ["flashinfer", "flashdecoding"] and support_chunking: + log_master( + logger.warning, + "Prefill chunking is only supported with `flashinfer` or `flashdecoding` attention types.", + ) + support_chunking = False + + log_master( + logger.info, f"Using experimental prefill chunking = {support_chunking}" + ) + + self.support_chunking = support_chunking + set_support_chunking(support_chunking) + self.has_position_ids = ( inspect.signature(model.forward).parameters.get("position_ids", None) is not None @@ -78,6 +111,10 @@ def info(self) -> InfoResponse: device_type=self.device.type, window_size=self.sliding_window, speculate=self.speculate, + support_chunking=self.support_chunking, + use_prefix_caching=PREFIX_CACHING, + attention_impl=ATTENTION, + block_size=BLOCK_SIZE, ) @property diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py index 94f87d02350..3880a438c07 100644 --- a/server/text_generation_server/models/seq2seq_lm.py +++ b/server/text_generation_server/models/seq2seq_lm.py @@ -80,6 +80,7 @@ def to_pb(self) -> generate_pb2.CachedBatch: request_ids=[r.id for r in self.requests], size=len(self), max_tokens=self.max_tokens, + current_tokens=len(self.decoder_input_ids), ) @classmethod @@ -649,11 +650,7 @@ def fallback( model_id, revision=revision, torch_dtype=dtype, - device_map=( - "auto" - if device_count > 1 - else None - ), + device_map=("auto" if device_count > 1 else None), load_in_8bit=quantize == "bitsandbytes", trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/types.py b/server/text_generation_server/models/types.py index d4e7cca7504..ed9ae98959c 100644 --- a/server/text_generation_server/models/types.py +++ b/server/text_generation_server/models/types.py @@ -74,6 +74,14 @@ def to_pb(self) -> generate_pb2.Tokens: def __len__(self): return len(self.token_ids) + def __add__(self, other: "Tokens") -> "Tokens": + return Tokens( + self.token_ids + other.token_ids, + self.logprobs + other.logprobs, + self.texts + other.texts, + self.is_special + other.is_special, + ) + @dataclass class Generation: diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 7f7d2e4d9f4..150cf0d07d7 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -271,6 +271,8 @@ def __init__( model_id=model_id, revision=revision, trust_remote_code=trust_remote_code, + # FIXME: VLM do not work with context chunking yet + support_chunking=False, **kwargs, ) @@ -295,7 +297,7 @@ def forward( block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - max_s = batch.max_seqlen + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices speculative_ids = batch.speculative_ids @@ -314,8 +316,8 @@ def forward( input_lengths = ( input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int ).view(-1) - prefix_lens_tensor = ( - batch.prefix_lens_tensor.unsqueeze(-1).expand(B, new_length) + cache_lengths_tensor = ( + batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length) ).reshape(-1) # Add Copy the block tables for all members @@ -337,8 +339,8 @@ def forward( block_tables = 
batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - prefix_lens_tensor = batch.prefix_lens_tensor - max_s = batch.max_seqlen + cache_lengths_tensor = batch.cache_lengths_tensor + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices if cu_seqlen_prefill is None and self.max_past() is not None: @@ -347,7 +349,6 @@ def forward( # This makes sure the max_s for the decode pass is correct. max_s = min(self.max_past(), max_s) - bs = input_ids.shape[0] # Try to find an associated cuda graph bs = input_ids.shape[0] sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) @@ -357,26 +358,24 @@ def forward( else: cuda_graph = None if cu_seqlen_prefill is not None or cuda_graph is None: - input_lengths = input_lengths + prefix_lens_tensor - if PREFIX_CACHING: + if ATTENTION == "flashinfer": block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + cache_lengths=batch.cache_lengths, ) with self._forward_context( block_tables=block_tables, cu_seqlen_prefill=cu_seqlen_prefill, input_lengths_tensor=input_lengths, - prefix_lens_tensor=prefix_lens_tensor, + cache_lengths_tensor=cache_lengths_tensor, ): - max_k = (input_lengths + prefix_lens_tensor).max().item() seqlen = Seqlen( input_lengths=input_lengths, - prefix_lengths=prefix_lens_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=cu_seqlen_prefill, - max_q=max_s, - max_k=max_k, + max_q=batch.max_input_length, + max_k=batch.max_current_length, ) logits, speculative_logits = self.model.forward( input_ids=input_ids, @@ -411,22 +410,34 @@ def forward( block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + cache_lengths=batch.cache_lengths, ) cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables else: cuda_graph["block_tables"][ : block_tables.shape[0], : block_tables.shape[1] ] = block_tables - cuda_graph["slots"].fill_(-1) + + # XXX: This is working only because block 0 is reserved for the healthcheck + # so it doesn't matter if we override it with bogus values. 
+ cuda_graph["slots"].fill_(0) cuda_graph["slots"][: slots.shape[0]] = slots cuda_graph["input_lengths"].zero_() - cuda_graph["input_lengths"][: input_lengths.shape[0]] = ( - input_lengths + prefix_lens_tensor - ) - - # Replay the graph - cuda_graph["graph"].replay() + cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths + cuda_graph["cache_lengths"].zero_() + cuda_graph["cache_lengths"][ + : cache_lengths_tensor.shape[0] + ] = cache_lengths_tensor + + with self._forward_context( + block_tables=cuda_graph["block_tables"], + cu_seqlen_prefill=None, + input_lengths_tensor=cuda_graph["input_lengths"], + cache_lengths_tensor=cuda_graph["cache_lengths"], + state=cuda_graph["state"], + ): + # Replay the graph + cuda_graph["graph"].replay() # Slice output to the correct shape speculative_logits = ( diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py index 46e342a4fbf..aef00fb5f5d 100644 --- a/server/text_generation_server/server.py +++ b/server/text_generation_server/server.py @@ -15,6 +15,7 @@ from text_generation_server.interceptor import ExceptionInterceptor from text_generation_server.models import Model, get_model_with_lora_adapters from text_generation_server.utils.adapter import AdapterInfo +from text_generation_server.utils.prefill_chunking import set_max_prefill_tokens try: from text_generation_server.models.pali_gemma import PaliGemmaBatch @@ -46,9 +47,12 @@ def __init__(self): signal.signal(signal.SIGINT, self.exit_gracefully) signal.signal(signal.SIGTERM, self.exit_gracefully) + def set_keep_processing(self, value: bool): + self.KEEP_PROCESSING = value + def exit_gracefully(self, signum, frame): print(f"Exiting gracefully: Signal {signum}") - self.KEEP_PROCESSING = False + self.set_keep_processing(False) class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): @@ -96,6 +100,8 @@ async def FilterBatch(self, request, context): return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb()) async def Warmup(self, request, context): + set_max_prefill_tokens(request.max_prefill_tokens) + if self.quantize in {"exl2", "gptq"}: try: # When using GPTQ, Exllama kernels need some global kernels @@ -150,6 +156,18 @@ async def Prefill(self, request, context): request.batch, self.model.tokenizer, self.model.dtype, self.model.device ) + concat_ns = None + if self.model.support_chunking: + if request.HasField("cached_batch"): + cached_batch = self.cache.pop(request.cached_batch.id) + if cached_batch is None: + raise ValueError( + f"Batch ID {request.cached_batch.id} not found in cache." 
+ ) + start_concat = time.time_ns() + batch = self.model.batch_type.concatenate([cached_batch, batch]) + concat_ns = time.time_ns() - start_concat + generations, next_batch, timings = self.model.generate_token(batch) self.cache.set(next_batch) @@ -159,6 +177,7 @@ async def Prefill(self, request, context): forward_ns=timings[0], decode_ns=timings[1], total_ns=time.time_ns() - start, + concat_ns=concat_ns, ) async def Decode(self, request, context): @@ -252,10 +271,12 @@ async def serve_inner( logger.exception("Error when initializing model") raise + signal_handler = SignalHandler() + set_adapter_to_index(adapter_to_index) server = aio.server( interceptors=[ - ExceptionInterceptor(), + ExceptionInterceptor(lambda: signal_handler.set_keep_processing(False)), UDSOpenTelemetryAioServerInterceptor(), ], options=[ @@ -276,7 +297,6 @@ async def serve_inner( await server.start() logger.info("Server started at {}".format(local_url)) - signal_handler = SignalHandler() while signal_handler.KEEP_PROCESSING: await asyncio.sleep(0.5) diff --git a/server/text_generation_server/utils/adapter.py b/server/text_generation_server/utils/adapter.py index 2b61f9bb448..09254b68a43 100644 --- a/server/text_generation_server/utils/adapter.py +++ b/server/text_generation_server/utils/adapter.py @@ -120,15 +120,18 @@ def _load_and_merge( if adapter.id == BASE_MODEL_ADAPTER_ID: raise ValueError("Base model adapter cannot be merged.") - module_map, adapter_config, adapter_weight_names, adapter_tokenizer = ( - load_module_map( - model_id, - adapter.revision, - adapter.id, - adapter.path, - weight_names, - trust_remote_code, - ) + ( + module_map, + adapter_config, + adapter_weight_names, + adapter_tokenizer, + ) = load_module_map( + model_id, + adapter.revision, + adapter.id, + adapter.path, + weight_names, + trust_remote_code, ) adapters_to_merge.append((module_map, adapter_config)) diff --git a/server/text_generation_server/utils/prefill_chunking.py b/server/text_generation_server/utils/prefill_chunking.py new file mode 100644 index 00000000000..c227d30f512 --- /dev/null +++ b/server/text_generation_server/utils/prefill_chunking.py @@ -0,0 +1,24 @@ +from typing import Optional + +SUPPORT_CHUNKING: Optional[bool] = None +MAX_PREFILL_TOKENS: Optional[int] = None + + +def set_support_chunking(support_chunking: bool): + global SUPPORT_CHUNKING + SUPPORT_CHUNKING = support_chunking + + +def get_support_chunking() -> bool: + global SUPPORT_CHUNKING + return SUPPORT_CHUNKING + + +def set_max_prefill_tokens(max_prefill_tokens: int): + global MAX_PREFILL_TOKENS + MAX_PREFILL_TOKENS = max_prefill_tokens + + +def get_max_prefill_tokens() -> int: + global MAX_PREFILL_TOKENS + return MAX_PREFILL_TOKENS diff --git a/server/text_generation_server/utils/segments.py b/server/text_generation_server/utils/segments.py index f5961102108..b3f923694e3 100644 --- a/server/text_generation_server/utils/segments.py +++ b/server/text_generation_server/utils/segments.py @@ -7,6 +7,7 @@ import torch +# FIXME: this should be optimized def find_segments( adapter_indices: Union[torch.Tensor, List[int]] ) -> Tuple[List[int], List[int]]: diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py index 75e01f7ce05..aae64acf3da 100644 --- a/server/text_generation_server/utils/weights.py +++ b/server/text_generation_server/utils/weights.py @@ -197,7 +197,7 @@ def _get_slice(self, tensor_name: str): slice_ = f.get_slice(tensor_name) return slice_ - def _has_tensor(self, tensor_name: str): + def 
has_tensor(self, tensor_name: str): try: self.get_filename(tensor_name) except Exception: @@ -207,7 +207,9 @@ def _has_tensor(self, tensor_name: str): def get_shape(self, tensor_name: str): return self._get_slice(tensor_name).get_shape() - def get_tensor(self, tensor_name: str, to_device=True, to_dtype=True): + def get_tensor( + self, tensor_name: str, to_device: bool = True, to_dtype: bool = True + ) -> torch.Tensor: filename, tensor_name = self.get_filename(tensor_name) f = self._get_handle(filename) tensor = f.get_tensor(tensor_name) diff --git a/tgi-entrypoint.sh b/tgi-entrypoint.sh index ea94dcd9fd4..278c7d961ca 100755 --- a/tgi-entrypoint.sh +++ b/tgi-entrypoint.sh @@ -2,4 +2,4 @@ ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases' -text-generation-launcher $@ +exec text-generation-launcher $@ diff --git a/update_doc.py b/update_doc.py index 203aaced025..6357cc0061b 100644 --- a/update_doc.py +++ b/update_doc.py @@ -172,6 +172,8 @@ def check_openapi(check: bool): # allow for trailing whitespace since it's not significant # and the precommit hook will remove it "lint", + "--skip-rule", + "security-defined", filename, ], capture_output=True,
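
The new prefill_chunking module above is a small registry of process-wide flags: Warmup stores the router's prefill token budget with set_max_prefill_tokens, Model.__init__ decides whether chunking stays enabled and records it with set_support_chunking, and later code reads both values back through the getters. A minimal usage sketch follows, assuming the text_generation_server package from this repository is importable; the concrete values (a 4096-token budget, flashinfer attention, speculation disabled) are made-up examples, not values taken from the diff.

# Illustrative sketch only; values below are examples, not taken from the diff.
from text_generation_server.utils.prefill_chunking import (
    get_max_prefill_tokens,
    get_support_chunking,
    set_max_prefill_tokens,
    set_support_chunking,
)

# Warmup: store the router's prefill token budget once per worker process.
set_max_prefill_tokens(4096)

# Model.__init__: chunking is kept only when PREFILL_CHUNKING is enabled,
# speculation is off and the attention backend is flashinfer/flashdecoding.
prefill_chunking_enabled = True  # mirrors the PREFILL_CHUNKING env flag
speculate = 0
attention = "flashinfer"
support_chunking = (
    prefill_chunking_enabled
    and speculate == 0
    and attention in {"flashinfer", "flashdecoding"}
)
set_support_chunking(support_chunking)

# Later callers (e.g. the Prefill handler deciding whether to concatenate the
# cached batch with the incoming one) read the flags back:
assert get_support_chunking() is True
assert get_max_prefill_tokens() == 4096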