diff --git a/Cargo.lock b/Cargo.lock index 5e85e384869..c1251832983 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2706,9 +2706,9 @@ dependencies = [ [[package]] name = "opentelemetry" -version = "0.23.0" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b69a91d4893e713e06f724597ad630f1fa76057a5e1026c0ca67054a9032a76" +checksum = "4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96" dependencies = [ "futures-core", "futures-sink", @@ -2819,19 +2819,17 @@ dependencies = [ [[package]] name = "opentelemetry_sdk" -version = "0.23.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae312d58eaa90a82d2e627fd86e075cf5230b3f11794e2ed74199ebbe572d4fd" +checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df" dependencies = [ "async-trait", "futures-channel", "futures-executor", "futures-util", "glob", - "lazy_static", "once_cell", - "opentelemetry 0.23.0", - "ordered-float 4.3.0", + "opentelemetry 0.24.0", "percent-encoding", "rand", "thiserror", @@ -4185,16 +4183,17 @@ dependencies = [ "cmake", "cxx", "cxx-build", + "hashbrown 0.14.5", + "hf-hub", "log", - "parking_lot", "pkg-config", "text-generation-router", "thiserror", - "tokenizers 0.19.1", + "tokenizers", "tokio", "tokio-stream", "tracing", - "tracing-opentelemetry 0.24.0", + "tracing-opentelemetry 0.25.0", "tracing-subscriber", ] @@ -4212,7 +4211,7 @@ dependencies = [ "tabled", "text-generation-client", "thiserror", - "tokenizers 0.20.0", + "tokenizers", "tokio", "tracing", "tracing-subscriber", @@ -4292,7 +4291,7 @@ dependencies = [ "serde_json", "sysinfo", "thiserror", - "tokenizers 0.20.0", + "tokenizers", "tokio", "tokio-stream", "tower-http", @@ -4341,7 +4340,7 @@ dependencies = [ "slotmap", "text-generation-router", "thiserror", - "tokenizers 0.20.0", + "tokenizers", "tokio", "tokio-stream", "tonic 0.10.2", @@ -4392,7 +4391,7 @@ dependencies = [ "slotmap", "text-generation-router", "thiserror", - "tokenizers 0.20.0", + "tokenizers", "tokio", "tokio-stream", "tonic 0.10.2", @@ -4514,39 +4513,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" -[[package]] -name = "tokenizers" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e500fad1dd3af3d626327e6a3fe5050e664a6eaa4708b8ca92f1794aaf73e6fd" -dependencies = [ - "aho-corasick", - "derive_builder", - "esaxx-rs", - "getrandom", - "hf-hub", - "indicatif", - "itertools 0.12.1", - "lazy_static", - "log", - "macro_rules_attribute", - "monostate", - "onig", - "paste", - "rand", - "rayon", - "rayon-cond", - "regex", - "regex-syntax 0.8.5", - "serde", - "serde_json", - "spm_precompiled", - "thiserror", - "unicode-normalization-alignments", - "unicode-segmentation", - "unicode_categories", -] - [[package]] name = "tokenizers" version = "0.20.0" @@ -4933,14 +4899,14 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.24.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f68803492bf28ab40aeccaecc7021096bd256baf7ca77c3d425d89b35a7be4e4" +checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b" dependencies = [ "js-sys", "once_cell", - "opentelemetry 0.23.0", - "opentelemetry_sdk 0.23.0", + "opentelemetry 0.24.0", + "opentelemetry_sdk 0.24.1", "smallvec", "tracing", "tracing-core", diff --git a/Dockerfile.trtllm 
b/Dockerfile.trtllm deleted file mode 100644 index 4543ae804ee..00000000000 --- a/Dockerfile.trtllm +++ /dev/null @@ -1,23 +0,0 @@ -# All the tooling for CUDA -FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 AS cuda-builder - -WORKDIR /usr/src/tgi/backends/trtllm -RUN apt update && apt install -y cmake git git-lfs gcc g++ ninja-build libopenmpi-dev python3-dev python3-pip wget - -COPY . /usr/src/tgi -RUN chmod +x scripts/install_tensorrt.sh && scripts/install_tensorrt.sh -RUN cmake -G Ninja -B build -DTRT_LIB_DIR=/usr/local/tensorrt/lib -DTRT_INCLUDE_DIR=/usr/local/tensorrt/include . -RUN cmake --build build --parallel -t tgi_trtllm_backend_impl - -# All the tooling for Rust -FROM lukemathwalker/cargo-chef:latest-rust-1.79 AS chef -WORKDIR /usr/src - -# Include CUDA related libraries and tools to the Rust based image -COPY --from=cuda-builder /usr/local/cuda /usr/local/cuda -COPY --from=cuda-builder /usr/local/tensorrt /usr/local/tensorrt -COPY --from=cuda-builder /usr/src/tgi/backends/trtllm/build /usr/local/tgi/trtllm/build -ENV PATH=/usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib:$LD_LIBRARY_PATH - -RUN apt update && apt install -y cmake git gcc g++ ninja-build libopenmpi3 diff --git a/Dockerfile_amd b/Dockerfile_amd index 4bb6407a31b..b84d4edd802 100644 --- a/Dockerfile_amd +++ b/Dockerfile_amd @@ -327,7 +327,8 @@ ENV ROCM_USE_CUSTOM_PAGED_ATTN=1 ENV PYTORCH_TUNABLEOP_TUNING_AFTER_WARMUP=0 ENV VLLM_MOE_PADDING=0 ENV ATTENTION=paged -ENV USE_PREFIX_CACHING=0 +ENV PREFIX_CACHING=0 +ENV PREFILL_CHUNKING=0 ENV ROCM_USE_SKINNY_GEMM=1 COPY ./tgi-entrypoint.sh /tgi-entrypoint.sh diff --git a/Dockerfile_intel b/Dockerfile_intel index 9b5dd20a79b..96f242489ab 100644 --- a/Dockerfile_intel +++ b/Dockerfile_intel @@ -218,7 +218,8 @@ COPY --from=builder /usr/src/target/release-opt/text-generation-launcher /usr/lo FROM ${PLATFORM} AS final ENV ATTENTION=paged -ENV USE_PREFIX_CACHING=0 +ENV PREFIX_CACHING=0 +ENV PREFILL_CHUNKING=0 ENV CUDA_GRAPHS=0 ENTRYPOINT ["text-generation-launcher"] CMD ["--json-output"] diff --git a/backends/trtllm/Dockerfile b/Dockerfile_trtllm similarity index 85% rename from backends/trtllm/Dockerfile rename to Dockerfile_trtllm index 5fd2f89f25f..3ccb0310bea 100644 --- a/backends/trtllm/Dockerfile +++ b/Dockerfile_trtllm @@ -10,7 +10,7 @@ COPY . . 
RUN cargo chef prepare --recipe-path recipe.json # CUDA dependent dependencies resolver stage -FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu22.04 AS cuda-builder +FROM nvidia/cuda:12.6.1-cudnn-devel-ubuntu22.04 AS cuda-builder RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \ @@ -26,6 +26,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ ninja-build \ pkg-config \ python3 \ + python3-dev \ python3-setuptools \ tar \ wget @@ -42,7 +43,7 @@ RUN wget "https://download.open-mpi.org/release/open-mpi/v4.1/$OMPI_TARBALL_FILE mkdir /usr/src/mpi && \ tar -xf "/opt/src/$OMPI_TARBALL_FILENAME" -C /usr/src/mpi --strip-components=1 && \ cd /usr/src/mpi && \ - ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda && \ + ./configure --prefix=/usr/local/mpi --with-cuda=/usr/local/cuda --with-slurm && \ make -j all && \ make install && \ rm -rf "/opt/src/$OMPI_TARBALL_FILENAME" @@ -82,10 +83,16 @@ RUN mkdir $TGI_INSTALL_PREFIX && mkdir "$TGI_INSTALL_PREFIX/include" && mkdir "$ cd backends/trtllm && \ CMAKE_INSTALL_PREFIX=$TGI_INSTALL_PREFIX cargo build --release -FROM nvidia/cuda:12.5.1-cudnn-runtime-ubuntu22.04 AS runtime +FROM nvidia/cuda:12.6.1-cudnn-runtime-ubuntu22.04 AS runtime +RUN apt update && apt install -y python3-minimal python3-dev python3-pip && \ + rm -rf /var/lib/{apt,dpkg,cache,log}/ && \ + python3 -m pip install transformers tokenizers + WORKDIR /usr/local/tgi/bin -ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" +ENV LD_LIBRARY_PATH="/usr/local/tgi/lib:/usr/local/mpi/lib:/usr/local/tensorrt/lib:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH" +ENV TOKENIZERS_PARALLELISM=false +ENV OMPI_MCA_plm_rsh_agent="" COPY --from=mpi-builder /usr/local/mpi /usr/local/mpi COPY --from=trt-builder /usr/local/tensorrt /usr/local/tensorrt diff --git a/README.md b/README.md index 25dbbd437c6..fb475b097dd 100644 --- a/README.md +++ b/README.md @@ -98,7 +98,7 @@ curl 127.0.0.1:8080/generate_stream \ You can also use [TGI's Messages API](https://huggingface.co/docs/text-generation-inference/en/messages_api) to obtain Open AI Chat Completion API compatible responses. 
```bash -curl localhost:3000/v1/chat/completions \ +curl localhost:8080/v1/chat/completions \ -X POST \ -d '{ "model": "tgi", diff --git a/backends/client/src/v3/client.rs b/backends/client/src/v3/client.rs index 479d31bf290..d43f789e7ca 100644 --- a/backends/client/src/v3/client.rs +++ b/backends/client/src/v3/client.rs @@ -158,7 +158,8 @@ impl Client { // Blocks and slots will be set on the server side if we use paged attention blocks: vec![], slots: vec![], - prefix_len: 0, + cache_len: 0, + chunk_len: None, // Set sampling parameters to also take these ops into account in the max memory parameters: Some(NextTokenChooserParameters { temperature: 0.9, @@ -217,8 +218,13 @@ impl Client { pub async fn prefill( &mut self, batch: Batch, + cached_batch: Option, ) -> Result<(Vec, Option, PrefillTimings)> { - let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context(); + let request = tonic::Request::new(PrefillRequest { + batch: Some(batch), + cached_batch, + }) + .inject_context(); let response = self.stub.prefill(request).await?.into_inner(); Ok(( response.generations, diff --git a/backends/client/src/v3/sharded_client.rs b/backends/client/src/v3/sharded_client.rs index 645c076a26b..854a5895eba 100644 --- a/backends/client/src/v3/sharded_client.rs +++ b/backends/client/src/v3/sharded_client.rs @@ -134,11 +134,12 @@ impl ShardedClient { pub async fn prefill( &mut self, batch: Batch, + cached_batch: Option, ) -> Result<(Vec, Option, PrefillTimings)> { let futures: Vec<_> = self .clients .iter_mut() - .map(|client| Box::pin(client.prefill(batch.clone()))) + .map(|client| Box::pin(client.prefill(batch.clone(), cached_batch.clone()))) .collect(); #[allow(clippy::type_complexity)] let results: Result, Option, PrefillTimings)>> = @@ -245,7 +246,8 @@ impl Health for ShardedClient { // Block 0 is reserved for health checks blocks: vec![0], slots: (0..16).collect(), - prefix_len: 0, + cache_len: 0, + chunk_len: None, adapter_id: None, }; let batch = Batch { @@ -255,7 +257,7 @@ impl Health for ShardedClient { max_tokens: 2, max_blocks: 1, }; - self.clone().prefill(batch).await?; + self.clone().prefill(batch, None).await?; Ok(()) } } diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 425b2d7b9a5..831372cdf99 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -1,5 +1,17 @@ cmake_minimum_required(VERSION 3.20) +if (NOT DEFINED CMAKE_CXX_COMPILER_LAUNCHER AND CMAKE_BUILD_TYPE STREQUAL "Debug") + find_program(CCACHE_EXECUTABLE "ccache") + if (CCACHE_EXECUTABLE) + message(STATUS "Using ccache") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXECUTABLE}" CACHE PATH "Path to ccache" FORCE) + endif () +endif () + +if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0") + cmake_policy(SET CMP0135 NEW) +endif () + project(tgi-trtllm-backend VERSION 1.0.0) set(CMAKE_CXX_STANDARD 20) @@ -14,7 +26,7 @@ set(TGI_TRTLLM_BACKEND_TRT_INCLUDE_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/include" set(TGI_TRTLLM_BACKEND_TRT_LIB_DIR "${TGI_TRTLLM_BACKEND_TRT_ROOT}/lib" CACHE STRING "Path where TensorRT libraries are located") # We are using nvidia-ml to query at runtime device information to enable some architecture-specific features -find_package(CUDAToolkit 12.5 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml) +find_package(CUDAToolkit 12.6 REQUIRED COMPONENTS CUDA::cudart CUDA::nvml) #### External dependencies #### include(cmake/fmt.cmake) diff --git a/backends/trtllm/Cargo.toml b/backends/trtllm/Cargo.toml index 43a114ba4b0..97ef1a76891 100644 --- 
a/backends/trtllm/Cargo.toml +++ b/backends/trtllm/Cargo.toml @@ -10,16 +10,17 @@ async-trait = "0.1" async-stream = "0.3" clap = { version = "4.5", features = ["derive"] } cxx = "1.0" +hashbrown = "0.14" +hf-hub = { workspace = true } log = { version = "0.4", features = [] } text-generation-router = { path = "../../router" } -tokenizers = { version = "0.19", features = ["hf-hub"] } -tokio = { version = "1.38", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] } +tokenizers = { workspace = true } +tokio = { version = "1.39", features = ["rt", "rt-multi-thread", "parking_lot", "signal", "sync"] } tokio-stream = "0.1.15" -thiserror = "1.0.62" +thiserror = "1.0.63" tracing = "0.1" -tracing-opentelemetry = "0.24" +tracing-opentelemetry = "0.25" tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] } -parking_lot = "0.12" [build-dependencies] cmake = "0.1" diff --git a/backends/trtllm/build.rs b/backends/trtllm/build.rs index 08638262438..985019260b8 100644 --- a/backends/trtllm/build.rs +++ b/backends/trtllm/build.rs @@ -6,7 +6,7 @@ use std::path::{absolute, PathBuf}; const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; const CUDA_ARCH_LIST: Option<&str> = option_env!("CUDA_ARCH_LIST"); -const CUDA_REQUIRED_VERSION: &str = "12.5"; +const CUDA_REQUIRED_VERSION: &str = "12.6"; const MPI_REQUIRED_VERSION: &str = "4.1"; const INSTALL_PREFIX: Option<&str> = option_env!("CMAKE_INSTALL_PREFIX"); const TENSORRT_ROOT_DIR: Option<&str> = option_env!("TENSORRT_ROOT_DIR"); @@ -36,7 +36,7 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf // Build the backend implementation through CMake let install_path = INSTALL_PREFIX.unwrap_or("/usr/local/tgi"); let tensorrt_path = TENSORRT_ROOT_DIR.unwrap_or("/usr/local/tensorrt"); - let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("90-real"); // Hopper by default + let cuda_arch_list = CUDA_ARCH_LIST.unwrap_or("75-real;80-real;86-real;89-real;90-real"); let mut install_path = PathBuf::from(install_path); if !install_path.is_absolute() { @@ -81,7 +81,12 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> (PathBuf (PathBuf::from(install_path), deps_folder) } -fn build_ffi_layer(deps_folder: &PathBuf) { +fn build_ffi_layer(deps_folder: &PathBuf, is_debug: bool) { + let ndebug = match is_debug { + true => "1", + false => "0", + }; + CFG.include_prefix = "backends/trtllm"; cxx_build::bridge("src/lib.rs") .static_flag(true) @@ -93,9 +98,14 @@ fn build_ffi_layer(deps_folder: &PathBuf) { .include("/usr/local/tensorrt/include") .file("src/ffi.cpp") .std("c++20") + .define("NDEBUG", ndebug) .compile("tgi_trtllm_backend"); println!("cargo:rerun-if-changed=CMakeLists.txt"); + println!("cargo:rerun-if-changed=cmake/trtllm.cmake"); + println!("cargo:rerun-if-changed=cmake/json.cmake"); + println!("cargo:rerun-if-changed=cmake/fmt.cmake"); + println!("cargo:rerun-if-changed=cmake/spdlog.cmake"); println!("cargo:rerun-if-changed=include/backend.h"); println!("cargo:rerun-if-changed=lib/backend.cpp"); println!("cargo:rerun-if-changed=include/ffi.h"); @@ -115,7 +125,7 @@ fn main() { let (_backend_path, deps_folder) = build_backend(is_debug, opt_level, &out_dir); // Build the FFI layer calling the backend above - build_ffi_layer(&deps_folder); + build_ffi_layer(&deps_folder, is_debug); // Emit linkage search path probe!("ompi", MPI_REQUIRED_VERSION); diff --git a/backends/trtllm/cmake/fmt.cmake b/backends/trtllm/cmake/fmt.cmake index f94a9c5668f..afd6ea5f090 100644 --- 
a/backends/trtllm/cmake/fmt.cmake +++ b/backends/trtllm/cmake/fmt.cmake @@ -1,6 +1,6 @@ FetchContent_Declare( fmt - GIT_REPOSITORY https://github.com/fmtlib/fmt - GIT_TAG 11.0.1 + DOWNLOAD_EXTRACT_TIMESTAMP + URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz ) FetchContent_MakeAvailable(fmt) diff --git a/backends/trtllm/cmake/json.cmake b/backends/trtllm/cmake/json.cmake index 29e5753b341..67eff2fe606 100644 --- a/backends/trtllm/cmake/json.cmake +++ b/backends/trtllm/cmake/json.cmake @@ -1,5 +1,6 @@ fetchcontent_declare( json + DOWNLOAD_EXTRACT_TIMESTAMP URL https://github.com/nlohmann/json/releases/download/v3.11.3/json.tar.xz ) fetchcontent_makeavailable(json) diff --git a/backends/trtllm/cmake/spdlog.cmake b/backends/trtllm/cmake/spdlog.cmake index c4ee5c97a58..7f529a7d29e 100644 --- a/backends/trtllm/cmake/spdlog.cmake +++ b/backends/trtllm/cmake/spdlog.cmake @@ -11,7 +11,7 @@ endif () fetchcontent_declare( spdlog - GIT_REPOSITORY https://github.com/gabime/spdlog.git - GIT_TAG v1.14.1 + DOWNLOAD_EXTRACT_TIMESTAMP + URL https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz ) fetchcontent_makeavailable(spdlog) diff --git a/backends/trtllm/cmake/trtllm.cmake b/backends/trtllm/cmake/trtllm.cmake index e59ad4cf3a6..5f1b6c19c01 100644 --- a/backends/trtllm/cmake/trtllm.cmake +++ b/backends/trtllm/cmake/trtllm.cmake @@ -23,8 +23,9 @@ endif () fetchcontent_declare( trtllm GIT_REPOSITORY https://github.com/NVIDIA/TensorRT-LLM.git - GIT_TAG a681853d3803ee5893307e812530b5e7004bb6e1 + GIT_TAG 201135e58aa525af7e523d091d4c9584229524bc GIT_SHALLOW FALSE + DOWNLOAD_EXTRACT_TIMESTAMP ) fetchcontent_makeavailable(trtllm) diff --git a/backends/trtllm/include/backend.h b/backends/trtllm/include/backend.h index 7990e76b90d..d23f6288964 100644 --- a/backends/trtllm/include/backend.h +++ b/backends/trtllm/include/backend.h @@ -5,6 +5,7 @@ #ifndef TGI_TRTLLM_BACKEND_H #define TGI_TRTLLM_BACKEND_H +#include #include #include #include @@ -19,16 +20,33 @@ using json = nlohmann::json; namespace tle = tensorrt_llm::executor; + +#define CAST_SIZETYPE(x) static_cast(x) + namespace huggingface::tgi::backends { using RequestId = tle::IdType; using TokenId = tle::TokenIdType; + const static auto OUTPUT_CONFIG = tle::OutputConfig(true, false, false, true, false); + constexpr auto FMT_NOT_ENOUGH_GPUS = FMT_STRING( + "Not enough GPUs to allocate requested model (detected: {:d}, required: {:d})"); + constexpr auto FMT_EXECUTOR_STATS = FMT_STRING( + "Submitting inference [{}] to the executor ({:d} already in-flight)"); + constexpr auto FMT_SAMPLING_CONFIG = FMT_STRING( + "Sampling: topK={:d}, topP={:.1f}, temperature={:.1f}, repetition_penalty={:.1f}, frequency_penalty={:.1f}, seed={:d}"); + /** * Initialize all the components required by TRTLLM. 
* It is required to call this function before attempting to load any engine */ void InitializeBackend(); + /** + * Initialize logging mechanism + */ + void InitializeLogging(); + + /** * * @param config TensorRT-LLM configuration object @@ -37,6 +55,14 @@ namespace huggingface::tgi::backends { */ tle::ExecutorConfig GetExecutorConfig(const json &config, const std::string &workerPath); + /** + * + * @param worldSize + * @param workerPath + * @return + */ + tle::ParallelConfig GetParallelConfig(size_t worldSize, std::string workerPath) noexcept; + /** * Get the sampling configuration from the parameters provided by TGI * @param topK @@ -54,7 +80,15 @@ namespace huggingface::tgi::backends { float_t repetition_penalty, float_t frequency_penalty, uint64_t seed - ); + ) noexcept; + + /** + * Attempt to retrieve the + * @param generationConfigPath + * @return + */ + std::optional>> + GetStopWordsFromConfig(const std::filesystem::path &generationConfigPath) noexcept; /** * @@ -64,18 +98,16 @@ namespace huggingface::tgi::backends { const json config; tle::Executor executor; + /** Frequently accessed variables cached here **/ + uint32_t maxNumTokens; + std::list> stopWords; + public: explicit TensorRtLlmBackend( const std::filesystem::path &engineFolder, const std::filesystem::path &executorWorker ); - /** - * Indicate if the backend is ready to accept incoming request - * @return true if ready, false otherwise - */ - [[nodiscard]] bool IsReady() const; - /** * Query the executor for the number of token available for pulling * @return @@ -88,32 +120,23 @@ namespace huggingface::tgi::backends { * @param topK * @param topP * @param temperature - * @param repetition_penalty - * @param frequency_penalty + * @param repetitionPenalty + * @param frequencyPenalty * @param seed * @return Request id related to this generation for reference */ [[nodiscard]] RequestId Submit( const std::vector &tokens, + uint32_t maxNewTokens, int32_t topK, float_t topP, float_t temperature, - float_t repetition_penalty, - float_t frequency_penalty, + float_t repetitionPenalty, + float_t frequencyPenalty, uint64_t seed ); - /** - * - * @param requestId The request id to poll the generation results - * @return - */ - std::vector Poll(RequestId requestId); - - /** - * Stop the underlying executor - */ - void Shutdown(); + [[nodiscard]] std::vector PullNewTokens(); }; } diff --git a/backends/trtllm/include/ffi.h b/backends/trtllm/include/ffi.h index fe0be9fc820..449bcd4d739 100644 --- a/backends/trtllm/include/ffi.h +++ b/backends/trtllm/include/ffi.h @@ -5,19 +5,30 @@ #ifndef TGI_TRTLLM_BACKEND_FFI_H #define TGI_TRTLLM_BACKEND_FFI_H +#include #include +#include #include "backend.h" namespace huggingface::tgi::backends { class TensorRtLlmBackendImpl; } -#include "backends/trtllm/src/lib.rs.h" +// Template to support returning error from TllmException back to Rust in a Result<> +#include +namespace rust::behavior { + template + static void trycatch(Try &&func, Fail &&fail) noexcept try { + func(); + } catch (tensorrt_llm::common::TllmException &e) { + fail(e.what()); + } +} -namespace huggingface::tgi::backends { +#include "backends/trtllm/src/lib.rs.h" -// struct GenerationContext; +namespace huggingface::tgi::backends { class TensorRtLlmBackendImpl : public TensorRtLlmBackend { public: @@ -28,15 +39,10 @@ namespace huggingface::tgi::backends { */ TensorRtLlmBackendImpl(const std::string_view &engineFolder, const std::string_view &executorWorker); - /*** - * - * @return - */ - bool IsReady() const; - /*** * * @param tokens + * 
@param maxNewTokens * @param topK * @param topP * @param temperature @@ -47,21 +53,15 @@ namespace huggingface::tgi::backends { */ [[nodiscard("returned request id should be used to refer to the request's generation result later on")]] uint64_t - Submit(rust::Slice tokens, int32_t topK, float_t topP, float_t temperature, + Submit(rust::Slice tokens, uint32_t maxNewTokens, + int32_t topK, float_t topP, float_t temperature, float_t repetition_penalty, float_t frequency_penalty, uint64_t seed); /*** * - * @param requestId - * @param ctx - * @param callback * @return */ - size_t StreamTokens( - const RequestId requestId, - huggingface::tgi::backends::GenerationContext *ctx, - rust::Fn callback); + std::unique_ptr> PullTokens(); }; /*** diff --git a/backends/trtllm/include/hardware.h b/backends/trtllm/include/hardware.h index da0bf4f3c0d..9633495f4fd 100644 --- a/backends/trtllm/include/hardware.h +++ b/backends/trtllm/include/hardware.h @@ -14,7 +14,7 @@ namespace huggingface::hardware::cuda { #define AMPERE_SM_MAJOR 8 -#define HOPPER_SM_MAJOR 8 +#define HOPPER_SM_MAJOR 9 /** * Store information about the version of the CUDA Compute Capabilities detected on the device @@ -23,9 +23,9 @@ namespace huggingface::hardware::cuda { int32_t major; int32_t minor; - [[nodiscard]] constexpr bool isPostAmpere() const { return major >= AMPERE_SM_MAJOR; } + [[nodiscard]] constexpr bool IsPostAmpere() const { return major >= AMPERE_SM_MAJOR; } - [[nodiscard]] constexpr bool isPostHopper() const { return major >= HOPPER_SM_MAJOR; } + [[nodiscard]] constexpr bool IsPostHopper() const { return major >= HOPPER_SM_MAJOR; } }; CudaComputeCapabilities GetCudaComputeCapabilities() { diff --git a/backends/trtllm/lib/backend.cpp b/backends/trtllm/lib/backend.cpp index c066a6d6eab..4dd41de0072 100644 --- a/backends/trtllm/lib/backend.cpp +++ b/backends/trtllm/lib/backend.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -7,11 +8,33 @@ #include "backend.h" #include "hardware.h" + +void huggingface::tgi::backends::InitializeLogging() { +#ifdef NDEBUG + if (const auto TRTLLM_LOG_LEVEL_CSTR = std::getenv("TRTLLM_LOG_LEVEL")) { + std::string log_level(TRTLLM_LOG_LEVEL_CSTR); + std::transform(log_level.begin(), log_level.end(), log_level.begin(), [](unsigned char c) { + return std::tolower(c); + }); + + if (log_level == "debug") + spdlog::set_level(spdlog::level::debug); + else + spdlog::set_level(spdlog::level::info); + } +#else + spdlog::set_level(spdlog::level::debug); +#endif +} + void huggingface::tgi::backends::InitializeBackend() { SPDLOG_INFO("Initializing Backend..."); nvmlInit_v2(); initTrtLlmPlugins(); + InitializeLogging(); + + SPDLOG_INFO("Backend Executor Version: {}", tle::version()); const auto numGpus = huggingface::hardware::cuda::GetNumDevices(); if (numGpus.has_value()) { SPDLOG_INFO("Detected {:d} Nvidia GPU(s)", numGpus.value()); @@ -20,47 +43,49 @@ void huggingface::tgi::backends::InitializeBackend() { } } +[[nodiscard]] +tle::ParallelConfig +huggingface::tgi::backends::GetParallelConfig(const size_t worldSize, const std::string workerPath) noexcept { + auto mode = tle::CommunicationMode::kLEADER; + std::optional orchestratorConfig = std::nullopt; + + if (worldSize > 1) { + SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode"); + mode = tle::CommunicationMode::kORCHESTRATOR; + orchestratorConfig = std::make_optional(true, workerPath, nullptr, true); + } else { + SPDLOG_INFO("Detected single engine deployment, using leader mode"); + } + + return 
tle::ParallelConfig(tle::CommunicationType::kMPI, mode, std::nullopt, std::nullopt, orchestratorConfig); +} + [[nodiscard]] tle::ExecutorConfig huggingface::tgi::backends::GetExecutorConfig(const json &config, const std::string &workerPath) { - tle::ExecutorConfig execConfig(1); + tle::ExecutorConfig execConfig(/* maxBeamWidth = */ 1); // Retrieve the compute capabilities to enable some options at runtime const auto computeCapabilities = huggingface::hardware::cuda::GetCudaComputeCapabilities(); // Single engine (TP = PP = 1) -> using leader mode (no MPI involved) - if (config["/pretrained_config/mapping/world_size"_json_pointer].get() == 1) { - SPDLOG_INFO("Detected single engine deployment, using leader mode"); - execConfig.setParallelConfig(tle::ParallelConfig( - tle::CommunicationType::kMPI, - tle::CommunicationMode::kLEADER, - std::nullopt, - std::nullopt, - std::nullopt - )); - } else { // Multiple engines -> using orchestrator mode (MPI involved) - SPDLOG_INFO("Detected sharded engine deployment, using orchestrator mode"); - execConfig.setParallelConfig(tle::ParallelConfig( - tle::CommunicationType::kMPI, - tle::CommunicationMode::kORCHESTRATOR, - std::nullopt, - std::nullopt, - tle::OrchestratorConfig(true, workerPath, nullptr, true) - )); - } + const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get(); + execConfig.setParallelConfig(GetParallelConfig(worldSize, workerPath)); // Define some configuration variables execConfig.setKvCacheConfig(tle::KvCacheConfig(true)); - execConfig.setEnableChunkedContext(computeCapabilities.isPostAmpere()); + execConfig.setEnableChunkedContext(computeCapabilities.IsPostAmpere()); + execConfig.setSchedulerConfig(tle::SchedulerConfig(tle::CapacitySchedulerPolicy::kMAX_UTILIZATION)); return execConfig; } tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig( - uint32_t topK, - float_t topP, - float_t temperature, - float_t repetition_penalty, - float_t frequency_penalty, - uint64_t seed) { + const uint32_t topK, + const float_t topP, + const float_t temperature, + const float_t repetition_penalty, + const float_t frequency_penalty, + const uint64_t seed) noexcept { + return tle::SamplingConfig( 1, // TGI only use a single beam topK, @@ -78,69 +103,101 @@ tle::SamplingConfig huggingface::tgi::backends::GetSamplingConfig( ); } +std::optional>> +huggingface::tgi::backends::GetStopWordsFromConfig( + const std::filesystem::path &generationConfigPath) noexcept { + if (exists(generationConfigPath)) { + const auto generationConfig = json::parse(std::ifstream(generationConfigPath)); + if (const auto eosTokenIds = generationConfig["/eos_token_id"_json_pointer]; eosTokenIds.is_array()) { + SPDLOG_INFO(FMT_STRING("Found {:d} EOS tokens"), eosTokenIds.size()); + std::list> stopWords(eosTokenIds.size()); + + const auto to_single_token = [](const auto tokenIdObj) -> decltype(stopWords)::value_type { + return {tokenIdObj.template get()}; + }; + + std::transform(eosTokenIds.cbegin(), eosTokenIds.cend(), stopWords.begin(), to_single_token); + return stopWords; + } else { + SPDLOG_INFO("Invalid EOS tokens entry found (not an array)"); + } + } else { + SPDLOG_INFO("No EOS tokens found, generation_config.json doesn't exist"); + } + + return std::nullopt; +} + huggingface::tgi::backends::TensorRtLlmBackend::TensorRtLlmBackend( const std::filesystem::path &enginesFolder, const std::filesystem::path &executorWorker ) : config(json::parse(std::ifstream(enginesFolder / "config.json"))), - executor( - enginesFolder, - 
tensorrt_llm::executor::ModelType::kDECODER_ONLY, - GetExecutorConfig(config, executorWorker.string() - )) { - SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get_ref()); -} + executor(enginesFolder, tensorrt_llm::executor::ModelType::kDECODER_ONLY, + GetExecutorConfig(config, executorWorker.string())) { + + SPDLOG_INFO(FMT_STRING("Engine (version={})"), config["/version"_json_pointer].get()); + + // Ensure we have enough GPUs on the system + const auto worldSize = config["/pretrained_config/mapping/world_size"_json_pointer].get(); + const auto numGpus = huggingface::hardware::cuda::GetNumDevices().value_or(0); + if (numGpus < worldSize) { + SPDLOG_CRITICAL(FMT_NOT_ENOUGH_GPUS, numGpus, worldSize); + // todo : raise exception to catch on rust side + } + + // Cache variables + maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get(); -bool huggingface::tgi::backends::TensorRtLlmBackend::IsReady() const { - return executor.canEnqueueRequests(); + // Attempt to discover stopWords from the generation_config.json + const auto generationConfigPath = enginesFolder / "generation_config.json"; + stopWords = GetStopWordsFromConfig(generationConfigPath).value_or(std::list>()); } [[nodiscard("Returned number of requests needs to be consumed")]] size_t huggingface::tgi::backends::TensorRtLlmBackend::NumResponsesReady() const { +#ifdef NDEBUG return executor.getNumResponsesReady(); +#else + const auto numResponses = executor.getNumResponsesReady(); + if (numResponses > 0) SPDLOG_INFO(FMT_STRING("Num responses ready: {:d}"), numResponses); + return numResponses; +#endif } [[nodiscard("Returned request id needs to be provided back to gather generated tokens")]] tle::IdType huggingface::tgi::backends::TensorRtLlmBackend::Submit( const std::vector &tokens, + const uint32_t maxNewTokens, const int32_t topK, const float_t topP, const float_t temperature, - const float_t repetition_penalty, - const float_t frequency_penalty, + const float_t repetitionPenalty, + const float_t frequencyPenalty, const uint64_t seed ) { -#ifdef NDEBUG - SPDLOG_DEBUG( - FMT_STRING("Submitting inference over {:d} tokens to the executor ({:d} already in-flight)"), - tokens.size(), - executor.getLatestIterationStats().back().numActiveRequests - ); -#else - SPDLOG_DEBUG( - FMT_STRING("Submitting inference [{}] to the executor ({:d} already in-flight)"), - fmt::join(tokens, ", "), - executor.getLatestIterationStats().front().numActiveRequests - ); + const auto maxNewTokensChecked = std::min(maxNewTokens, static_cast(maxNumTokens - tokens.size())); +#ifndef NDEBUG + { + const auto &iterations = executor.getLatestIterationStats(); + const auto &lastIteration = iterations.front(); + + SPDLOG_DEBUG(FMT_EXECUTOR_STATS, fmt::join(tokens, ", "), lastIteration.numActiveRequests); + SPDLOG_DEBUG(FMT_SAMPLING_CONFIG, topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed); + SPDLOG_DEBUG(FMT_STRING("Asking for max_new_tokens={:d}"), maxNewTokensChecked); + } #endif - const auto maxNumTokens = config["/build_config/max_num_tokens"_json_pointer].get(); - const auto maxNewTokens = static_cast(std::max(1ul, maxNumTokens - tokens.size())); + const auto sampling = GetSamplingConfig(topK, topP, temperature, repetitionPenalty, frequencyPenalty, seed); - const auto sampling = GetSamplingConfig(topK, topP, temperature, repetition_penalty, frequency_penalty, seed); - const auto output = tle::OutputConfig(true, false, false, true, false); - return executor.enqueueRequest( - tle::Request{tokens, 
maxNewTokens, true, sampling, output}); -} + // Build the request + auto request = tle::Request{tokens, CAST_SIZETYPE(maxNewTokensChecked), true, sampling, OUTPUT_CONFIG}; + request.setStopWords(stopWords); -[[nodiscard("Generated tokens result must be used")]] -std::vector huggingface::tgi::backends::TensorRtLlmBackend::Poll(const tle::IdType requestId) { - SPDLOG_DEBUG(FMT_STRING("Polling status for request {:d}"), requestId); - return executor.awaitResponses(requestId); + // Submit to the executor for batching + return executor.enqueueRequest(request); } - -void huggingface::tgi::backends::TensorRtLlmBackend::Shutdown() { - SPDLOG_INFO("Shutting down executor"); - executor.shutdown(); +std::vector huggingface::tgi::backends::TensorRtLlmBackend::PullNewTokens() { + return executor.awaitResponses(); } diff --git a/backends/trtllm/scripts/install_tensorrt.sh b/backends/trtllm/scripts/install_tensorrt.sh index e0e2dd17b45..4c2dc26b6bf 100755 --- a/backends/trtllm/scripts/install_tensorrt.sh +++ b/backends/trtllm/scripts/install_tensorrt.sh @@ -2,12 +2,13 @@ set -ex -TRT_VER="10.2.0.19" -CUDA_VER="12.5" -CUDNN_VER="9.2.1.18-1" -NCCL_VER="2.22.3-1+cuda12.5" -CUBLAS_VER="12.5.3.2-1" -NVRTC_VER="12.5.82-1" +TRT_VER_BASE="10.4.0" +TRT_VER_FULL="${TRT_VER_BASE}.26" +CUDA_VER="12.6" +CUDNN_VER="9.5.0.50-1" +NCCL_VER="2.22.3-1+cuda12.6" +CUBLAS_VER="12.6.3.3-1" +NVRTC_VER="12.6.77-1" for i in "$@"; do case $i in @@ -32,8 +33,9 @@ install_ubuntu_requirements() { ARCH=$(uname -m) if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi if [ "$ARCH" = "aarch64" ];then ARCH="sbsa";fi - curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${ARCH}/cuda-keyring_1.0-1_all.deb - dpkg -i cuda-keyring_1.0-1_all.deb + curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH}/cuda-keyring_1.1-1_all.deb + dpkg -i cuda-keyring_1.1-1_all.deb + rm /etc/apt/sources.list.d/cuda-ubuntu2404-x86_64.list apt-get update if [[ $(apt list --installed | grep libcudnn9) ]]; then @@ -71,7 +73,7 @@ install_centos_requirements() { install_tensorrt() { #PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))') #PARSED_PY_VERSION=$(echo "${PY_VERSION//./}") - TRT_CUDA_VERSION="12.5" + TRT_CUDA_VERSION="12.6" if [ -z "$RELEASE_URL_TRT" ];then ARCH=${TRT_TARGETARCH} @@ -79,12 +81,12 @@ install_tensorrt() { if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi - if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04" && OS="ubuntu-22.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi - RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz + if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-24.04" && OS="ubuntu-24.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi + RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/${TRT_VER_BASE}/tars/TensorRT-${TRT_VER_FULL}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz fi wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar tar -xf /tmp/TensorRT.tar -C /usr/local/ - mv /usr/local/TensorRT-${TRT_VER} /usr/local/tensorrt + mv /usr/local/TensorRT-${TRT_VER_FULL} /usr/local/tensorrt # pip3 install /usr/local/tensorrt/python/tensorrt-*-cp${PARSED_PY_VERSION}-*.whl rm -rf /tmp/TensorRT.tar } diff --git 
a/backends/trtllm/src/backend.rs b/backends/trtllm/src/backend.rs deleted file mode 100644 index b23aa6c01fe..00000000000 --- a/backends/trtllm/src/backend.rs +++ /dev/null @@ -1,330 +0,0 @@ -use std::future::Future; -use std::path::Path; -use std::pin::{pin, Pin}; -use std::str::FromStr; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::{Arc, OnceLock}; -use std::task::{Context, Poll}; -use std::time::Duration; - -use async_trait::async_trait; -use cxx::UniquePtr; -use log::{error, warn}; -use tokenizers::Tokenizer; -use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; -use tokio::time::{sleep, Instant}; -use tokio_stream::wrappers::UnboundedReceiverStream; -use tokio_stream::{Stream, StreamExt}; -use tracing::{instrument, span, Level}; - -// use tokio::sync::RwLock; -use parking_lot::RwLock; -use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; -use text_generation_router::validation::ValidationError::UnsupportedModality; -use text_generation_router::validation::{Chunk, ValidGenerateRequest, ValidationError}; -use text_generation_router::{FinishReason, Token}; - -use crate::errors::TensorRtLlmBackendError; -use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl}; - -// Value used to poll the state of the generation stream -static POLLING_INTERVAL_US: OnceLock = OnceLock::new(); - -type InferResult = Result; - -pub(crate) struct Generation { - executor: Arc>>, - done: Arc, -} - -/// Holds the user provided input to be executed along with a channel allowing -/// to bubble up all the generated tokens for that tokens the to end stream. -pub struct GenerationContext { - sender: UnboundedSender>, - tokenizer: Arc, - tokens: Vec, - done: Arc, - queued: Instant, - start: Option, -} - -impl Stream for Generation { - type Item = usize; - - fn poll_next(self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll> { - let interval = POLLING_INTERVAL_US.get_or_init(|| { - u64::from_str(option_env!("TRTLLM_BACKEND_POLLING_INTERVAL_US").unwrap_or("100")) - .expect("Invalid value provided for envvar POLLING_INTERVAL_US") - }); - - if !self.done.load(Ordering::Relaxed) { - let backend = pin!(self.executor.read()); - let status = match backend.poll(ctx) { - Poll::Ready(executor_r) => { - let ready = executor_r.num_responses_ready(); - if ready == 0 { - Poll::Pending - } else { - Poll::Ready(Some(ready)) - } - } - Poll::Pending => Poll::Pending, - }; - - let waker = ctx.waker().clone(); - tokio::spawn(async { - sleep(Duration::from_micros(*interval)).await; - waker.wake(); - }); - - status - } else { - Poll::Ready(None) // end of stream - } - } - - fn size_hint(&self) -> (usize, Option) { - (1, None) - } -} - -unsafe impl Send for TensorRtLlmBackendImpl {} -unsafe impl Sync for TensorRtLlmBackendImpl {} - -/// Implements the logic to execute generation with TensorRT-LLM executor API in background -pub struct TensorRtLlmBackend { - tokenizer: Arc, - - // Backing the backend behind a RwLock to allow concurrent read access to retrieve - // the number of available tokens (read only) in the Generation stream - backend: Arc>>, -} - -impl TensorRtLlmBackend { - pub fn new + Send + 'static, PP: AsRef + Send + 'static>( - tokenizer: Tokenizer, - engine_folder: P, - executor_worker_path: PP, - ) -> Result { - Ok(TensorRtLlmBackend { - tokenizer: Arc::new(tokenizer), - backend: Arc::new(RwLock::new(create_tensorrt_llm_backend( - engine_folder.as_ref().to_str().unwrap(), - executor_worker_path.as_ref().to_str().unwrap(), - ))), - }) 
- } - - fn validate(request: &ValidGenerateRequest) -> InferResult<&String> { - if request.top_n_tokens > 1 { - return Err(InferError::ValidationError( - ValidationError::TopNTokensDisabled, - )); - } - - // TODO: Is it really needed? How can it be validated before? - if request.parameters.grammar.is_some() { - return Err(InferError::ValidationError(ValidationError::Grammar)); - } - - match request.inputs.len() { - 0 => Err(InferError::ValidationError(ValidationError::EmptyInput)), - 2.. => Err(InferError::GenerationError( - "TensorRT-LLM backend don't support multi-chunk".into(), - )), - 1 => match request.inputs.first().expect("Single item-chunk") { - Chunk::Text(text) => Ok(text), - Chunk::Image(_) => Err(InferError::ValidationError(UnsupportedModality("image"))), - }, - } - } - - fn generate( - &self, - sender: UnboundedSender>, - tokens: Vec, - top_k: u32, - top_p: f32, - temperature: f32, - repetition_penalty: f32, - frequency_penalty: f32, - seed: u64, - ) { - let tokenizer = Arc::clone(&self.tokenizer); - let executor = Arc::clone(&self.backend); - - // Let's push this in async context - tokio::spawn(async move { - // Define the generation state - let mut generation = Generation { - executor: executor.clone(), - done: Arc::new(AtomicBool::new(false)), - }; - - // Define the context over the generation - // TODO(asap): Do we really need so many shared-ownership? - let ctx = Box::new(GenerationContext { - sender: sender.clone(), - tokenizer, - tokens: vec![], - done: Arc::clone(&generation.done), - start: None, - queued: Instant::now(), - }); - - // We are leaking the context on-purpose to avoid the box being dropped while there are - // still computation ongoing - // TODO(asap): Can we achieve the same with an Arc> without the need to go unsafe? 
- let ctx_ = Box::leak(ctx); - - // Submit the request to the batcher - let request_id = span!(Level::DEBUG, "submit") - .in_scope(|| async { - let mut handle = executor.write().await; - let request_id = handle.pin_mut().submit( - &tokens, - top_k as i32, - top_p, - temperature, - repetition_penalty, - frequency_penalty, - seed, - ); - - request_id - }) - .await; - - while let Some(_) = generation.next().await { - let mut executor_w = executor.write().await; - let executor = executor_w.pin_mut(); - - span!(Level::DEBUG, "decode") - .in_scope(|| async { - unsafe { - executor.stream_tokens( - request_id, - ctx_, - |ctx: *mut GenerationContext, step: GenerationStep| { - let inner_ctx = &mut *ctx; - - // Update the timestamp at which the request started effectively - // Can be a bit off, would need to be before the callback, let's see - inner_ctx.start.get_or_insert(Instant::now()); - inner_ctx.done.store(step.is_final, Ordering::Relaxed); - - // Ensure we are not running into errors - let parcel = if !step.has_error { - // Insert the latest generated token to the tracker - inner_ctx.tokens.push(step.token_id); - - // Decode the token - let text = inner_ctx - .tokenizer - .decode(&[step.token_id], true) - .expect("Failed to decode token"); - - let special = inner_ctx - .tokenizer - .get_added_vocabulary() - .is_special_token(&text); - - // Create the structure holding the token - let token = Token { - id: step.token_id, - text, - logprob: step.log_prob, - special, - }; - - if step.is_final { - let generated_text = inner_ctx - .tokenizer - .decode(&inner_ctx.tokens, true) - .expect("Failed to decode generated_tokens"); - - Ok(InferStreamResponse::End { - token, - top_tokens: vec![], - generated_text: GeneratedText { - text: generated_text, - generated_tokens: inner_ctx.tokens.len() as u32, - finish_reason: FinishReason::EndOfSequenceToken, - seed: None, - }, - start: inner_ctx.start.unwrap_or(Instant::now()), - queued: inner_ctx.queued, - }) - } else { - Ok(InferStreamResponse::Intermediate { - token, - top_tokens: vec![], - }) - } - } else { - error!("Error caught while decoding: {}", &step.error_msg); - Err(InferError::GenerationError(step.error_msg)) - }; - - // Send the parcel to the client - inner_ctx - .sender - .send(parcel) - .expect("Failed to sent msg through the channel"); - }, - ); - } - }) - .await; - } - - // "Properly" free the shared context... 
- // TODO: clean that piece of sh** asap - unsafe { - let _ = Box::from_raw(ctx_); - } - }); - } -} - -#[async_trait] -impl Backend for TensorRtLlmBackend { - #[instrument(skip_all)] - fn schedule( - &self, - request: ValidGenerateRequest, - ) -> InferResult>> { - // Let's add a few more validation - let input = TensorRtLlmBackend::validate(&request)?; - - // Channel to stream the generated token as they come from the worker thread back to the transport layer - let (sender, receiver) = unbounded_channel(); - - // Unpack parameters - let params = &request.parameters; - - // Preprocess the inputs to send to TRTLLM backend - let encoding = self - .tokenizer - .encode(input.as_str(), true) - .map_err(|e| InferError::GenerationError(e.to_string()))?; - - // Generate the response - self.generate( - sender, - Vec::from(encoding.get_ids()), - params.top_k, - params.top_p, - params.temperature, - params.repetition_penalty, - params.frequency_penalty, - params.seed, - ); - - Ok(UnboundedReceiverStream::new(receiver)) - } - - async fn health(&self, _current_health: bool) -> bool { - true - } -} diff --git a/backends/trtllm/src/errors.rs b/backends/trtllm/src/errors.rs index a672d2a406b..812fd6e30d8 100644 --- a/backends/trtllm/src/errors.rs +++ b/backends/trtllm/src/errors.rs @@ -1,9 +1,16 @@ +use std::path::PathBuf; use thiserror::Error; use text_generation_router::server; #[derive(Debug, Error)] pub enum TensorRtLlmBackendError { + #[error("Provided engine folder {0} doesn't exist")] + EngineFolderDoesntExists(PathBuf), + #[error("Provided executorWorker binary path {0} doesn't exist")] + ExecutorWorkerNotFound(PathBuf), + #[error("TensorRT-LLM Runtime error: {0}")] + Runtime(String), #[error("Tokenizer error: {0}")] Tokenizer(String), #[error("Argument validation error: {0}")] diff --git a/backends/trtllm/src/ffi.cpp b/backends/trtllm/src/ffi.cpp index d6317a68c89..0a92c050f65 100644 --- a/backends/trtllm/src/ffi.cpp +++ b/backends/trtllm/src/ffi.cpp @@ -3,11 +3,13 @@ // #pragma once -#include +#include #include #include +#include #include #include +#include #include #include @@ -20,61 +22,64 @@ huggingface::tgi::backends::TensorRtLlmBackendImpl::TensorRtLlmBackendImpl( ) : TensorRtLlmBackend(engineFolder, executorWorker) {} -bool huggingface::tgi::backends::TensorRtLlmBackendImpl::IsReady() const { - return TensorRtLlmBackend::IsReady(); -} - uint64_t huggingface::tgi::backends::TensorRtLlmBackendImpl::Submit( - rust::Slice tokens, int32_t topK, float_t topP, float_t temperature, float_t repetition_penalty, - float_t frequency_penalty, uint64_t seed) { + rust::Slice tokens, + uint32_t maxNewTokens, + int32_t topK, + float_t topP, + float_t temperature, + float_t repetition_penalty, + float_t frequency_penalty, + uint64_t seed) { // This will copy all the items from the initial slice - std::vector tokens_(std::make_move_iterator(tokens.begin()), std::make_move_iterator(tokens.end())); + std::vector tokens_(tokens.begin(), tokens.end()); return TensorRtLlmBackend::Submit( - std::move(tokens_), topK, topP, temperature, repetition_penalty, frequency_penalty, seed); + std::move(tokens_), maxNewTokens, topK, topP, temperature, repetition_penalty, frequency_penalty, seed); } -size_t huggingface::tgi::backends::TensorRtLlmBackendImpl::StreamTokens( - const uint64_t requestId, - huggingface::tgi::backends::GenerationContext *ctx, - rust::Fn callback) { - - size_t numTokens = 0; - for (const auto &item: Poll(requestId)) { - GenerationStep step; - if (!item.hasError()) { - SPDLOG_DEBUG("\tStreamTokens -> 
Decoding token..."); - const auto decoded = item.getResult(); - - const auto token = decoded.outputTokenIds[0][0]; - const auto isFinal = decoded.isFinal; - const auto logProb = decoded.logProbs.value()[0][0]; - - ++numTokens; - - SPDLOG_DEBUG(FMT_STRING("\tStreamTokens -> {:d} {:.2f} (final = {})"), token, logProb, isFinal); - step = huggingface::tgi::backends::GenerationStep{ - static_cast(token), logProb, isFinal, false, std::move(std::string()) +std::unique_ptr> +huggingface::tgi::backends::TensorRtLlmBackendImpl::PullTokens() { + const auto responses = TensorRtLlmBackend::PullNewTokens(); + + auto steps = std::make_unique>(); + steps->reserve(responses.size()); + +#ifndef NDEBUG + SPDLOG_DEBUG(FMT_STRING("Pulled out {:d} new tokens"), responses->size()); +#endif + + // Transform tle::Response to GenerationStep + std::ranges::transform(responses.begin(), responses.end(), std::back_inserter(*steps), [](const tle::Response &r) { + const auto reqId = r.getRequestId(); + if (!r.hasError()) { + const auto result = r.getResult(); + return GenerationStep{ + reqId, + static_cast(result.outputTokenIds[0][0]), + result.logProbs.value()[0][0], + result.isFinal, + false, + std::string() }; - SPDLOG_DEBUG("\tStreamTokens -> Post callback"); } else { - // TODO : Return rest::Result with error - const auto what = item.getErrorMsg(); - SPDLOG_WARN("\tStreamTokens -> Got error while decoding: {}", what); - step = huggingface::tgi::backends::GenerationStep{ - std::numeric_limits::max(), 0.0, true, true, std::move(what) + return GenerationStep{ + reqId, + 0, + 0.0, + true, + true, + std::move(r.getErrorMsg()) }; } + }); - callback(std::move(ctx), std::move(step)); - } - - return numTokens; + return steps; } std::unique_ptr huggingface::tgi::backends::CreateTensorRtLlmBackend(rust::Str engineFolder, rust::Str executorWorker) { + SPDLOG_INFO("Creating TensorRT-LLM Backend"); // Unconditionally call this to initialize and discover TRTLLM plugins InitializeBackend(); diff --git a/backends/trtllm/src/lib.rs b/backends/trtllm/src/lib.rs index 1a804f88973..edd8caff154 100644 --- a/backends/trtllm/src/lib.rs +++ b/backends/trtllm/src/lib.rs @@ -1,14 +1,16 @@ -pub use backend::{GenerationContext, TensorRtLlmBackend}; +pub use looper::TensorRtLlmBackendV2; -mod backend; pub mod errors; +mod looper; +mod utils; #[cxx::bridge(namespace = "huggingface::tgi::backends")] mod ffi { - /// Struct used as shared type between rust and C++ to represent the result /// of a single decoding iteration + #[derive(Debug, Clone)] pub struct GenerationStep { + request_id: u64, token_id: u32, log_prob: f32, is_final: bool, @@ -16,10 +18,6 @@ mod ffi { error_msg: String, } - extern "Rust" { - type GenerationContext; - } - unsafe extern "C++" { include!("backends/trtllm/src/ffi.cpp"); @@ -44,10 +42,7 @@ mod ffi { fn CreateTensorRtLlmBackend( engine_folder: &str, executor_worker: &str, - ) -> UniquePtr; - - // #[rust_name = "is_ready"] - // fn IsReady(self: &TensorRtLlmBackendImpl) -> bool; + ) -> Result>; #[rust_name = "num_responses_ready"] fn NumResponsesReady(self: &TensorRtLlmBackendImpl) -> usize; @@ -56,23 +51,18 @@ mod ffi { fn Submit( self: Pin<&mut TensorRtLlmBackendImpl>, tokens: &[u32], + max_new_tokens: u32, top_k: i32, top_p: f32, temperature: f32, repetition_penalty: f32, frequency_penalty: f32, seed: u64, - ) -> u64; + ) -> Result; - #[rust_name = "stream_tokens"] - unsafe fn StreamTokens( + #[rust_name = "pull_tokens"] + fn PullTokens( self: Pin<&mut TensorRtLlmBackendImpl>, - request_id: u64, - ctx: *mut 
GenerationContext,
-            cb: unsafe fn(*mut GenerationContext, GenerationStep),
-        ) -> usize;
-
-        // #[rust_name = "shutdown"]
-        // fn Shutdown(self: Pin<&mut TensorRtLlmBackendImpl>);
+        ) -> Result<UniquePtr<CxxVector<GenerationStep>>>;
     }
 }
diff --git a/backends/trtllm/src/looper.rs b/backends/trtllm/src/looper.rs
new file mode 100644
index 00000000000..e26155c163c
--- /dev/null
+++ b/backends/trtllm/src/looper.rs
@@ -0,0 +1,382 @@
+use std::hint;
+use std::ops::Deref;
+use std::path::Path;
+
+use async_trait::async_trait;
+use cxx::UniquePtr;
+use hashbrown::HashMap;
+use tokenizers::Tokenizer;
+use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender};
+use tokio::sync::TryAcquireError;
+use tokio::task::{spawn_blocking, JoinHandle};
+use tokio::time::Instant;
+use tokio_stream::wrappers::UnboundedReceiverStream;
+use tracing::{debug, error, warn};
+
+use text_generation_router::infer::InferError::{GenerationError, ValidationError};
+use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse};
+use text_generation_router::validation::ValidationError::{
+    EmptyInput, Grammar, TopNTokensDisabled, UnsupportedModality,
+};
+use text_generation_router::validation::{Chunk, ValidGenerateRequest};
+use text_generation_router::{FinishReason, Token};
+
+use crate::errors::TensorRtLlmBackendError;
+use crate::ffi::{create_tensorrt_llm_backend, GenerationStep, TensorRtLlmBackendImpl};
+use crate::utils::first_line;
+
+type InferResult<T> = Result<T, InferError>;
+
+/// Wraps the request along with the channel used to stream the decoded tokens back to the client
+struct GenerationContext {
+    request: ValidGenerateRequest,
+    start: Option<Instant>,
+    queued: Instant,
+    streamer: UnboundedSender<InferResult<InferStreamResponse>>,
+}
+
+#[derive(Debug, Copy, Clone)]
+struct DecodedToken {
+    id: u32,
+    log_prob: f32,
+    is_final: bool,
+}
+
+impl<'step> TryFrom<&'step GenerationStep> for DecodedToken {
+    type Error = InferError;
+
+    fn try_from(step: &'step GenerationStep) -> Result<Self, Self::Error> {
+        if !step.has_error {
+            Ok(Self {
+                id: step.token_id,
+                log_prob: step.log_prob,
+                is_final: step.is_final,
+            })
+        } else {
+            Err(GenerationError(step.error_msg.clone()))
+        }
+    }
+}
+
+/// Wraps the decoded token with the channel used to stream the decoded tokens back to the client
+struct DecodedTokenContext {
+    token: DecodedToken,
+    start: Option<Instant>,
+    queued: Instant,
+    channel: UnboundedSender<InferResult<InferStreamResponse>>,
+}
+
+fn executor_status_looper(
+    mut backend: UniquePtr<TensorRtLlmBackendImpl>,
+    max_inflight_requests: usize,
+    mut waiting_requests: UnboundedReceiver<GenerationContext>,
+    post_processor_sender: UnboundedSender<(u64, InferResult<DecodedTokenContext>)>,
+) {
+    // Track the tuple (request_id, stream) for each request
+    let mut in_flights =
+        HashMap::<u64, GenerationContext>::with_capacity(max_inflight_requests * 2);
+
+    // TODO: Does it need a spin-loop?
+    'scheduler: loop {
+        // Is there any request pending to be scheduled?
+        let awaiting_requests = waiting_requests.len();
+        for _ in 0..awaiting_requests {
+            // Retrieve all the requests
+            if let Some(mut ctx) = waiting_requests.blocking_recv() {
+                // Submit all the requests to the executor and move the context to the in-flight tracker
+                let request = &ctx.request;
+                let generation_params = &request.parameters;
+                let stopping_params = &request.stopping_parameters;
+                let input_ids = request.input_ids.as_deref();
+
+                // Submit to the TensorRT-LLM executor for scheduling
+                match backend.pin_mut().submit(
+                    &input_ids.unwrap(), // This is checked beforehand in validate()
+                    stopping_params.max_new_tokens,
+                    generation_params.top_k as i32,
+                    generation_params.top_p,
+                    generation_params.temperature,
+                    generation_params.repetition_penalty,
+                    generation_params.frequency_penalty,
+                    generation_params.seed,
+                ) {
+                    Ok(request_id) => {
+                        // Insert the context linked to the generated request id in the tracker
+                        debug!("[in-flight] Added {}", request_id);
+                        ctx.start = Some(Instant::now());
+                        in_flights.insert(request_id, ctx);
+                    }
+                    Err(e) => {
+                        // Return to the caller
+                        let what = e.to_string();
+                        error!(error = what.as_str(), "Failed to schedule request");
+
+                        let err = Err(InferError::Overloaded(TryAcquireError::NoPermits));
+                        if let Err(_) = ctx.streamer.send(err) {
+                            error!("Failed to send back error to the client");
+                        }
+                    }
+                };
+            }
+        }
+
+        if backend.num_responses_ready() > 0 {
+            match backend.pin_mut().pull_tokens() {
+                Ok(responses) => {
+                    // Iterate through all the decoded tokens
+                    for step in responses.deref() {
+                        if let Some(ctx) = in_flights.get(&step.request_id) {
+                            // Remove from tracked requests
+                            let parcel =
+                                DecodedToken::try_from(step).map(|dt| DecodedTokenContext {
+                                    token: dt,
+                                    start: ctx.start,
+                                    queued: ctx.queued,
+                                    channel: ctx.streamer.clone(),
+                                });
+
+                            // Submit the work to the post_processor
+                            let posted = post_processor_sender.send((step.request_id, parcel));
+
+                            if posted.is_err() || step.is_final {
+                                debug!("Removing {}", step.request_id);
+                                let _ = in_flights.remove(&step.request_id);
+                            }
+                        } else {
+                            warn!("Untracked request {}", step.request_id);
+                        }
+                    }
+                }
+                Err(ref err) => {
+                    error!("Failed to get responses from the executor: {}.", err.what());
+                    break 'scheduler;
+                }
+            }
+        }
+
+        // Hint the CPU we are spin-locking
+        hint::spin_loop();
+    }
+}
+
+fn post_processor_looper<const MAX_NUM_TOKENS: usize>(
+    tokenizer: Tokenizer,
+    max_inflight_requests: usize,
+    mut decoded_tokens: UnboundedReceiver<(u64, InferResult<DecodedTokenContext>)>,
+) {
+    let mut states: HashMap<u64, Vec<u32>> = HashMap::with_capacity(max_inflight_requests * 2);
+
+    'post_processor: loop {
+        if decoded_tokens.is_closed() {
+            warn!("Post processor IPC is closed, loop will exit now.");
+            break 'post_processor;
+        }
+
+        if let Some((request_id, decoded)) = decoded_tokens.blocking_recv() {
+            match decoded {
+                Ok(ctx) => {
+                    states
+                        .entry(request_id)
+                        .and_modify(|s| s.push(*&ctx.token.id))
+                        .or_insert_with(|| {
+                            let mut state = Vec::with_capacity(MAX_NUM_TOKENS);
+                            state.push(*&ctx.token.id);
+                            state
+                        });
+
+                    let out = match tokenizer.decode(&[ctx.token.id], false) {
+                        Ok(text) => {
+                            let is_special =
+                                tokenizer.get_added_vocabulary().is_special_token(&text);
+                            let token = Token {
+                                id: ctx.token.id,
+                                text,
+                                logprob: ctx.token.log_prob,
+                                special: is_special,
+                            };
+
+                            let out = if !ctx.token.is_final {
+                                InferStreamResponse::Intermediate {
+                                    token,
+                                    top_tokens: vec![],
+                                }
+                            } else {
+                                let tokens = states.remove(&request_id).unwrap();
+                                let text = tokenizer.decode(&tokens, true);
+                                let generated_text = GeneratedText {
+                                    text: text.unwrap(),
+                                    generated_tokens: tokens.len() as u32,
+                                    finish_reason: FinishReason::EndOfSequenceToken,
+                                    seed: None,
+                                };
+
+                                InferStreamResponse::End {
+                                    token,
+                                    top_tokens: vec![],
+                                    generated_text,
+                                    start: ctx.start.unwrap(),
+                                    queued: ctx.queued,
+                                }
+                            };
+
+                            Ok(out)
+                        }
+                        Err(err) => Err(GenerationError(err.to_string())),
+                    };
+
+                    if let Err(_) = ctx.channel.send(out) {
+                        warn!("Failed to send decoded token back to the user")
+                    }
+                }
+                Err(_err) => {
+                    todo!("what do we do?")
+                }
+            }
+        }
+    }
+}
+
+fn ensure_paths_exist<P: AsRef<Path>, PP: AsRef<Path>>(
+    engine_folder: P,
+    executor_worker_path: PP,
+) -> Result<(String, String), TensorRtLlmBackendError> {
+    // Retrieve paths as &str for the backend creation
+    let engine_folder = engine_folder.as_ref();
+    let executor_worker_path = executor_worker_path.as_ref();
+
+    // Ensure the engine folder exists
+    if !engine_folder.exists() {
+        let err = TensorRtLlmBackendError::EngineFolderDoesntExists(engine_folder.to_path_buf());
+
+        error!("Path validation failed: {}", err,);
+        return Err(err);
+    }
+
+    // Ensure executor worker binary exists
+    if !executor_worker_path.exists() {
+        let err =
+            TensorRtLlmBackendError::ExecutorWorkerNotFound(executor_worker_path.to_path_buf());
+
+        error!("Path validation failed: {}", err,);
+        return Err(err);
+    }
+
+    let engine_folder = String::from(
+        engine_folder
+            .to_str()
+            .expect("Failed to convert engine_folder to valid UTF-8"),
+    );
+
+    let executor_worker_path = String::from(
+        executor_worker_path
+            .to_str()
+            .expect("Failed to convert executor_worker_path to valid UTF-8"),
+    );
+
+    Ok((engine_folder, executor_worker_path))
+}
+
+unsafe impl Send for TensorRtLlmBackendImpl {}
+
+pub struct TensorRtLlmBackendV2 {
+    executor_looper: JoinHandle<()>,
+    post_processor_looper: JoinHandle<()>,
+    executor: UnboundedSender<GenerationContext>,
+}
+
+impl TensorRtLlmBackendV2 {
+    pub fn new<P: AsRef<Path> + Send, PP: AsRef<Path> + Send>(
+        tokenizer: Tokenizer,
+        engine_folder: P,
+        executor_worker_path: PP,
+        max_inflight_requests: usize,
+    ) -> Result<Self, TensorRtLlmBackendError> {
+        let (engine_folder, executor_worker_path) =
+            ensure_paths_exist(engine_folder, executor_worker_path)?;
+
+        // Allocate the IPC layer to communicate with the backend
+        let (executor_sender, executor_receiver) = unbounded_channel();
+        let (post_processor_sender, post_processor_receiver) = unbounded_channel();
+
+        // Create the FFI backend
+        let backend = create_tensorrt_llm_backend(&engine_folder, &executor_worker_path)
+            .map_err(|e| TensorRtLlmBackendError::Runtime(first_line(e.what(), "Unknown error")))?;
+
+        // Executor looper is responsible for scheduling and pulling request states at regular intervals
+        let executor_looper = spawn_blocking(move || {
+            executor_status_looper(
+                backend,
+                max_inflight_requests,
+                executor_receiver,
+                post_processor_sender,
+            )
+        });
+
+        // Post processor looper is responsible for receiving a bunch of tokens, decoding them and sending them back to the user
+        let post_processor_looper = spawn_blocking(move || {
+            post_processor_looper::<256>(tokenizer, max_inflight_requests, post_processor_receiver)
+        });
+
+        Ok(TensorRtLlmBackendV2 {
+            executor_looper,
+            post_processor_looper,
+            executor: executor_sender,
+        })
+    }
+
+    fn validate(request: &ValidGenerateRequest) -> InferResult<()> {
+        if request.input_ids.is_none() {
+            return Err(ValidationError(UnsupportedModality("No token provided")));
+        }
+
+        if request.top_n_tokens > 1 {
+            return Err(ValidationError(TopNTokensDisabled));
+        }
+
+        // TODO: Is it really needed? How can it be validated before?
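+        // NOTE: grammar-constrained generation is not implemented in this backend yet,
+        // so such requests are rejected at validation time instead of failing inside the executor.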
+        if request.parameters.grammar.is_some() {
+            return Err(ValidationError(Grammar));
+        }
+
+        match request.inputs.len() {
+            0 => Err(ValidationError(EmptyInput)),
+            2.. => Err(GenerationError(
+                "TensorRT-LLM backend doesn't support multi-chunk".into(),
+            )),
+            1 => match request.inputs.first().expect("Single item-chunk") {
+                Chunk::Text(_) => Ok(()),
+                Chunk::Image(_) => Err(ValidationError(UnsupportedModality("image"))),
+            },
+        }
+    }
+}
+
+#[async_trait]
+impl Backend for TensorRtLlmBackendV2 {
+    fn schedule(
+        &self,
+        inner: ValidGenerateRequest,
+    ) -> Result<UnboundedReceiverStream<InferResult<InferStreamResponse>>, InferError> {
+        Self::validate(&inner)?;
+
+        // Open up the stream to send tokens
+        let (streamer, receiver) = unbounded_channel::<InferResult<InferStreamResponse>>();
+
+        // Send the context to the executor for scheduling
+        let queued = Instant::now();
+        match self.executor.send(GenerationContext {
+            request: inner,
+            start: None,
+            queued,
+            streamer,
+        }) {
+            Ok(_) => Ok(UnboundedReceiverStream::new(receiver)),
+            Err(_) => Err(GenerationError(
+                "Failed to submit request to the backend".into(),
+            )),
+        }
+    }
+
+    async fn health(&self, _: bool) -> bool {
+        !self.executor_looper.is_finished() & !self.post_processor_looper.is_finished()
+    }
+}
diff --git a/backends/trtllm/src/main.rs b/backends/trtllm/src/main.rs
index e0ba46c7955..6a247fc1d52 100644
--- a/backends/trtllm/src/main.rs
+++ b/backends/trtllm/src/main.rs
@@ -1,10 +1,16 @@
+use std::path::{Path, PathBuf};
+
 use clap::Parser;
-use std::collections::HashMap;
-use std::path::PathBuf;
+use hf_hub::api::tokio::{Api, ApiBuilder};
+use hf_hub::{Cache, Repo, RepoType};
+use tokenizers::Tokenizer;
+use tracing::info;
+
 use text_generation_backends_trtllm::errors::TensorRtLlmBackendError;
-use text_generation_backends_trtllm::TensorRtLlmBackend;
-use text_generation_router::server;
-use tokenizers::{FromPretrainedParameters, Tokenizer};
+use text_generation_backends_trtllm::TensorRtLlmBackendV2;
+use text_generation_router::server::get_base_tokenizer;
+use text_generation_router::usage_stats::UsageStatsLevel;
+use text_generation_router::{server, HubTokenizerConfig};
 
 /// App Configuration
 #[derive(Parser, Debug)]
@@ -48,14 +54,138 @@ struct Args {
     otlp_service_name: String,
     #[clap(long, env)]
     cors_allow_origin: Option<Vec<String>>,
-    #[clap(long, env, default_value_t = false)]
-    messages_api_enabled: bool,
     #[clap(default_value = "4", long, env)]
     max_client_batch_size: usize,
     #[clap(long, env)]
     auth_token: Option<String>,
     #[clap(long, env, help = "Path to the TensorRT-LLM Orchestrator worker")]
     executor_worker: PathBuf,
+    #[clap(default_value = "on", long, env)]
+    usage_stats: UsageStatsLevel,
+}
+
+async fn get_tokenizer(
+    tokenizer_name: &str,
+    tokenizer_config_path: Option<&str>,
+    revision: Option<&str>,
+) -> Option<Tokenizer> {
+    // Parse Huggingface hub token
+    let authorization_token = std::env::var("HF_TOKEN")
+        .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN"))
+        .ok();
+
+    // Tokenizer instance
+    let local_path = Path::new(tokenizer_name);
+
+    // Shared API builder initialization
+    let api_builder = || {
+        let mut builder = ApiBuilder::new()
+            .with_progress(false)
+            .with_token(authorization_token);
+
+        if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") {
+            builder = builder.with_cache_dir(cache_dir.into());
+        }
+
+        builder
+    };
+
+    // Decide if we need to use the API based on the revision and local path
+    let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir();
+
+    // Initialize API if needed
+    #[derive(Clone)]
+    enum Type {
+        Api(Api),
+        Cache(Cache),
+        None,
+    }
+    let api = 
if use_api { + if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) { + let cache = std::env::var("HUGGINGFACE_HUB_CACHE") + .map_err(|_| ()) + .map(|cache_dir| Cache::new(cache_dir.into())) + .unwrap_or_else(|_| Cache::default()); + tracing::warn!("Offline mode active using cache defaults"); + Type::Cache(cache) + } else { + tracing::info!("Using the Hugging Face API"); + match api_builder().build() { + Ok(api) => Type::Api(api), + Err(_) => { + tracing::warn!("Unable to build the Hugging Face API"); + Type::None + } + } + } + } else { + Type::None + }; + + // Load tokenizer and model info + let ( + tokenizer_filename, + _config_filename, + tokenizer_config_filename, + _preprocessor_config_filename, + _processor_config_filename, + ) = match api { + Type::None => ( + Some(local_path.join("tokenizer.json")), + Some(local_path.join("config.json")), + Some(local_path.join("tokenizer_config.json")), + Some(local_path.join("preprocessor_config.json")), + Some(local_path.join("processor_config.json")), + ), + Type::Api(api) => { + let api_repo = api.repo(Repo::with_revision( + tokenizer_name.to_string(), + RepoType::Model, + revision.unwrap_or_else(|| "main").to_string(), + )); + + let tokenizer_filename = match api_repo.get("tokenizer.json").await { + Ok(tokenizer_filename) => Some(tokenizer_filename), + Err(_) => get_base_tokenizer(&api, &api_repo).await, + }; + let config_filename = api_repo.get("config.json").await.ok(); + let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok(); + let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok(); + let processor_config_filename = api_repo.get("processor_config.json").await.ok(); + + ( + tokenizer_filename, + config_filename, + tokenizer_config_filename, + preprocessor_config_filename, + processor_config_filename, + ) + } + Type::Cache(cache) => { + let repo = cache.repo(Repo::with_revision( + tokenizer_name.to_string(), + RepoType::Model, + revision.clone().unwrap_or_else(|| "main").to_string(), + )); + ( + repo.get("tokenizer.json"), + repo.get("config.json"), + repo.get("tokenizer_config.json"), + repo.get("preprocessor_config.json"), + repo.get("processor_config.json"), + ) + } + }; + + // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'. 
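+    // Note: the tokenizer config is parsed below but not returned; only the tokenizer itself is
+    // used by this backend for now.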
+ let tokenizer_config: Option = if let Some(filename) = tokenizer_config_path + { + HubTokenizerConfig::from_file(filename) + } else { + tokenizer_config_filename.and_then(HubTokenizerConfig::from_file) + }; + + tokenizer_filename.and_then(|filename| Tokenizer::from_file(filename).ok()) } #[tokio::main] @@ -83,10 +213,10 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { otlp_endpoint, otlp_service_name, cors_allow_origin, - messages_api_enabled, max_client_batch_size, auth_token, executor_worker, + usage_stats, } = args; // Launch Tokio runtime @@ -124,18 +254,26 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { ))); } - // Run server - let tokenizer = Tokenizer::from_pretrained( - tokenizer_name.clone(), - Some(FromPretrainedParameters { - revision: revision.clone().unwrap_or(String::from("main")), - user_agent: HashMap::new(), - auth_token, - }), + // Create the backend + let tokenizer = get_tokenizer( + &tokenizer_name, + tokenizer_config_path.as_deref(), + revision.as_deref(), ) - .map_err(|e| TensorRtLlmBackendError::Tokenizer(e.to_string()))?; + .await + .expect("Failed to retrieve tokenizer implementation"); + + info!("Successfully retrieved tokenizer {}", &tokenizer_name); + let backend = TensorRtLlmBackendV2::new( + tokenizer, + model_id, + executor_worker, + max_concurrent_requests, + )?; - let backend = TensorRtLlmBackend::new(tokenizer, model_id, executor_worker)?; + info!("Successfully created backend"); + + // Run server server::run( backend, max_concurrent_requests, @@ -145,7 +283,7 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { max_input_tokens, max_total_tokens, validation_workers, - None, + auth_token, tokenizer_name, tokenizer_config_path, revision, @@ -155,11 +293,9 @@ async fn main() -> Result<(), TensorRtLlmBackendError> { false, None, None, - messages_api_enabled, true, max_client_batch_size, - false, - false, + usage_stats, ) .await?; Ok(()) diff --git a/backends/trtllm/src/utils.rs b/backends/trtllm/src/utils.rs new file mode 100644 index 00000000000..4dedb007863 --- /dev/null +++ b/backends/trtllm/src/utils.rs @@ -0,0 +1,22 @@ +/// +/// Extract the first line of the provided string reference. 
+/// If there is no lines in the buffer, it returns a string +/// which content is defined by the content of `fail` +/// # Arguments +/// +/// * `s`: The string buffer to extract the first-line from +/// * `fail`: A string content which is returned if no lines are +/// present in `s` +/// +/// returns: String +/// +/// # Examples +/// +/// ``` +/// let s = "My name is Morgan.\n I'm working at Hugging Face."; +/// first_line(s, "No line in string"); +/// ``` +#[inline] +pub(crate) fn first_line(s: &str, fail: &str) -> String { + s.lines().next().unwrap_or(fail).to_string() +} diff --git a/backends/v2/src/backend.rs b/backends/v2/src/backend.rs index 086fc6dc4a6..bc264138d28 100644 --- a/backends/v2/src/backend.rs +++ b/backends/v2/src/backend.rs @@ -6,7 +6,7 @@ use nohash_hasher::IntMap; use std::sync::Arc; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::validation::ValidGenerateRequest; -use text_generation_router::{Attention, FinishReason, PrefillToken, Token}; +use text_generation_router::{FinishReason, PrefillToken, Token}; use tokio::sync::mpsc::error::SendError; use tokio::sync::{mpsc, Notify}; use tokio::time::Instant; @@ -36,18 +36,14 @@ impl BackendV2 { speculate: u32, ) -> Self { // Infer shared state - let attention = if let Ok(attention) = std::env::var("ATTENTION") { - attention - .parse() - .unwrap_or_else(|_| panic!("Invalid attention was specified :`{attention}`")) - } else { - Attention::Paged - }; - let block_size = if attention == Attention::FlashDecoding { - 256 - } else { - 16 + let attention = std::env::var("ATTENTION").unwrap_or("paged".to_string()); + let block_size = match attention.as_str() { + "flashinfer" => 1, + "flashdecoding" => 256, + "paged" => 16, + _ => unreachable!(), }; + let queue = Queue::new(requires_padding, block_size, window_size, speculate); let batching_task_notifier = Arc::new(Notify::new()); diff --git a/backends/v2/src/main.rs b/backends/v2/src/main.rs index f53d898e659..ab4b7ce1d97 100644 --- a/backends/v2/src/main.rs +++ b/backends/v2/src/main.rs @@ -44,6 +44,8 @@ struct Args { tokenizer_config_path: Option, #[clap(long, env)] revision: Option, + #[clap(long, env, value_enum)] + trust_remote_code: bool, #[clap(default_value = "2", long, env)] validation_workers: usize, #[clap(long, env)] @@ -63,8 +65,6 @@ struct Args { #[clap(long, env)] ngrok_edge: Option, #[clap(long, env, default_value_t = false)] - messages_api_enabled: bool, - #[clap(long, env, default_value_t = false)] disable_grammar_support: bool, #[clap(default_value = "4", long, env)] max_client_batch_size: usize, @@ -101,6 +101,7 @@ async fn main() -> Result<(), RouterError> { tokenizer_name, tokenizer_config_path, revision, + trust_remote_code, validation_workers, api_key, json_output, @@ -110,7 +111,6 @@ async fn main() -> Result<(), RouterError> { ngrok, ngrok_authtoken, ngrok_edge, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats, @@ -184,13 +184,13 @@ async fn main() -> Result<(), RouterError> { tokenizer_name, tokenizer_config_path, revision, + trust_remote_code, hostname, port, cors_allow_origin, ngrok, ngrok_authtoken, ngrok_edge, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats, diff --git a/backends/v3/src/backend.rs b/backends/v3/src/backend.rs index f8a10ca2606..a5c0f5125b2 100644 --- a/backends/v3/src/backend.rs +++ b/backends/v3/src/backend.rs @@ -1,12 +1,14 @@ -use crate::client::{Batch, CachedBatch, ClientError, Generation, 
Health, ShardedClient}; /// Batching and inference logic +use crate::client::{ + Batch, CachedBatch, ClientError, Generation, Health, InfoResponse, ShardedClient, +}; use crate::queue::{Entry, Queue}; use async_trait::async_trait; use nohash_hasher::IntMap; use std::sync::Arc; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::validation::ValidGenerateRequest; -use text_generation_router::{Attention, FinishReason, PrefillToken, Token}; +use text_generation_router::{FinishReason, PrefillToken, Token}; use tokio::sync::mpsc::error::SendError; use tokio::sync::{mpsc, Notify}; use tokio::time::Instant; @@ -31,27 +33,22 @@ impl BackendV3 { max_batch_total_tokens: u32, max_waiting_tokens: usize, max_batch_size: Option, - requires_padding: bool, - window_size: Option, - speculate: u32, + shard_info: InfoResponse, ) -> Self { - let prefix_caching = - std::env::var("USE_PREFIX_CACHING").expect("Expect prefix caching env var"); - let prefix_caching = matches!(prefix_caching.as_str(), "true" | "1"); - let attention: String = std::env::var("ATTENTION").expect("attention env var"); + if shard_info.support_chunking { + tracing::warn!("Model supports prefill chunking. `waiting_served_ratio` and `max_waiting_tokens` will be ignored."); + } - let attention: Attention = attention - .parse() - .unwrap_or_else(|_| panic!("Invalid attention was specified :`{attention}`")); - let block_size = attention.block_size(); + let block_size = shard_info.block_size; let queue = Queue::new( - requires_padding, + shard_info.requires_padding, block_size, - prefix_caching, - window_size, - speculate, + shard_info.use_prefix_caching, + shard_info.window_size, + shard_info.speculate, max_batch_total_tokens, + shard_info.support_chunking, ); let batching_task_notifier = Arc::new(Notify::new()); @@ -63,6 +60,7 @@ impl BackendV3 { max_batch_total_tokens, max_waiting_tokens, max_batch_size, + shard_info.support_chunking, queue.clone(), batching_task_notifier.clone(), )); @@ -127,6 +125,7 @@ pub(crate) async fn batching_task( max_batch_total_tokens: u32, max_waiting_tokens: usize, max_batch_size: Option, + support_chunking: bool, queue: Queue, notifier: Arc, ) { @@ -147,7 +146,7 @@ pub(crate) async fn batching_task( ) .await { - let mut cached_batch = prefill(&mut client, batch, &mut entries) + let mut cached_batch = prefill(&mut client, batch, None, &mut entries) .instrument(span) .await; let mut waiting_tokens = 1; @@ -158,28 +157,44 @@ pub(crate) async fn batching_task( // Get current batch info let batch_size = batch.size; let batch_max_tokens = batch.max_tokens; + let current_tokens = batch.current_tokens; let mut batches = vec![batch]; metrics::gauge!("tgi_batch_current_size").set(batch_size as f64); metrics::gauge!("tgi_batch_current_max_tokens").set(batch_max_tokens as f64); - let min_size = if waiting_tokens >= max_waiting_tokens { - // If we didn't onboard any new requests since >= max_waiting_tokens, we try - // to add a new batch even though its size might be small - None + let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens); + + let (min_size, max_size, prefill_token_budget) = if support_chunking { + // Since the next batch will be concatenated with the current batch, + // the current batch tokens must be subtracted to the prefill budget + let prefill_token_budget = + max_batch_prefill_tokens.saturating_sub(current_tokens); + // We can ignore min_size and max_size + // Models than rely on max_size cannot support chunking + // 
Regarding min_size, chunking allow us to consistently run at the compute + // bound, making min_size useless. + (None, None, prefill_token_budget) } else { - // Minimum batch size - // TODO: temporarily disable to avoid incorrect deallocation + - // reallocation when using prefix caching. - Some((batch_size as f32 * waiting_served_ratio).floor() as usize) - }; + let min_size = if waiting_tokens >= max_waiting_tokens { + // If we didn't onboard any new requests since >= max_waiting_tokens, we try + // to add a new batch even though its size might be small + None + } else { + // Minimum batch size + // TODO: temporarily disable to avoid incorrect deallocation + + // reallocation when using prefix caching. + Some((batch_size as f32 * waiting_served_ratio).floor() as usize) + }; - let token_budget = max_batch_total_tokens.saturating_sub(batch_max_tokens); - let max_size = - max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize)); + let max_size = + max_batch_size.map(|max_size| max_size.saturating_sub(batch_size as usize)); + + (min_size, max_size, max_batch_prefill_tokens) + }; // Try to get a new batch - if let Some((mut new_entries, new_batch, span)) = queue - .next_batch(min_size, max_size, max_batch_prefill_tokens, token_budget) + if let Some((new_entries, new_batch, span)) = queue + .next_batch(min_size, max_size, prefill_token_budget, token_budget) .await { // Tracking metrics @@ -187,31 +202,45 @@ pub(crate) async fn batching_task( metrics::counter!("tgi_batch_concat", "reason" => "backpressure") .increment(1); } else { - metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded") - .increment(1); + let counter = if support_chunking { + metrics::counter!("tgi_batch_concat", "reason" => "chunking") + } else { + metrics::counter!("tgi_batch_concat", "reason" => "wait_exceeded") + }; + counter.increment(1); } - - entries.iter_mut().for_each(|(_, entry)| { - // Create a new span to add the info that this entry is waiting - // because a new batch is being computed - let entry_waiting_span = info_span!(parent: &entry.span, "waiting"); - // Add relationships - span.follows_from(&entry_waiting_span); - entry_waiting_span.follows_from(&span); - // Update entry - entry.temp_span = Some(entry_waiting_span); - }); + let cached_batch = if support_chunking { + // Concat current batch to the new one + batches.pop() + } else { + // Request are waiting only if we don't support chunking + entries.iter_mut().for_each(|(_, entry)| { + // Create a new span to add the info that this entry is waiting + // because a new batch is being computed + let entry_waiting_span = info_span!(parent: &entry.span, "waiting"); + // Add relationships + span.follows_from(&entry_waiting_span); + entry_waiting_span.follows_from(&span); + // Update entry + entry.temp_span = Some(entry_waiting_span); + }); + None + }; + entries.extend(new_entries); // Generate one token for this new batch to have the attention past in cache - let new_cached_batch = prefill(&mut client, new_batch, &mut new_entries) - .instrument(span) - .await; + let new_cached_batch = + prefill(&mut client, new_batch, cached_batch, &mut entries) + .instrument(span) + .await; // Reset waiting counter waiting_tokens = 1; // Extend current batch with the new batch if let Some(new_cached_batch) = new_cached_batch { - entries.extend(new_entries); batches.push(new_cached_batch); + } else if support_chunking { + // New cached batch is empty, no work left + break; } } @@ -244,13 +273,14 @@ pub(crate) async fn batching_task( async fn prefill( 
client: &mut ShardedClient, batch: Batch, + cached_batch: Option, entries: &mut IntMap, ) -> Option { let start_time = Instant::now(); let batch_id = batch.id; metrics::counter!("tgi_batch_inference_count", "method" => "prefill").increment(1); - match client.prefill(batch).await { + match client.prefill(batch, cached_batch).await { Ok((generations, next_batch, timings)) => { let start_filtering_time = Instant::now(); // Send generated tokens and filter stopped entries @@ -259,6 +289,10 @@ async fn prefill( // Filter next batch and remove requests that were stopped let next_batch = filter_batch(client, next_batch, entries).await; + if let Some(concat_duration) = timings.concat { + metrics::histogram!("tgi_batch_concat_duration", "method" => "decode") + .record(concat_duration.as_secs_f64()); + } metrics::histogram!("tgi_batch_forward_duration", "method" => "prefill") .record(timings.forward.as_secs_f64()); metrics::histogram!("tgi_batch_decode_duration", "method" => "prefill") diff --git a/backends/v3/src/client/grpc_client.rs b/backends/v3/src/client/grpc_client.rs index 648662db39b..fe810f24742 100644 --- a/backends/v3/src/client/grpc_client.rs +++ b/backends/v3/src/client/grpc_client.rs @@ -158,7 +158,8 @@ impl Client { // Blocks and slots will be set on the server side if we use paged attention blocks: vec![], slots: vec![], - prefix_len: 0, + cache_len: 0, + chunk_len: None, // Set sampling parameters to also take these ops into account in the max memory parameters: Some(NextTokenChooserParameters { temperature: 0.9, @@ -217,13 +218,23 @@ impl Client { pub async fn prefill( &mut self, batch: Batch, + cached_batch: Option, ) -> Result<(Vec, Option, PrefillTimings)> { - let request = tonic::Request::new(PrefillRequest { batch: Some(batch) }).inject_context(); + let request = tonic::Request::new(PrefillRequest { + batch: Some(batch), + cached_batch, + }) + .inject_context(); let response = self.stub.prefill(request).await?.into_inner(); Ok(( response.generations, response.batch, - PrefillTimings::new(response.forward_ns, response.decode_ns, response.total_ns), + PrefillTimings::new( + response.concat_ns, + response.forward_ns, + response.decode_ns, + response.total_ns, + ), )) } @@ -252,14 +263,16 @@ impl Client { } pub struct PrefillTimings { + pub concat: Option, pub forward: Duration, pub decode: Duration, pub total: Duration, } impl PrefillTimings { - fn new(forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self { + fn new(concat_ns: Option, forward_ns: u64, decode_ns: u64, total_ns: u64) -> Self { Self { + concat: concat_ns.map(Duration::from_nanos), forward: Duration::from_nanos(forward_ns), decode: Duration::from_nanos(decode_ns), total: Duration::from_nanos(total_ns), diff --git a/backends/v3/src/client/mod.rs b/backends/v3/src/client/mod.rs index 755431f4633..d4ac50c9c46 100644 --- a/backends/v3/src/client/mod.rs +++ b/backends/v3/src/client/mod.rs @@ -29,15 +29,6 @@ pub trait Health { async fn model_health(&self) -> Result<()>; } -#[derive(Debug)] -pub struct ShardInfo { - pub requires_padding: bool, - pub dtype: String, - pub device_type: String, - pub window_size: Option, - pub speculate: u32, -} - #[derive(Error, Debug, Clone)] pub enum ClientError { #[error("Could not connect to Text Generation server: {0}")] diff --git a/backends/v3/src/client/sharded_client.rs b/backends/v3/src/client/sharded_client.rs index ea77a696648..e181cd28d2f 100644 --- a/backends/v3/src/client/sharded_client.rs +++ b/backends/v3/src/client/sharded_client.rs @@ -1,6 +1,6 @@ -use 
crate::client::{ClientError, Result}; +use crate::client::Health; /// Multi shard Client -use crate::client::{Health, ShardInfo}; +use crate::client::{ClientError, Result}; use crate::client::grpc_client::{DecodeTimings, PrefillTimings}; use crate::client::{ @@ -49,13 +49,13 @@ impl ShardedClient { /// Get the model info #[instrument(skip(self))] - pub async fn info(&mut self) -> Result { + pub async fn info(&mut self) -> Result { let futures: Vec<_> = self .clients .iter_mut() .map(|client| client.info()) .collect(); - join_all(futures).await.pop().unwrap().map(ShardInfo::from) + join_all(futures).await.pop().unwrap() } /// GRPC health check @@ -135,11 +135,12 @@ impl ShardedClient { pub async fn prefill( &mut self, batch: Batch, + cached_batch: Option, ) -> Result<(Vec, Option, PrefillTimings)> { let futures: Vec<_> = self .clients .iter_mut() - .map(|client| Box::pin(client.prefill(batch.clone()))) + .map(|client| Box::pin(client.prefill(batch.clone(), cached_batch.clone()))) .collect(); #[allow(clippy::type_complexity)] let results: Result, Option, PrefillTimings)>> = @@ -194,18 +195,6 @@ impl ShardedClient { } } -impl From for ShardInfo { - fn from(value: InfoResponse) -> Self { - Self { - requires_padding: value.requires_padding, - dtype: value.dtype, - device_type: value.device_type, - window_size: value.window_size, - speculate: value.speculate, - } - } -} - #[async_trait] impl Health for ShardedClient { async fn device_health(&self) -> Result<()> { @@ -246,8 +235,9 @@ impl Health for ShardedClient { // Block 0 is reserved for health checks blocks: vec![0], slots: (0..16).collect(), - prefix_len: 0, + cache_len: 0, adapter_id: None, + chunk_len: None, }; let batch = Batch { id: u64::MAX, @@ -256,7 +246,7 @@ impl Health for ShardedClient { max_tokens: 2, max_blocks: 1, }; - self.clone().prefill(batch).await?; + self.clone().prefill(batch, None).await?; Ok(()) } } diff --git a/backends/v3/src/lib.rs b/backends/v3/src/lib.rs index af66b21eb22..7daf9eaeca7 100644 --- a/backends/v3/src/lib.rs +++ b/backends/v3/src/lib.rs @@ -29,6 +29,14 @@ pub struct BackendInfo { pub max_waiting_tokens: usize, #[schema(nullable = true, example = "null")] pub max_batch_size: Option, + #[schema(example = "false")] + pub support_chunking: bool, + #[schema(example = "false")] + pub prefix_caching: bool, + #[schema(example = "flashinfer")] + pub attention_impl: String, + #[schema(example = "1")] + pub block_size: u32, } #[allow(clippy::too_many_arguments)] @@ -110,6 +118,10 @@ pub async fn connect_backend( model_device_type: shard_info.device_type.clone(), model_dtype: shard_info.dtype.clone(), speculate: shard_info.speculate as usize, + support_chunking: shard_info.support_chunking, + prefix_caching: shard_info.use_prefix_caching, + attention_impl: shard_info.attention_impl.clone(), + block_size: shard_info.block_size, }; let backend = BackendV3::new( @@ -119,9 +131,7 @@ pub async fn connect_backend( max_batch_total_tokens, max_waiting_tokens, max_batch_size, - shard_info.requires_padding, - shard_info.window_size, - shard_info.speculate, + shard_info, ); tracing::info!("Using backend V3"); diff --git a/backends/v3/src/main.rs b/backends/v3/src/main.rs index 471ddb5a70a..bc4bdb934eb 100644 --- a/backends/v3/src/main.rs +++ b/backends/v3/src/main.rs @@ -44,6 +44,8 @@ struct Args { tokenizer_config_path: Option, #[clap(long, env)] revision: Option, + #[clap(long, env, value_enum)] + trust_remote_code: bool, #[clap(default_value = "2", long, env)] validation_workers: usize, #[clap(long, env)] @@ -63,8 +65,6 
@@ struct Args { #[clap(long, env)] ngrok_edge: Option, #[clap(long, env, default_value_t = false)] - messages_api_enabled: bool, - #[clap(long, env, default_value_t = false)] disable_grammar_support: bool, #[clap(default_value = "4", long, env)] max_client_batch_size: usize, @@ -101,6 +101,7 @@ async fn main() -> Result<(), RouterError> { tokenizer_name, tokenizer_config_path, revision, + trust_remote_code, validation_workers, api_key, json_output, @@ -110,7 +111,6 @@ async fn main() -> Result<(), RouterError> { ngrok, ngrok_authtoken, ngrok_edge, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats, @@ -131,25 +131,12 @@ async fn main() -> Result<(), RouterError> { "`max_input_tokens` must be < `max_total_tokens`".to_string(), )); } - if max_input_tokens as u32 > max_batch_prefill_tokens { - return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}"))); - } if validation_workers == 0 { return Err(RouterError::ArgumentValidation( "`validation_workers` must be > 0".to_string(), )); } - - if let Some(ref max_batch_total_tokens) = max_batch_total_tokens { - if max_batch_prefill_tokens > *max_batch_total_tokens { - return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}"))); - } - if max_total_tokens as u32 > *max_batch_total_tokens { - return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}"))); - } - } - if let Some(max_batch_size) = max_batch_size { if max_batch_size == 0 { return Err(RouterError::ArgumentValidation( @@ -158,7 +145,7 @@ async fn main() -> Result<(), RouterError> { } } - let (backend, _backend_info) = connect_backend( + let (backend, backend_info) = connect_backend( max_input_tokens, max_total_tokens, master_shard_uds_path, @@ -170,6 +157,19 @@ async fn main() -> Result<(), RouterError> { ) .await?; + // Validate remaining args now that the backend is known + let support_chunking = backend_info.support_chunking; + let max_batch_total_tokens = backend_info.max_batch_total_tokens; + if max_input_tokens as u32 > max_batch_prefill_tokens && !support_chunking { + return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}"))); + } + if max_batch_prefill_tokens > max_batch_total_tokens { + return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}"))); + } + if max_total_tokens as u32 > max_batch_total_tokens { + return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. 
Given: {max_total_tokens} and {max_batch_total_tokens}"))); + } + // Run server server::run( backend, @@ -184,13 +184,13 @@ async fn main() -> Result<(), RouterError> { tokenizer_name, tokenizer_config_path, revision, + trust_remote_code, hostname, port, cors_allow_origin, ngrok, ngrok_authtoken, ngrok_edge, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats, diff --git a/backends/v3/src/queue.rs b/backends/v3/src/queue.rs index f8123b57aa2..6662b8de1f9 100644 --- a/backends/v3/src/queue.rs +++ b/backends/v3/src/queue.rs @@ -4,7 +4,7 @@ use crate::client::{ Batch, GrammarType, NextTokenChooserParameters, Request, StoppingCriteriaParameters, }; use nohash_hasher::{BuildNoHashHasher, IntMap}; -use std::cmp::{max, min}; +use std::cmp::max; use std::collections::VecDeque; use text_generation_router::infer::InferError; use text_generation_router::infer::InferStreamResponse; @@ -50,6 +50,7 @@ impl Queue { window_size: Option, speculate: u32, max_batch_total_tokens: u32, + support_chunking: bool, ) -> Self { // Create channel let (queue_sender, queue_receiver) = mpsc::unbounded_channel(); @@ -62,6 +63,7 @@ impl Queue { window_size, speculate, max_batch_total_tokens, + support_chunking, queue_receiver, )); @@ -87,6 +89,10 @@ impl Queue { prefill_token_budget: u32, token_budget: u32, ) -> Option { + if prefill_token_budget == 0 || token_budget == 0 { + return None; + }; + // Create response channel let (response_sender, response_receiver) = oneshot::channel(); // Send next batch command to the background task managing the state @@ -108,6 +114,7 @@ impl Queue { } // Background task responsible of the queue state +#[allow(clippy::too_many_arguments)] async fn queue_task( requires_padding: bool, block_size: u32, @@ -115,6 +122,7 @@ async fn queue_task( window_size: Option, speculate: u32, max_batch_total_tokens: u32, + support_chunking: bool, mut receiver: mpsc::UnboundedReceiver, ) { let mut state = State::new( @@ -124,6 +132,7 @@ async fn queue_task( window_size, speculate, max_batch_total_tokens, + support_chunking, ); while let Some(cmd) = receiver.recv().await { @@ -166,12 +175,14 @@ struct State { /// Paged Attention block size block_size: u32, - /// Sliding window - window_size: Option, - /// Speculation amount speculate: u32, + /// Whether the model allow the prefill chunking + /// If it does, the last request in the batch will be split to exactly match the prefill + /// token budget + support_chunking: bool, + /// Paged Attention Block Allocation block_allocator: Option, } @@ -184,6 +195,7 @@ impl State { window_size: Option, speculate: u32, max_batch_total_tokens: u32, + support_chunking: bool, ) -> Self { let block_allocator = (!requires_padding).then(|| { BlockAllocator::new( @@ -199,8 +211,8 @@ impl State { next_id: 0, next_batch_id: 0, block_size, - window_size, speculate, + support_chunking, block_allocator, } } @@ -287,32 +299,7 @@ impl State { } None } - Some(_block_allocator) => { - prefill_tokens += entry.request.input_length; - let max_new_tokens = match self.window_size { - None => entry.request.stopping_parameters.max_new_tokens, - Some(window_size) => min( - window_size.saturating_sub(entry.request.input_length), - entry.request.stopping_parameters.max_new_tokens, - ), - }; - decode_tokens += max_new_tokens; - - if prefill_tokens > prefill_token_budget - || (prefill_tokens + decode_tokens + self.speculate) > token_budget - { - // Entry is over budget - // Add it back to the front - tracing::debug!("Over budget: prefill_tokens={prefill_tokens} > 
{prefill_token_budget} || {prefill_tokens} + {decode_tokens} + {} > {token_budget}", self.speculate); - self.entries.push_front((id, entry)); - break; - } - - let tokens = entry.request.input_length - + entry.request.stopping_parameters.max_new_tokens - + self.speculate - - 1; - + Some(block_allocator) => { // If users wants the prefill logprobs, we cannot reuse the cache. // So no input_ids for the radix tree. let input_ids = if entry.request.decoder_input_details { @@ -321,10 +308,73 @@ impl State { entry.request.input_ids.clone() }; - Some((tokens, input_ids)) + let tokens = entry.request.input_length + + entry.request.stopping_parameters.max_new_tokens + + self.speculate + - 1; + tracing::debug!("Allocating {tokens} with {input_ids:?}"); + + let block_allocation = match block_allocator.allocate(tokens, input_ids).await { + None => { + // Entry is over budget + // Add it back to the front + tracing::debug!("Over budget: not enough free blocks"); + self.entries.push_front((id, entry)); + break 'entry_loop; + } + Some(mut block_allocation) => { + tracing::debug!("Allocation: {block_allocation:?}"); + max_blocks = max(max_blocks, block_allocation.blocks.len() as u32); + + if block_allocation.prefix_len == entry.request.input_length { + // The whole request was found in the radix trie + // However, for the transformer forward to work, we need to + // have at least one token of postfix. + block_allocation.prefix_len -= 1; + } + + block_allocation + } + }; + + let postfix_len = entry.request.input_length - block_allocation.prefix_len; + + if prefill_tokens + postfix_len > prefill_token_budget { + // Entry is over budget + if self.support_chunking { + // We support chunking, just set postfix_len to exactly match prefill_token_budget + let chunk_len = prefill_token_budget.saturating_sub(prefill_tokens); + if chunk_len > 0 { + // Push this entry inside the batch + batch.push((id, entry, Some(block_allocation), Some(chunk_len))); + } else { + // We cannot prefill even one token for this entry + // Add it back to the queue + self.entries.push_front((id, entry)); + } + tracing::debug!( + "Matched budget: prefill_tokens={} == {prefill_token_budget}", + prefill_tokens + postfix_len + ); + break 'entry_loop; + } else { + // We don't support chunking, this entry needs to go back to the buffer + // Add it back to the front + tracing::debug!( + "Over budget: prefill_tokens={} > {prefill_token_budget}", + prefill_tokens + postfix_len + ); + self.entries.push_front((id, entry)); + break 'entry_loop; + } + } + + prefill_tokens += postfix_len; + + Some(block_allocation) } }; - batch.push((id, entry, block_allocation)); + batch.push((id, entry, block_allocation, None)); if Some(batch.len()) == max_size { break; } @@ -342,7 +392,7 @@ impl State { // Batch is too small if batch.len() < min_size { // Add back entries to the queue in the correct order - for (id, entry, _) in batch.into_iter().rev() { + for (id, entry, _, _) in batch.into_iter().rev() { self.entries.push_front((id, entry)); } return None; @@ -353,29 +403,7 @@ impl State { let mut batch_entries = IntMap::with_capacity_and_hasher(self.entries.len(), BuildNoHashHasher::default()); - for (id, mut entry, block_allocation) in batch { - let block_allocation = if let (Some((tokens, input_ids)), Some(block_allocator)) = - (block_allocation, &self.block_allocator) - { - tracing::debug!("Allocating {tokens} with {input_ids:?}"); - match block_allocator.allocate(tokens, input_ids).await { - None => { - // Entry is over budget - // Add it back to the front - 
tracing::debug!("Over budget: not enough free blocks"); - self.entries.push_front((id, entry)); - continue; - } - Some(block_allocation) => { - tracing::debug!("Allocation: {block_allocation:?}"); - max_blocks = max(max_blocks, block_allocation.blocks.len() as u32); - Some(block_allocation) - } - } - } else { - None - }; - tracing::debug!("Accepting entry"); + for (id, mut entry, block_allocation, chunk_len) in batch { // Create a new span to link the batch back to this entry let entry_batch_span = info_span!(parent: &entry.span, "infer"); // Add relationships @@ -427,8 +455,9 @@ impl State { top_n_tokens: entry.request.top_n_tokens, blocks, slots, - prefix_len, + cache_len: prefix_len, adapter_id: entry.request.adapter_id.clone(), + chunk_len, }); // Set batch_time entry.batch_time = Some(Instant::now()); @@ -436,12 +465,6 @@ impl State { batch_entries.insert(id, entry); } - // Empty batch - if batch_requests.is_empty() { - tracing::debug!("Filterered out all entries"); - return None; - } - // Final batch size let size = batch_requests.len() as u32; next_batch_span.record("batch_size", size); @@ -531,7 +554,7 @@ mod tests { request: ValidGenerateRequest { inputs: vec![], input_ids: Some(Arc::new(vec![])), - input_length: 0, + input_length: 1, add_special_tokens: true, truncate: 0, decoder_input_details: false, @@ -567,7 +590,7 @@ mod tests { #[tokio::test] async fn test_append() { - let mut state = State::new(false, 1, false, None, 0, 16); + let mut state = State::new(false, 1, false, None, 0, 16, false); let (entry, _guard) = default_entry(); assert_eq!(state.next_id, 0); @@ -583,7 +606,7 @@ mod tests { #[tokio::test] async fn test_next_batch_empty() { - let mut state = State::new(false, 1, false, None, 0, 16); + let mut state = State::new(false, 1, false, None, 0, 16, false); assert!(state.next_batch(None, None, 1, 1).await.is_none()); assert!(state.next_batch(Some(1), None, 1, 1).await.is_none()); @@ -591,7 +614,7 @@ mod tests { #[tokio::test] async fn test_next_batch_min_size() { - let mut state = State::new(false, 1, false, None, 0, 16); + let mut state = State::new(false, 1, false, None, 0, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); state.append(entry1); @@ -623,7 +646,7 @@ mod tests { #[tokio::test] async fn test_next_batch_max_size() { - let mut state = State::new(false, 1, false, None, 0, 16); + let mut state = State::new(false, 1, false, None, 0, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); state.append(entry1); @@ -643,7 +666,7 @@ mod tests { #[tokio::test] async fn test_next_batch_token_budget() { - let mut state = State::new(false, 1, false, None, 0, 2); + let mut state = State::new(false, 1, false, None, 0, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); state.append(entry1); @@ -676,14 +699,14 @@ mod tests { #[tokio::test] async fn test_queue_append() { - let queue = Queue::new(false, 1, false, None, 0, 16); + let queue = Queue::new(false, 1, false, None, 0, 16, false); let (entry, _guard) = default_entry(); queue.append(entry); } #[tokio::test] async fn test_queue_next_batch_empty() { - let queue = Queue::new(false, 1, false, None, 0, 16); + let queue = Queue::new(false, 1, false, None, 0, 16, false); assert!(queue.next_batch(None, None, 1, 1).await.is_none()); assert!(queue.next_batch(Some(1), None, 1, 1).await.is_none()); @@ -691,7 +714,7 @@ mod tests { #[tokio::test] async fn test_queue_next_batch_min_size() { - let queue = 
Queue::new(false, 1, false, None, 0, 16); + let queue = Queue::new(false, 1, false, None, 0, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); queue.append(entry1); @@ -724,7 +747,7 @@ mod tests { #[tokio::test] async fn test_queue_next_batch_max_size() { - let queue = Queue::new(false, 1, false, None, 0, 16); + let queue = Queue::new(false, 1, false, None, 0, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); queue.append(entry1); @@ -740,7 +763,7 @@ mod tests { #[tokio::test] async fn test_queue_next_batch_token_budget() { - let queue = Queue::new(false, 1, false, None, 0, 16); + let queue = Queue::new(false, 1, false, None, 0, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); queue.append(entry1); @@ -765,7 +788,7 @@ mod tests { #[tokio::test] async fn test_queue_next_batch_token_speculate() { - let queue = Queue::new(false, 1, false, None, 2, 16); + let queue = Queue::new(true, 1, false, None, 2, 16, false); let (entry1, _guard1) = default_entry(); let (entry2, _guard2) = default_entry(); queue.append(entry1); @@ -784,7 +807,7 @@ mod tests { #[tokio::test] async fn test_queue_next_batch_dropped_receiver() { - let queue = Queue::new(false, 1, false, None, 0, 16); + let queue = Queue::new(false, 1, false, None, 0, 16, false); let (entry, _) = default_entry(); queue.append(entry); diff --git a/benchmark/src/generation.rs b/benchmark/src/generation.rs index 789c7b514fc..63fc780818b 100644 --- a/benchmark/src/generation.rs +++ b/benchmark/src/generation.rs @@ -158,7 +158,8 @@ async fn prefill( top_n_tokens: top_n_tokens.unwrap_or(0), blocks: vec![], slots: vec![], - prefix_len: 0, + cache_len: 0, + chunk_len: None, adapter_id: None, }) .collect(); @@ -173,7 +174,7 @@ async fn prefill( // Run prefill let start_time = Instant::now(); - let (_, decode_batch, _) = client.prefill(batch.clone()).await?; + let (_, decode_batch, _) = client.prefill(batch.clone(), None).await?; // Get latency let latency = start_time.elapsed(); diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs index 2ee3d7c551a..2e2e9a11c12 100644 --- a/benchmark/src/main.rs +++ b/benchmark/src/main.rs @@ -178,6 +178,7 @@ fn main() -> Result<(), Box> { .clear_cache(None) .await .expect("Unable to clear cache"); + tracing::info!("Connected"); // Run app diff --git a/docs/openapi.json b/docs/openapi.json index 957fe246b27..e7da2d40c54 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -316,6 +316,98 @@ } } }, + "/invocations": { + "post": { + "tags": [ + "Text Generation Inference" + ], + "summary": "Generate tokens from Sagemaker request", + "operationId": "sagemaker_compatibility", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SagemakerRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Generated Chat Completion", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/SagemakerResponse" + } + }, + "text/event-stream": { + "schema": { + "$ref": "#/components/schemas/SagemakerStreamResponse" + } + } + } + }, + "422": { + "description": "Input validation error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "example": { + "error": "Input validation error", + "error_type": "validation" + } + } + } + }, + "424": { + "description": "Generation Error", + "content": { + "application/json": { + "schema": { + 
"$ref": "#/components/schemas/ErrorResponse" + }, + "example": { + "error": "Request failed during generation", + "error_type": "generation" + } + } + } + }, + "429": { + "description": "Model is overloaded", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "example": { + "error": "Model is overloaded", + "error_type": "overloaded" + } + } + } + }, + "500": { + "description": "Incomplete generation", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + }, + "example": { + "error": "Incomplete generation", + "error_type": "incomplete_generation" + } + } + } + } + } + } + }, "/metrics": { "get": { "tags": [ @@ -1865,6 +1957,45 @@ "type": "string" } }, + "SagemakerRequest": { + "oneOf": [ + { + "$ref": "#/components/schemas/CompatGenerateRequest" + }, + { + "$ref": "#/components/schemas/ChatRequest" + }, + { + "$ref": "#/components/schemas/CompletionRequest" + } + ] + }, + "SagemakerResponse": { + "oneOf": [ + { + "$ref": "#/components/schemas/GenerateResponse" + }, + { + "$ref": "#/components/schemas/ChatCompletion" + }, + { + "$ref": "#/components/schemas/CompletionFinal" + } + ] + }, + "SagemakerStreamResponse": { + "oneOf": [ + { + "$ref": "#/components/schemas/StreamResponse" + }, + { + "$ref": "#/components/schemas/ChatCompletionChunk" + }, + { + "$ref": "#/components/schemas/Chunk" + } + ] + }, "SimpleToken": { "type": "object", "required": [ @@ -2186,4 +2317,4 @@ "description": "Hugging Face Text Generation Inference API" } ] -} \ No newline at end of file +} diff --git a/docs/source/reference/api_reference.md b/docs/source/reference/api_reference.md index 52043c80f8a..45d951bb1e1 100644 --- a/docs/source/reference/api_reference.md +++ b/docs/source/reference/api_reference.md @@ -141,9 +141,7 @@ TGI can be deployed on various cloud providers for scalable and robust text gene ## Amazon SageMaker -To enable the Messages API in Amazon SageMaker you need to set the environment variable `MESSAGES_API_ENABLED=true`. - -This will modify the `/invocations` route to accept Messages dictonaries consisting out of role and content. See the example below on how to deploy Llama with the new Messages API. +Amazon Sagemaker natively supports the message API: ```python import json @@ -161,12 +159,11 @@ except ValueError: hub = { 'HF_MODEL_ID':'HuggingFaceH4/zephyr-7b-beta', 'SM_NUM_GPUS': json.dumps(1), - 'MESSAGES_API_ENABLED': True } # create Hugging Face Model Class huggingface_model = HuggingFaceModel( - image_uri=get_huggingface_llm_image_uri("huggingface",version="1.4.0"), + image_uri=get_huggingface_llm_image_uri("huggingface",version="2.3.2"), env=hub, role=role, ) diff --git a/docs/source/reference/launcher.md b/docs/source/reference/launcher.md index b1abd1ee19b..68e487d0a73 100644 --- a/docs/source/reference/launcher.md +++ b/docs/source/reference/launcher.md @@ -93,10 +93,10 @@ Options: ## KV_CACHE_DTYPE ```shell --kv-cache-dtype - Specify the dtype for the key-value cache. When this option is not provided, the dtype of the model is used (typically `float16` or `bfloat16`). Currently the only supported value is `fp8_e5m2` on CUDA + Specify the dtype for the key-value cache. When this option is not provided, the dtype of the model is used (typically `float16` or `bfloat16`). 
Currently the only supported value are `fp8_e4m3fn` and `fp8_e5m2` on CUDA [env: KV_CACHE_DTYPE=] - [possible values: fp8_e5m2] + [possible values: fp8_e4m3fn, fp8_e5m2] ``` ## TRUST_REMOTE_CODE diff --git a/docs/source/supported_models.md b/docs/source/supported_models.md index 28008bcd1da..ede1fc778f3 100644 --- a/docs/source/supported_models.md +++ b/docs/source/supported_models.md @@ -8,6 +8,7 @@ Text Generation Inference enables serving optimized models. The following sectio - [Llava Next (1.6)](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) (Multimodal) - [Llama](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f) - [Phi 3](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) +- [Granite](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) - [Gemma](https://huggingface.co/google/gemma-7b) - [PaliGemma](https://huggingface.co/google/paligemma-3b-pt-224) - [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) diff --git a/docs/source/usage_statistics.md b/docs/source/usage_statistics.md index a2c406ecc0f..d3878b53ccb 100644 --- a/docs/source/usage_statistics.md +++ b/docs/source/usage_statistics.md @@ -26,7 +26,6 @@ As of release 2.1.2 this is an example of the data collected: "max_top_n_tokens": 5, "max_total_tokens": 2048, "max_waiting_tokens": 20, - "messages_api_enabled": false, "model_config": { "model_type": "Bloom" }, diff --git a/flake.lock b/flake.lock index aacdd30e401..76b4ca2fe38 100644 --- a/flake.lock +++ b/flake.lock @@ -978,15 +978,16 @@ "nixpkgs": "nixpkgs_6" }, "locked": { - "lastModified": 1728381423, - "narHash": "sha256-gpHy1WtlA8ZTd8XmxsdCoDd4Z7DE7co37lH7P+nsADA=", + "lastModified": 1729531056, + "narHash": "sha256-dW9IOA31+j3VS19WAWAmkJW2YCzeVZGqd6HpIJfODtI=", "owner": "huggingface", "repo": "text-generation-inference-nix", - "rev": "93123736c97e9f7bfe825bfaf3d7de0fc9a21a1e", + "rev": "a84a90281a17b15762873845c947e5c78f5a8dd1", "type": "github" }, "original": { "owner": "huggingface", + "ref": "marlin-kernels-0.3.0", "repo": "text-generation-inference-nix", "type": "github" } diff --git a/flake.nix b/flake.nix index edef442f2cb..5c05bfae7fb 100644 --- a/flake.nix +++ b/flake.nix @@ -5,7 +5,7 @@ inputs.nixpkgs.follows = "tgi-nix/nixpkgs"; }; nix-filter.url = "github:numtide/nix-filter"; - tgi-nix.url = "github:huggingface/text-generation-inference-nix"; + tgi-nix.url = "github:huggingface/text-generation-inference-nix/marlin-kernels-0.3.0"; nixpkgs.follows = "tgi-nix/nixpkgs"; flake-utils.url = "github:numtide/flake-utils"; rust-overlay = { @@ -137,6 +137,11 @@ impure = callPackage ./nix/impure-shell.nix { inherit server; }; + impureWithCuda = callPackage ./nix/impure-shell.nix { + inherit server; + withCuda = true; + }; + impure-flash-attn-v1 = callPackage ./nix/impure-shell.nix { server = server.override { flash-attn = python3.pkgs.flash-attn-v1; }; }; diff --git a/integration-tests/conftest.py b/integration-tests/conftest.py index f24fc0790ce..356fa5e30ac 100644 --- a/integration-tests/conftest.py +++ b/integration-tests/conftest.py @@ -9,13 +9,16 @@ import sys import tempfile import time -from typing import Dict, List, Optional - import docker import pytest +import base64 + +from pathlib import Path +from typing import Dict, List, Optional from aiohttp import ClientConnectorError, ClientOSError, ServerDisconnectedError from docker.errors import NotFound from syrupy.extensions.json import JSONSnapshotExtension + from text_generation import AsyncClient from 
text_generation.types import ( BestOfSequence, @@ -403,6 +406,7 @@ def local_launcher( print(" ".join(args), file=sys.stderr) env["LOG_LEVEL"] = "info,text_generation_router=debug" + env["PREFILL_CHUNKING"] = "1" if not use_flash_attention: env["USE_FLASH_ATTENTION"] = "false" @@ -501,6 +505,7 @@ def docker_launcher( env = { "LOG_LEVEL": "info,text_generation_router=debug", + "PREFILL_CHUNKING": "1", } if not use_flash_attention: env["USE_FLASH_ATTENTION"] = "false" @@ -642,3 +647,22 @@ async def generate_load_inner( return responses return generate_load_inner + + +# TODO fix the server parsser to count inline image tokens correctly +@pytest.fixture +def chicken(): + path = Path(__file__).parent / "images" / "chicken_on_money.png" + + with open(path, "rb") as image_file: + encoded_string = base64.b64encode(image_file.read()) + return f"data:image/png;base64,{encoded_string.decode('utf-8')}" + + +@pytest.fixture +def cow_beach(): + path = Path(__file__).parent / "images" / "cow_beach.png" + + with open(path, "rb") as image_file: + encoded_string = base64.b64encode(image_file.read()) + return f"data:image/png;base64,{encoded_string.decode('utf-8')}" diff --git a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache.json b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache.json index c55dd593a1d..b82882c00b6 100644 --- a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache.json +++ b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache.json @@ -11,27 +11,27 @@ }, { "id": 3923, - "logprob": -5.6328125, + "logprob": -6.1875, "text": "What" }, { "id": 374, - "logprob": -1.2265625, + "logprob": -0.93359375, "text": " is" }, { "id": 5655, - "logprob": -9.1015625, + "logprob": -9.875, "text": " deep" }, { "id": 6975, - "logprob": -1.8085938, + "logprob": -1.1796875, "text": " learning" }, { "id": 30, - "logprob": -1.0439453, + "logprob": -1.75, "text": "?" 
} ], @@ -39,66 +39,66 @@ "tokens": [ { "id": 18682, - "logprob": -2.1992188, + "logprob": -1.109375, "special": false, "text": " Deep" }, { "id": 6975, - "logprob": -0.079956055, + "logprob": -0.005432129, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.2763672, + "logprob": -0.028808594, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.37548828, + "logprob": -0.013671875, "special": false, "text": " a" }, { "id": 27084, - "logprob": -1.4628906, + "logprob": -0.69921875, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.02885437, + "logprob": -0.0005874634, "special": false, "text": " of" }, { "id": 5780, - "logprob": -0.2565918, + "logprob": -0.026855469, "special": false, "text": " machine" }, { "id": 6975, - "logprob": -0.0063438416, + "logprob": -0.00020885468, "special": false, "text": " learning" }, { "id": 430, - "logprob": -1.3056641, + "logprob": -0.17773438, "special": false, "text": " that" }, { - "id": 374, - "logprob": -1.6035156, + "id": 18065, + "logprob": -0.703125, "special": false, - "text": " is" + "text": " involves" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that is" + "generated_text": " Deep learning is a subset of machine learning that involves" } diff --git a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json index d06d6e5662d..8bce3e108d5 100644 --- a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json +++ b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_all_params.json @@ -1,8 +1,8 @@ { "details": { "best_of_sequences": null, - "finish_reason": "eos_token", - "generated_tokens": 3, + "finish_reason": "length", + "generated_tokens": 10, "prefill": [ { "id": 128000, @@ -11,22 +11,22 @@ }, { "id": 374, - "logprob": -22.96875, + "logprob": -18.0, "text": " is" }, { "id": 5655, - "logprob": -10.71875, + "logprob": -11.75, "text": " deep" }, { "id": 6975, - "logprob": -2.6992188, + "logprob": -2.0625, "text": " learning" }, { "id": 30, - "logprob": -4.8398438, + "logprob": -6.0, "text": "?" } ], @@ -34,24 +34,66 @@ "tokens": [ { "id": 720, - "logprob": -0.4411621, + "logprob": 0.0, "special": false, "text": " \n" }, { - "id": 220, - "logprob": -0.35864258, + "id": 34564, + "logprob": -0.11279297, + "special": false, + "text": "Deep" + }, + { + "id": 6975, + "logprob": -0.16015625, "special": false, - "text": " " + "text": " learning" }, { - "id": 128001, + "id": 320, + "logprob": -0.25195312, + "special": false, + "text": " (" + }, + { + "id": 16931, + "logprob": -1.703125, + "special": false, + "text": "DL" + }, + { + "id": 8, "logprob": 0.0, - "special": true, - "text": "<|end_of_text|>" + "special": false, + "text": ")" + }, + { + "id": 374, + "logprob": -1.140625, + "special": false, + "text": " is" + }, + { + "id": 264, + "logprob": 0.0, + "special": false, + "text": " a" + }, + { + "id": 1207, + "logprob": -1.3125, + "special": false, + "text": " sub" + }, + { + "id": 2630, + "logprob": 0.0, + "special": false, + "text": "field" } ], "top_tokens": null }, - "generated_text": "What is deep learning? \n " + "generated_text": "What is deep learning? 
\nDeep learning (DL) is a subfield" } diff --git a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_load.json b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_load.json index 46670819f99..c7acee467c6 100644 --- a/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_load.json +++ b/integration-tests/models/__snapshots__/test_flash_llama_fp8_kv_cache/test_flash_llama_fp8_kv_cache_load.json @@ -12,27 +12,27 @@ }, { "id": 3923, - "logprob": -5.6328125, + "logprob": -6.1875, "text": "What" }, { "id": 374, - "logprob": -1.2265625, + "logprob": -0.93359375, "text": " is" }, { "id": 5655, - "logprob": -9.1015625, + "logprob": -9.875, "text": " deep" }, { "id": 6975, - "logprob": -1.8085938, + "logprob": -1.1796875, "text": " learning" }, { "id": 30, - "logprob": -1.0439453, + "logprob": -1.75, "text": "?" } ], @@ -40,68 +40,68 @@ "tokens": [ { "id": 18682, - "logprob": -2.1992188, + "logprob": -1.109375, "special": false, "text": " Deep" }, { "id": 6975, - "logprob": -0.07897949, + "logprob": -0.0047912598, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.27734375, + "logprob": -0.025512695, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.37402344, + "logprob": -0.012145996, "special": false, "text": " a" }, { "id": 27084, - "logprob": -1.4511719, + "logprob": -0.72265625, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.02909851, + "logprob": -0.0005760193, "special": false, "text": " of" }, { "id": 5780, - "logprob": -0.25854492, + "logprob": -0.02722168, "special": false, "text": " machine" }, { "id": 6975, - "logprob": -0.0061798096, + "logprob": -0.00023651123, "special": false, "text": " learning" }, { "id": 430, - "logprob": -1.3046875, + "logprob": -0.17285156, "special": false, "text": " that" }, { - "id": 374, - "logprob": -1.5537109, + "id": 18065, + "logprob": -0.703125, "special": false, - "text": " is" + "text": " involves" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that is" + "generated_text": " Deep learning is a subset of machine learning that involves" }, { "details": { @@ -116,27 +116,27 @@ }, { "id": 3923, - "logprob": -5.6328125, + "logprob": -6.21875, "text": "What" }, { "id": 374, - "logprob": -1.2265625, + "logprob": -0.95703125, "text": " is" }, { "id": 5655, - "logprob": -9.1015625, + "logprob": -9.9375, "text": " deep" }, { "id": 6975, - "logprob": -1.8085938, + "logprob": -1.1328125, "text": " learning" }, { "id": 30, - "logprob": -1.0439453, + "logprob": -1.75, "text": "?" 
} ], @@ -144,68 +144,68 @@ "tokens": [ { "id": 18682, - "logprob": -2.1992188, + "logprob": -1.1796875, "special": false, "text": " Deep" }, { "id": 6975, - "logprob": -0.07897949, + "logprob": -0.005432129, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.27734375, + "logprob": -0.02758789, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.37402344, + "logprob": -0.013366699, "special": false, "text": " a" }, { "id": 27084, - "logprob": -1.4511719, + "logprob": -0.6953125, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.02909851, + "logprob": -0.0004863739, "special": false, "text": " of" }, { "id": 5780, - "logprob": -0.25854492, + "logprob": -0.02709961, "special": false, "text": " machine" }, { "id": 6975, - "logprob": -0.0061798096, + "logprob": -0.00022506714, "special": false, "text": " learning" }, { "id": 430, - "logprob": -1.3046875, + "logprob": -0.19726562, "special": false, "text": " that" }, { - "id": 374, - "logprob": -1.5537109, + "id": 18065, + "logprob": -0.77734375, "special": false, - "text": " is" + "text": " involves" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that is" + "generated_text": " Deep learning is a subset of machine learning that involves" }, { "details": { @@ -220,27 +220,27 @@ }, { "id": 3923, - "logprob": -5.6328125, + "logprob": -6.21875, "text": "What" }, { "id": 374, - "logprob": -1.2265625, + "logprob": -0.95703125, "text": " is" }, { "id": 5655, - "logprob": -9.1015625, + "logprob": -9.9375, "text": " deep" }, { "id": 6975, - "logprob": -1.8085938, + "logprob": -1.1328125, "text": " learning" }, { "id": 30, - "logprob": -1.0439453, + "logprob": -1.75, "text": "?" } ], @@ -248,68 +248,68 @@ "tokens": [ { "id": 18682, - "logprob": -2.1992188, + "logprob": -1.1796875, "special": false, "text": " Deep" }, { "id": 6975, - "logprob": -0.07897949, + "logprob": -0.005432129, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.27734375, + "logprob": -0.02758789, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.37402344, + "logprob": -0.013366699, "special": false, "text": " a" }, { "id": 27084, - "logprob": -1.4511719, + "logprob": -0.6953125, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.02909851, + "logprob": -0.0004863739, "special": false, "text": " of" }, { "id": 5780, - "logprob": -0.25854492, + "logprob": -0.02709961, "special": false, "text": " machine" }, { "id": 6975, - "logprob": -0.0061798096, + "logprob": -0.00022506714, "special": false, "text": " learning" }, { "id": 430, - "logprob": -1.3046875, + "logprob": -0.19726562, "special": false, "text": " that" }, { - "id": 374, - "logprob": -1.5537109, + "id": 18065, + "logprob": -0.77734375, "special": false, - "text": " is" + "text": " involves" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that is" + "generated_text": " Deep learning is a subset of machine learning that involves" }, { "details": { @@ -324,27 +324,27 @@ }, { "id": 3923, - "logprob": -5.6328125, + "logprob": -6.21875, "text": "What" }, { "id": 374, - "logprob": -1.2265625, + "logprob": -0.95703125, "text": " is" }, { "id": 5655, - "logprob": -9.1015625, + "logprob": -9.9375, "text": " deep" }, { "id": 6975, - "logprob": -1.8085938, + "logprob": -1.1328125, "text": " learning" }, { "id": 30, - "logprob": -1.0439453, + "logprob": -1.75, "text": "?" 
} ], @@ -352,67 +352,67 @@ "tokens": [ { "id": 18682, - "logprob": -2.1992188, + "logprob": -1.1796875, "special": false, "text": " Deep" }, { "id": 6975, - "logprob": -0.07897949, + "logprob": -0.005432129, "special": false, "text": " learning" }, { "id": 374, - "logprob": -0.27734375, + "logprob": -0.02758789, "special": false, "text": " is" }, { "id": 264, - "logprob": -0.37402344, + "logprob": -0.013366699, "special": false, "text": " a" }, { "id": 27084, - "logprob": -1.4511719, + "logprob": -0.6953125, "special": false, "text": " subset" }, { "id": 315, - "logprob": -0.02909851, + "logprob": -0.0004863739, "special": false, "text": " of" }, { "id": 5780, - "logprob": -0.25854492, + "logprob": -0.02709961, "special": false, "text": " machine" }, { "id": 6975, - "logprob": -0.0061798096, + "logprob": -0.00022506714, "special": false, "text": " learning" }, { "id": 430, - "logprob": -1.3046875, + "logprob": -0.19726562, "special": false, "text": " that" }, { - "id": 374, - "logprob": -1.5537109, + "id": 18065, + "logprob": -0.77734375, "special": false, - "text": " is" + "text": " involves" } ], "top_tokens": null }, - "generated_text": " Deep learning is a subset of machine learning that is" + "generated_text": " Deep learning is a subset of machine learning that involves" } ] diff --git a/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq.json b/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq.json index 993bdaddc02..b835bf0752d 100644 --- a/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq.json +++ b/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq.json @@ -10,80 +10,95 @@ "text": "" }, { - "id": 3735, - "logprob": -11.0078125, - "text": "Test" + "id": 1824, + "logprob": -9.2890625, + "text": "What" }, { - "id": 2159, - "logprob": -13.59375, - "text": "request" + "id": 349, + "logprob": -1.1503906, + "text": "is" + }, + { + "id": 3534, + "logprob": -9.5859375, + "text": "deep" + }, + { + "id": 5168, + "logprob": -1.3945312, + "text": "learning" + }, + { + "id": 28804, + "logprob": -0.4555664, + "text": "?" 
} ], "seed": null, "tokens": [ { "id": 13, - "logprob": -1.7089844, + "logprob": -0.6953125, "special": false, "text": "\n" }, { "id": 13, - "logprob": -0.68847656, + "logprob": -0.4777832, "special": false, "text": "\n" }, { - "id": 28771, - "logprob": -1.9394531, + "id": 23229, + "logprob": -0.13256836, "special": false, - "text": "#" + "text": "Deep" }, { - "id": 3735, - "logprob": -2.8808594, + "id": 5168, + "logprob": -0.023849487, "special": false, - "text": " Test" + "text": " learning" }, { - "id": 2159, - "logprob": -0.37280273, + "id": 349, + "logprob": -0.13977051, "special": false, - "text": " request" + "text": " is" }, { - "id": 13, - "logprob": -0.26098633, + "id": 264, + "logprob": -0.14489746, "special": false, - "text": "\n" + "text": " a" }, { - "id": 13, - "logprob": -0.0017137527, + "id": 19804, + "logprob": -0.63183594, "special": false, - "text": "\n" + "text": " subset" }, { - "id": 1064, - "logprob": -2.2695312, + "id": 302, + "logprob": -0.010314941, "special": false, - "text": "##" + "text": " of" }, { - "id": 3735, - "logprob": -1.9238281, + "id": 5599, + "logprob": -0.0635376, "special": false, - "text": " Test" + "text": " machine" }, { - "id": 2159, - "logprob": -0.48828125, + "id": 5168, + "logprob": -0.0028572083, "special": false, - "text": " request" + "text": " learning" } ], "top_tokens": null }, - "generated_text": "\n\n# Test request\n\n## Test request" + "generated_text": "\n\nDeep learning is a subset of machine learning" } diff --git a/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_all_params.json b/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_all_params.json index 94411eefb40..77c8859907e 100644 --- a/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_all_params.json +++ b/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_all_params.json @@ -10,80 +10,90 @@ "text": "" }, { - "id": 3735, - "logprob": -11.0078125, - "text": "Test" + "id": 349, + "logprob": -12.0546875, + "text": "is" }, { - "id": 2159, - "logprob": -13.59375, - "text": "request" + "id": 3534, + "logprob": -10.53125, + "text": "deep" + }, + { + "id": 5168, + "logprob": -2.71875, + "text": "learning" + }, + { + "id": 28804, + "logprob": -5.0078125, + "text": "?" 
} ], "seed": 0, "tokens": [ { "id": 13, - "logprob": -0.34838867, + "logprob": 0.0, "special": false, "text": "\n" }, { - "id": 13940, - "logprob": -0.38916016, + "id": 23229, + "logprob": -0.18237305, "special": false, - "text": "``" + "text": "Deep" }, { - "id": 28832, + "id": 17504, "logprob": 0.0, "special": false, - "text": "`" + "text": " Learning" }, { - "id": 3371, - "logprob": -1.2529297, + "id": 349, + "logprob": 0.0, "special": false, - "text": "json" + "text": " is" }, { - "id": 13, + "id": 264, "logprob": 0.0, "special": false, - "text": "\n" + "text": " a" }, { - "id": 28751, + "id": 19804, "logprob": 0.0, "special": false, - "text": "{" + "text": " subset" }, { - "id": 13, + "id": 302, "logprob": 0.0, "special": false, - "text": "\n" + "text": " of" }, { - "id": 2287, - "logprob": 0.0, + "id": 13253, + "logprob": -0.6040039, "special": false, - "text": " " + "text": " Machine" }, { - "id": 345, + "id": 17504, "logprob": 0.0, "special": false, - "text": " \"" + "text": " Learning" }, { - "id": 3134, - "logprob": -0.640625, + "id": 28725, + "logprob": -0.11621094, "special": false, - "text": "request" + "text": "," } ], "top_tokens": null }, - "generated_text": "Test request\n```json\n{\n \"request" + "generated_text": "What is deep learning?\nDeep Learning is a subset of Machine Learning," } diff --git a/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_load.json b/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_load.json index 19e306a38f1..959e3c5578e 100644 --- a/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_load.json +++ b/integration-tests/models/__snapshots__/test_flash_mixtral_gptq/test_flash_mixtral_gptq_load.json @@ -11,82 +11,97 @@ "text": "" }, { - "id": 3735, - "logprob": -11.0078125, - "text": "Test" + "id": 1824, + "logprob": -9.2890625, + "text": "What" }, { - "id": 2159, - "logprob": -13.59375, - "text": "request" + "id": 349, + "logprob": -1.1503906, + "text": "is" + }, + { + "id": 3534, + "logprob": -9.5859375, + "text": "deep" + }, + { + "id": 5168, + "logprob": -1.3945312, + "text": "learning" + }, + { + "id": 28804, + "logprob": -0.4555664, + "text": "?" 
} ], "seed": null, "tokens": [ { "id": 13, - "logprob": -1.7089844, + "logprob": -0.6953125, "special": false, "text": "\n" }, { "id": 13, - "logprob": -0.68847656, + "logprob": -0.4777832, "special": false, "text": "\n" }, { - "id": 28771, - "logprob": -1.9394531, + "id": 23229, + "logprob": -0.13232422, "special": false, - "text": "#" + "text": "Deep" }, { - "id": 3735, - "logprob": -2.8828125, + "id": 5168, + "logprob": -0.023834229, "special": false, - "text": " Test" + "text": " learning" }, { - "id": 2159, - "logprob": -0.37329102, + "id": 349, + "logprob": -0.13977051, "special": false, - "text": " request" + "text": " is" }, { - "id": 13, - "logprob": -0.2602539, + "id": 264, + "logprob": -0.14416504, "special": false, - "text": "\n" + "text": " a" }, { - "id": 13, - "logprob": -0.0017185211, + "id": 19804, + "logprob": -0.63183594, "special": false, - "text": "\n" + "text": " subset" }, { - "id": 1064, - "logprob": -2.2753906, + "id": 302, + "logprob": -0.010223389, "special": false, - "text": "##" + "text": " of" }, { - "id": 3735, - "logprob": -1.9316406, + "id": 5599, + "logprob": -0.064208984, "special": false, - "text": " Test" + "text": " machine" }, { - "id": 2159, - "logprob": -0.48217773, + "id": 5168, + "logprob": -0.0028266907, "special": false, - "text": " request" + "text": " learning" } ], "top_tokens": null }, - "generated_text": "\n\n# Test request\n\n## Test request" + "generated_text": "\n\nDeep learning is a subset of machine learning" }, { "details": { @@ -100,82 +115,97 @@ "text": "" }, { - "id": 3735, - "logprob": -11.0078125, - "text": "Test" + "id": 1824, + "logprob": -9.2890625, + "text": "What" }, { - "id": 2159, - "logprob": -13.59375, - "text": "request" + "id": 349, + "logprob": -1.1425781, + "text": "is" + }, + { + "id": 3534, + "logprob": -9.59375, + "text": "deep" + }, + { + "id": 5168, + "logprob": -1.390625, + "text": "learning" + }, + { + "id": 28804, + "logprob": -0.45532227, + "text": "?" 
} ], "seed": null, "tokens": [ { "id": 13, - "logprob": -1.7089844, + "logprob": -0.6953125, "special": false, "text": "\n" }, { "id": 13, - "logprob": -0.68847656, + "logprob": -0.48339844, "special": false, "text": "\n" }, { - "id": 28771, - "logprob": -1.9394531, + "id": 23229, + "logprob": -0.13256836, "special": false, - "text": "#" + "text": "Deep" }, { - "id": 3735, - "logprob": -2.8828125, + "id": 5168, + "logprob": -0.02420044, "special": false, - "text": " Test" + "text": " learning" }, { - "id": 2159, - "logprob": -0.37329102, + "id": 349, + "logprob": -0.13977051, "special": false, - "text": " request" + "text": " is" }, { - "id": 13, - "logprob": -0.2602539, + "id": 264, + "logprob": -0.14501953, "special": false, - "text": "\n" + "text": " a" }, { - "id": 13, - "logprob": -0.0017185211, + "id": 19804, + "logprob": -0.63134766, "special": false, - "text": "\n" + "text": " subset" }, { - "id": 1064, - "logprob": -2.2753906, + "id": 302, + "logprob": -0.010223389, "special": false, - "text": "##" + "text": " of" }, { - "id": 3735, - "logprob": -1.9316406, + "id": 5599, + "logprob": -0.06427002, "special": false, - "text": " Test" + "text": " machine" }, { - "id": 2159, - "logprob": -0.48217773, + "id": 5168, + "logprob": -0.002817154, "special": false, - "text": " request" + "text": " learning" } ], "top_tokens": null }, - "generated_text": "\n\n# Test request\n\n## Test request" + "generated_text": "\n\nDeep learning is a subset of machine learning" }, { "details": { @@ -189,82 +219,97 @@ "text": "" }, { - "id": 3735, - "logprob": -11.0078125, - "text": "Test" + "id": 1824, + "logprob": -9.2890625, + "text": "What" }, { - "id": 2159, - "logprob": -13.59375, - "text": "request" + "id": 349, + "logprob": -1.1425781, + "text": "is" + }, + { + "id": 3534, + "logprob": -9.59375, + "text": "deep" + }, + { + "id": 5168, + "logprob": -1.390625, + "text": "learning" + }, + { + "id": 28804, + "logprob": -0.45532227, + "text": "?" 
} ], "seed": null, "tokens": [ { "id": 13, - "logprob": -1.7089844, + "logprob": -0.6953125, "special": false, "text": "\n" }, { "id": 13, - "logprob": -0.68847656, + "logprob": -0.48339844, "special": false, "text": "\n" }, { - "id": 28771, - "logprob": -1.9394531, + "id": 23229, + "logprob": -0.13256836, "special": false, - "text": "#" + "text": "Deep" }, { - "id": 3735, - "logprob": -2.8828125, + "id": 5168, + "logprob": -0.02420044, "special": false, - "text": " Test" + "text": " learning" }, { - "id": 2159, - "logprob": -0.37329102, + "id": 349, + "logprob": -0.13977051, "special": false, - "text": " request" + "text": " is" }, { - "id": 13, - "logprob": -0.2602539, + "id": 264, + "logprob": -0.14501953, "special": false, - "text": "\n" + "text": " a" }, { - "id": 13, - "logprob": -0.0017185211, + "id": 19804, + "logprob": -0.63134766, "special": false, - "text": "\n" + "text": " subset" }, { - "id": 1064, - "logprob": -2.2753906, + "id": 302, + "logprob": -0.010223389, "special": false, - "text": "##" + "text": " of" }, { - "id": 3735, - "logprob": -1.9316406, + "id": 5599, + "logprob": -0.06427002, "special": false, - "text": " Test" + "text": " machine" }, { - "id": 2159, - "logprob": -0.48217773, + "id": 5168, + "logprob": -0.002817154, "special": false, - "text": " request" + "text": " learning" } ], "top_tokens": null }, - "generated_text": "\n\n# Test request\n\n## Test request" + "generated_text": "\n\nDeep learning is a subset of machine learning" }, { "details": { @@ -278,81 +323,96 @@ "text": "" }, { - "id": 3735, - "logprob": -11.0078125, - "text": "Test" + "id": 1824, + "logprob": -9.2890625, + "text": "What" }, { - "id": 2159, - "logprob": -13.59375, - "text": "request" + "id": 349, + "logprob": -1.1425781, + "text": "is" + }, + { + "id": 3534, + "logprob": -9.59375, + "text": "deep" + }, + { + "id": 5168, + "logprob": -1.390625, + "text": "learning" + }, + { + "id": 28804, + "logprob": -0.45532227, + "text": "?" 
} ], "seed": null, "tokens": [ { "id": 13, - "logprob": -1.7089844, + "logprob": -0.6953125, "special": false, "text": "\n" }, { "id": 13, - "logprob": -0.68847656, + "logprob": -0.48339844, "special": false, "text": "\n" }, { - "id": 28771, - "logprob": -1.9394531, + "id": 23229, + "logprob": -0.13256836, "special": false, - "text": "#" + "text": "Deep" }, { - "id": 3735, - "logprob": -2.8828125, + "id": 5168, + "logprob": -0.02420044, "special": false, - "text": " Test" + "text": " learning" }, { - "id": 2159, - "logprob": -0.37329102, + "id": 349, + "logprob": -0.13977051, "special": false, - "text": " request" + "text": " is" }, { - "id": 13, - "logprob": -0.2602539, + "id": 264, + "logprob": -0.14501953, "special": false, - "text": "\n" + "text": " a" }, { - "id": 13, - "logprob": -0.0017185211, + "id": 19804, + "logprob": -0.63134766, "special": false, - "text": "\n" + "text": " subset" }, { - "id": 1064, - "logprob": -2.2753906, + "id": 302, + "logprob": -0.010223389, "special": false, - "text": "##" + "text": " of" }, { - "id": 3735, - "logprob": -1.9316406, + "id": 5599, + "logprob": -0.06427002, "special": false, - "text": " Test" + "text": " machine" }, { - "id": 2159, - "logprob": -0.48217773, + "id": 5168, + "logprob": -0.002817154, "special": false, - "text": " request" + "text": " learning" } ], "top_tokens": null }, - "generated_text": "\n\n# Test request\n\n## Test request" + "generated_text": "\n\nDeep learning is a subset of machine learning" } ] diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json index 0d6dca31de8..cfabe3c65cd 100644 --- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json +++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe.json @@ -11,32 +11,32 @@ }, { "id": 338, - "logprob": -0.7133789, + "logprob": -0.6201172, "text": "is" }, { "id": 16030, - "logprob": -13.9296875, + "logprob": -13.6484375, "text": "gradient" }, { "id": 26815, - "logprob": -0.048919678, + "logprob": -0.003894806, "text": "descent" }, { "id": 29973, - "logprob": -3.0078125, + "logprob": -2.6386719, "text": "?" 
}, { "id": 13, - "logprob": -2.8105469, + "logprob": -6.46875, "text": "\n" }, { "id": 13, - "logprob": -0.84521484, + "logprob": -6.6875, "text": "\n" } ], @@ -44,66 +44,66 @@ "tokens": [ { "id": 25584, - "logprob": -0.017028809, + "logprob": -0.008979797, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.0027313232, + "logprob": -8.34465e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023254395, + "logprob": -0.0009407997, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.0623207e-05, + "logprob": -0.0003838539, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5361328, + "id": 385, + "logprob": -0.24499512, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17578125, + "id": 13883, + "logprob": -0.010406494, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.00024354458, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.00011539459, + "id": 15574, + "logprob": -0.6582031, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.47436523, + "id": 1304, + "logprob": -0.00092840195, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027680397, + "id": 297, + "logprob": -0.19470215, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" } diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json index 38b803353d2..b524859fdc7 100644 --- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json +++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_all_params.json @@ -5,95 +5,95 @@ "generated_tokens": 10, "prefill": [ { - "id": 16030, + "id": 338, "logprob": null, + "text": "is" + }, + { + "id": 16030, + "logprob": -13.328125, "text": "gradient" }, { "id": 26815, - "logprob": -6.4960938, + "logprob": -0.24023438, "text": "descent" }, { "id": 29973, - "logprob": -5.1484375, + "logprob": -3.1386719, "text": "?" }, { "id": 13, - "logprob": -4.0351562, - "text": "\n" - }, - { - "id": 13, - "logprob": -5.2265625, + "logprob": -3.0878906, "text": "\n" } ], "seed": 0, "tokens": [ { - "id": 10994, - "logprob": -1.1542969, + "id": 25584, + "logprob": 0.0, "special": false, - "text": "Hello" + "text": "Grad" }, { - "id": 29991, + "id": 993, "logprob": 0.0, "special": false, - "text": "!" 
+ "text": "ient" }, { - "id": 739, + "id": 2726, "logprob": 0.0, "special": false, - "text": " It" + "text": " Des" }, { - "id": 2444, - "logprob": -0.42260742, + "id": 1760, + "logprob": 0.0, "special": false, - "text": " seems" + "text": "cent" }, { - "id": 366, - "logprob": 0.0, + "id": 313, + "logprob": -0.12322998, "special": false, - "text": " you" + "text": " (" }, { - "id": 29915, + "id": 29954, "logprob": 0.0, "special": false, - "text": "'" + "text": "G" }, { - "id": 276, - "logprob": -0.9838867, + "id": 29928, + "logprob": 0.0, "special": false, - "text": "re" + "text": "D" }, { - "id": 3211, + "id": 29897, "logprob": 0.0, "special": false, - "text": " address" + "text": ")" }, { - "id": 292, - "logprob": 0.0, + "id": 338, + "logprob": -0.6040039, "special": false, - "text": "ing" + "text": " is" }, { - "id": 263, - "logprob": -0.15124512, + "id": 385, + "logprob": -0.1796875, "special": false, - "text": " a" + "text": " an" } ], "top_tokens": null }, - "generated_text": "What is gradient descent?\n\nHello! It seems you're addressing a" + "generated_text": "What is gradient descent?\nGradient Descent (GD) is an" } diff --git a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json index f1f81152c46..2c977d8b2be 100644 --- a/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json +++ b/integration-tests/models/__snapshots__/test_flash_phi35_moe/test_flash_phi35_moe_load.json @@ -12,32 +12,32 @@ }, { "id": 338, - "logprob": -0.7133789, + "logprob": -0.6201172, "text": "is" }, { "id": 16030, - "logprob": -13.9296875, + "logprob": -13.6484375, "text": "gradient" }, { "id": 26815, - "logprob": -0.048919678, + "logprob": -0.003894806, "text": "descent" }, { "id": 29973, - "logprob": -3.0078125, + "logprob": -2.6386719, "text": "?" 
}, { "id": 13, - "logprob": -2.8105469, + "logprob": -6.46875, "text": "\n" }, { "id": 13, - "logprob": -0.84521484, + "logprob": -6.6875, "text": "\n" } ], @@ -45,68 +45,68 @@ "tokens": [ { "id": 25584, - "logprob": -0.017028809, + "logprob": -0.008979797, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.0028476715, + "logprob": -8.34465e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023971558, + "logprob": -0.00097084045, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.0384789e-05, + "logprob": -0.0003838539, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5229492, + "id": 385, + "logprob": -0.23840332, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17602539, + "id": 13883, + "logprob": -0.010406494, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.0002501011, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.000116467476, + "id": 15574, + "logprob": -0.6582031, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.47436523, + "id": 1304, + "logprob": -0.00092840195, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027871132, + "id": 297, + "logprob": -0.18933105, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" }, { "details": { @@ -121,32 +121,32 @@ }, { "id": 338, - "logprob": -0.7128906, + "logprob": -0.6113281, "text": "is" }, { "id": 16030, - "logprob": -13.9375, + "logprob": -13.6640625, "text": "gradient" }, { "id": 26815, - "logprob": -0.05053711, + "logprob": -0.003929138, "text": "descent" }, { "id": 29973, - "logprob": -3.0058594, + "logprob": -2.625, "text": "?" 
}, { "id": 13, - "logprob": -2.8242188, + "logprob": -6.484375, "text": "\n" }, { "id": 13, - "logprob": -0.84521484, + "logprob": -6.6875, "text": "\n" } ], @@ -154,68 +154,68 @@ "tokens": [ { "id": 25584, - "logprob": -0.018859863, + "logprob": -0.009017944, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.002822876, + "logprob": -9.536743e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023254395, + "logprob": -0.00097084045, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.0384789e-05, + "logprob": -0.0003838539, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5229492, + "id": 385, + "logprob": -0.24499512, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17126465, + "id": 13883, + "logprob": -0.010406494, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.0002501011, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.0001155138, + "id": 15574, + "logprob": -0.6435547, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.47436523, + "id": 1304, + "logprob": -0.0009279251, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027036667, + "id": 297, + "logprob": -0.18933105, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" }, { "details": { @@ -230,32 +230,32 @@ }, { "id": 338, - "logprob": -0.71484375, + "logprob": -0.609375, "text": "is" }, { "id": 16030, - "logprob": -13.9375, + "logprob": -13.671875, "text": "gradient" }, { "id": 26815, - "logprob": -0.049346924, + "logprob": -0.0040016174, "text": "descent" }, { "id": 29973, - "logprob": -3.0078125, + "logprob": -2.6230469, "text": "?" 
}, { "id": 13, - "logprob": -2.8242188, + "logprob": -6.453125, "text": "\n" }, { "id": 13, - "logprob": -0.86328125, + "logprob": -6.6875, "text": "\n" } ], @@ -263,68 +263,68 @@ "tokens": [ { "id": 25584, - "logprob": -0.017196655, + "logprob": -0.008956909, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.0028438568, + "logprob": -8.34465e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023254395, + "logprob": -0.0009407997, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.026558e-05, + "logprob": -0.0003721714, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5229492, + "id": 385, + "logprob": -0.24499512, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17602539, + "id": 13883, + "logprob": -0.010406494, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.0002501011, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.00011622906, + "id": 15574, + "logprob": -0.6435547, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.48608398, + "id": 1304, + "logprob": -0.00092601776, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027894974, + "id": 297, + "logprob": -0.19177246, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" }, { "details": { @@ -339,32 +339,32 @@ }, { "id": 338, - "logprob": -0.7192383, + "logprob": -0.609375, "text": "is" }, { "id": 16030, - "logprob": -13.9375, + "logprob": -13.6640625, "text": "gradient" }, { "id": 26815, - "logprob": -0.050445557, + "logprob": -0.0038967133, "text": "descent" }, { "id": 29973, - "logprob": -3.0078125, + "logprob": -2.6347656, "text": "?" 
}, { "id": 13, - "logprob": -2.8242188, + "logprob": -6.453125, "text": "\n" }, { "id": 13, - "logprob": -0.8276367, + "logprob": -6.6875, "text": "\n" } ], @@ -372,67 +372,67 @@ "tokens": [ { "id": 25584, - "logprob": -0.01727295, + "logprob": -0.008979797, "special": false, "text": "Grad" }, { "id": 993, - "logprob": -0.0027542114, + "logprob": -9.536743e-07, "special": false, "text": "ient" }, { "id": 26815, - "logprob": -0.023254395, + "logprob": -0.0009407997, "special": false, "text": " descent" }, { "id": 338, - "logprob": -2.0384789e-05, + "logprob": -0.00038409233, "special": false, "text": " is" }, { - "id": 263, - "logprob": -0.5229492, + "id": 385, + "logprob": -0.24499512, "special": false, - "text": " a" + "text": " an" }, { - "id": 937, - "logprob": -0.17126465, + "id": 13883, + "logprob": -0.010414124, "special": false, - "text": " first" + "text": " optimization" }, { - "id": 29899, - "logprob": 0.0, + "id": 5687, + "logprob": -0.00024354458, "special": false, - "text": "-" + "text": " algorithm" }, { - "id": 2098, - "logprob": -0.00011301041, + "id": 15574, + "logprob": -0.6435547, "special": false, - "text": "order" + "text": " commonly" }, { - "id": 13883, - "logprob": -0.48608398, + "id": 1304, + "logprob": -0.0009279251, "special": false, - "text": " optimization" + "text": " used" }, { - "id": 5687, - "logprob": -0.00027894974, + "id": 297, + "logprob": -0.19470215, "special": false, - "text": " algorithm" + "text": " in" } ], "top_tokens": null }, - "generated_text": "Gradient descent is a first-order optimization algorithm" + "generated_text": "Gradient descent is an optimization algorithm commonly used in" } ] diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json index 26224118c7e..8548e376c08 100644 --- a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json +++ b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq.json @@ -11,57 +11,57 @@ }, { "id": 3226, - "logprob": -8.9453125, + "logprob": -9.0234375, "text": " ge" }, { "id": 21017, - "logprob": -8.8515625, + "logprob": -9.0859375, "text": "ometric" }, { "id": 81, - "logprob": -0.21875, + "logprob": -0.25585938, "text": "_" }, { "id": 6009, - "logprob": -1.2773438, + "logprob": -2.1972656, "text": "mean" }, { "id": 26, - "logprob": -0.25195312, + "logprob": -0.2998047, "text": "(" }, { "id": 62, - "logprob": -4.8203125, + "logprob": -5.6445312, "text": "L" }, { "id": 44, - "logprob": -3.7734375, + "logprob": -3.0839844, "text": ":" }, { "id": 1682, - "logprob": -0.8310547, + "logprob": -0.6748047, "text": " List" }, { "id": 77, - "logprob": -0.22766113, + "logprob": -0.3864746, "text": "[" }, { "id": 1808, - "logprob": -0.46240234, + "logprob": -0.9355469, "text": "float" }, { "id": 10794, - "logprob": -3.0234375, + "logprob": -2.5371094, "text": "]):" } ], @@ -69,7 +69,7 @@ "tokens": [ { "id": 284, - "logprob": -0.04626465, + "logprob": -1.1679688, "special": false, "text": "\n " }, diff --git a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json index 015912f8b61..a6b805342aa 100644 --- a/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json +++ 
b/integration-tests/models/__snapshots__/test_flash_starcoder_gptq/test_flash_starcoder_gptq_default_params.json @@ -11,57 +11,57 @@ }, { "id": 3226, - "logprob": -8.9453125, + "logprob": -9.015625, "text": " ge" }, { "id": 21017, - "logprob": -8.859375, + "logprob": -9.0859375, "text": "ometric" }, { "id": 81, - "logprob": -0.21984863, + "logprob": -0.25585938, "text": "_" }, { "id": 6009, - "logprob": -1.2861328, + "logprob": -2.2304688, "text": "mean" }, { "id": 26, - "logprob": -0.25219727, + "logprob": -0.29760742, "text": "(" }, { "id": 62, - "logprob": -4.8007812, + "logprob": -5.6796875, "text": "L" }, { "id": 44, - "logprob": -3.7949219, + "logprob": -3.0742188, "text": ":" }, { "id": 1682, - "logprob": -0.8046875, + "logprob": -0.67626953, "text": " List" }, { "id": 77, - "logprob": -0.22424316, + "logprob": -0.38842773, "text": "[" }, { "id": 1808, - "logprob": -0.46191406, + "logprob": -0.9165039, "text": "float" }, { "id": 10794, - "logprob": -3.0253906, + "logprob": -2.5527344, "text": "]):" } ], @@ -69,7 +69,7 @@ "tokens": [ { "id": 284, - "logprob": 0.0, + "logprob": -0.048583984, "special": false, "text": "\n " }, diff --git a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json index 40ec7e2f667..9fd950a219a 100644 --- a/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json +++ b/integration-tests/models/__snapshots__/test_mt0_base/test_mt0_base_all_params.json @@ -26,7 +26,7 @@ }, { "id": 259, - "logprob": -0.46948242, + "logprob": -0.47070312, "special": false, "text": " " }, @@ -38,7 +38,7 @@ }, { "id": 35622, - "logprob": -0.79589844, + "logprob": -0.796875, "special": false, "text": " cloud" }, @@ -75,5 +75,5 @@ ], "top_tokens": null }, - "generated_text": "Why is the sky blue?blue sky, clouds and clouds" + "generated_text": "Why is the sky blue?blue sky , clouds and clouds" } diff --git a/integration-tests/models/test_flash_llama_fp8_kv_cache.py b/integration-tests/models/test_flash_llama_fp8_kv_cache.py index 05e9f0dd9ec..ccd7f78fe6f 100644 --- a/integration-tests/models/test_flash_llama_fp8_kv_cache.py +++ b/integration-tests/models/test_flash_llama_fp8_kv_cache.py @@ -4,7 +4,9 @@ @pytest.fixture(scope="module") def flash_llama_fp8_kv_cache_handle(launcher): with launcher( - "meta-llama/Meta-Llama-3-8B", num_shard=2, kv_cache_dtype="fp8_e5m2" + "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", + num_shard=2, + kv_cache_dtype="fp8_e4m3fn", ) as handle: yield handle @@ -25,7 +27,7 @@ async def test_flash_llama_fp8_kv_cache(flash_llama_fp8_kv_cache, response_snaps assert ( response.generated_text - == " Deep learning is a subset of machine learning that is" + == " Deep learning is a subset of machine learning that involves" ) assert response.details.generated_tokens == 10 assert response == response_snapshot @@ -69,7 +71,7 @@ async def test_flash_llama_fp8_kv_cache_load( assert len(responses) == 4 assert ( responses[0].generated_text - == " Deep learning is a subset of machine learning that is" + == " Deep learning is a subset of machine learning that involves" ) assert all( [r.generated_text == responses[0].generated_text for r in responses] diff --git a/integration-tests/models/test_flash_mixtral_gptq.py b/integration-tests/models/test_flash_mixtral_gptq.py index eb880628481..47bcb0bf3d7 100644 --- a/integration-tests/models/test_flash_mixtral_gptq.py +++ b/integration-tests/models/test_flash_mixtral_gptq.py @@ -3,7 +3,11 @@ 
@pytest.fixture(scope="module") def flash_mixtral_gptq_handle(launcher): - with launcher("TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ", num_shard=2) as handle: + with launcher( + "TheBloke/Mixtral-8x7B-Instruct-v0.1-GPTQ", + revision="gptq-4bit-128g-actorder_True", + num_shard=2, + ) as handle: yield handle @@ -16,7 +20,12 @@ async def flash_mixtral_gptq(flash_mixtral_gptq_handle): @pytest.mark.asyncio async def test_flash_mixtral_gptq(flash_mixtral_gptq, response_snapshot): response = await flash_mixtral_gptq.generate( - "Test request", max_new_tokens=10, decoder_input_details=True + "What is deep learning?", max_new_tokens=10, decoder_input_details=True + ) + + assert response.details.generated_tokens == 10 + assert ( + response.generated_text == "\n\nDeep learning is a subset of machine learning" ) assert response == response_snapshot @@ -25,7 +34,7 @@ async def test_flash_mixtral_gptq(flash_mixtral_gptq, response_snapshot): @pytest.mark.asyncio async def test_flash_mixtral_gptq_all_params(flash_mixtral_gptq, response_snapshot): response = await flash_mixtral_gptq.generate( - "Test request", + "What is deep learning?", max_new_tokens=10, repetition_penalty=1.2, return_full_text=True, @@ -41,6 +50,10 @@ async def test_flash_mixtral_gptq_all_params(flash_mixtral_gptq, response_snapsh ) assert response.details.generated_tokens == 10 + assert ( + response.generated_text + == "What is deep learning?\nDeep Learning is a subset of Machine Learning," + ) assert response == response_snapshot @@ -49,10 +62,14 @@ async def test_flash_mixtral_gptq_load( flash_mixtral_gptq, generate_load, response_snapshot ): responses = await generate_load( - flash_mixtral_gptq, "Test request", max_new_tokens=10, n=4 + flash_mixtral_gptq, "What is deep learning?", max_new_tokens=10, n=4 ) assert len(responses) == 4 + assert ( + responses[0].generated_text + == "\n\nDeep learning is a subset of machine learning" + ) assert all( [r.generated_text == responses[0].generated_text for r in responses] ), f"{[r.generated_text for r in responses]}" diff --git a/integration-tests/models/test_flash_pali_gemma.py b/integration-tests/models/test_flash_pali_gemma.py index 52ecaed4612..93962eb3e8f 100644 --- a/integration-tests/models/test_flash_pali_gemma.py +++ b/integration-tests/models/test_flash_pali_gemma.py @@ -1,5 +1,4 @@ import pytest -import base64 @pytest.fixture(scope="module") @@ -20,24 +19,11 @@ async def flash_pali_gemma(flash_pali_gemma_handle): return flash_pali_gemma_handle.client -def get_chicken(): - with open("integration-tests/images/chicken_on_money.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - -def get_cow_beach(): - with open("integration-tests/images/cow_beach.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - @pytest.mark.release @pytest.mark.asyncio @pytest.mark.private -async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot): - cow = get_cow_beach() - inputs = f"![]({cow})Where is the cow standing?\n" +async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot, cow_beach): + inputs = f"![]({cow_beach})Where is the cow standing?\n" response = await flash_pali_gemma.generate(inputs, max_new_tokens=20) assert response.generated_text == "beach" @@ -47,9 +33,9 @@ async def test_flash_pali_gemma(flash_pali_gemma, response_snapshot): @pytest.mark.release @pytest.mark.asyncio 
@pytest.mark.private -async def test_flash_pali_gemma_two_images(flash_pali_gemma, response_snapshot): - chicken = get_chicken() - cow_beach = get_cow_beach() +async def test_flash_pali_gemma_two_images( + flash_pali_gemma, response_snapshot, chicken, cow_beach +): response = await flash_pali_gemma.generate( f"caption![]({chicken})![]({cow_beach})\n", max_new_tokens=20, diff --git a/integration-tests/models/test_flash_phi35_moe.py b/integration-tests/models/test_flash_phi35_moe.py index 2173740ad3c..d3043b028a8 100644 --- a/integration-tests/models/test_flash_phi35_moe.py +++ b/integration-tests/models/test_flash_phi35_moe.py @@ -25,7 +25,7 @@ async def test_flash_phi35_moe(flash_phi35_moe, response_snapshot): assert response.details.generated_tokens == 10 assert ( response.generated_text - == "Gradient descent is a first-order optimization algorithm" + == "Gradient descent is an optimization algorithm commonly used in" ) assert response == response_snapshot @@ -33,7 +33,7 @@ async def test_flash_phi35_moe(flash_phi35_moe, response_snapshot): @pytest.mark.asyncio async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snapshot): response = await flash_phi35_moe.generate( - "What is gradient descent?\n\n", + "What is gradient descent?\n", max_new_tokens=10, repetition_penalty=1.2, return_full_text=True, @@ -51,7 +51,7 @@ async def test_flash_phi35_moe_all_params(flash_phi35_moe, response_snapshot): assert response.details.generated_tokens == 10 assert ( response.generated_text - == "What is gradient descent?\n\nHello! It seems you're addressing a" + == "What is gradient descent?\nGradient Descent (GD) is an" ) assert response == response_snapshot @@ -66,7 +66,7 @@ async def test_flash_phi35_moe_load(flash_phi35_moe, generate_load, response_sna assert responses[0].details.generated_tokens == 10 assert ( responses[0].generated_text - == "Gradient descent is a first-order optimization algorithm" + == "Gradient descent is an optimization algorithm commonly used in" ) assert all( [r.generated_text == responses[0].generated_text for r in responses] diff --git a/integration-tests/models/test_grammar_response_format_llama.py b/integration-tests/models/test_grammar_response_format_llama.py index 25bf9d98eff..eb3268cea4f 100644 --- a/integration-tests/models/test_grammar_response_format_llama.py +++ b/integration-tests/models/test_grammar_response_format_llama.py @@ -25,7 +25,6 @@ async def llama_grammar(llama_grammar_handle): @pytest.mark.release @pytest.mark.asyncio async def test_grammar_response_format_llama_json(llama_grammar, response_snapshot): - class Weather(BaseModel): unit: str temperature: List[int] diff --git a/integration-tests/models/test_idefics.py b/integration-tests/models/test_idefics.py index eb573385b7c..e5d08bb74c6 100644 --- a/integration-tests/models/test_idefics.py +++ b/integration-tests/models/test_idefics.py @@ -1,5 +1,4 @@ import pytest -import base64 @pytest.fixture(scope="module") @@ -16,22 +15,8 @@ async def idefics(idefics_handle): return idefics_handle.client -# TODO fix the server parsser to count inline image tokens correctly -def get_chicken(): - with open("integration-tests/images/chicken_on_money.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - -def get_cow_beach(): - with open("integration-tests/images/cow_beach.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return 
f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - @pytest.mark.asyncio -async def test_idefics(idefics, response_snapshot): - chicken = get_chicken() +async def test_idefics(idefics, response_snapshot, chicken): response = await idefics.generate( f"User:![]({chicken})Can you tell me a very short story based on the image?", max_new_tokens=10, @@ -48,9 +33,7 @@ async def test_idefics(idefics, response_snapshot): @pytest.mark.release @pytest.mark.asyncio @pytest.mark.private -async def test_idefics_two_images(idefics, response_snapshot): - chicken = get_chicken() - cow_beach = get_cow_beach() +async def test_idefics_two_images(idefics, response_snapshot, chicken, cow_beach): response = await idefics.generate( f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken? \nAssistant:", max_new_tokens=20, @@ -63,8 +46,7 @@ async def test_idefics_two_images(idefics, response_snapshot): @pytest.mark.release @pytest.mark.asyncio -async def test_idefics_load(idefics, generate_load, response_snapshot): - chicken = get_chicken() +async def test_idefics_load(idefics, generate_load, response_snapshot, chicken): responses = await generate_load( idefics, f"User:![]({chicken})Can you tell me a very short story based on the image?", diff --git a/integration-tests/models/test_idefics2.py b/integration-tests/models/test_idefics2.py index c5f48da3525..881e37f9b95 100644 --- a/integration-tests/models/test_idefics2.py +++ b/integration-tests/models/test_idefics2.py @@ -1,18 +1,4 @@ import pytest -import base64 - - -# TODO fix the server parsser to count inline image tokens correctly -def get_chicken(): - with open("integration-tests/images/chicken_on_money.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - -def get_cow_beach(): - with open("integration-tests/images/cow_beach.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" @pytest.fixture(scope="module") @@ -31,8 +17,9 @@ async def flash_idefics2_next(flash_idefics2_next_handle): @pytest.mark.asyncio @pytest.mark.private -async def test_flash_idefics2_next_simple(flash_idefics2_next, response_snapshot): - chicken = get_chicken() +async def test_flash_idefics2_next_simple( + flash_idefics2_next, response_snapshot, chicken +): response = await flash_idefics2_next.generate( f"User:![]({chicken})Write me a short story \nAssistant:", max_new_tokens=10, @@ -46,9 +33,9 @@ async def test_flash_idefics2_next_simple(flash_idefics2_next, response_snapshot @pytest.mark.asyncio @pytest.mark.private -async def test_flash_idefics2_two_images(flash_idefics2_next, response_snapshot): - chicken = get_chicken() - cow_beach = get_cow_beach() +async def test_flash_idefics2_two_images( + flash_idefics2_next, response_snapshot, chicken, cow_beach +): response = await flash_idefics2_next.generate( f"User:![]({chicken})![]({cow_beach})Where are the cow and chicken? 
\nAssistant:", max_new_tokens=20, @@ -87,9 +74,8 @@ async def test_flash_idefics2_next_all_params(flash_idefics2_next, response_snap @pytest.mark.asyncio @pytest.mark.private async def test_flash_idefics2_next_load( - flash_idefics2_next, generate_load, response_snapshot + flash_idefics2_next, generate_load, response_snapshot, chicken ): - chicken = get_chicken() responses = await generate_load( flash_idefics2_next, f"User:![]({chicken})Write me a short story \nAssistant:", diff --git a/integration-tests/models/test_llava_next.py b/integration-tests/models/test_llava_next.py index ea277d713e0..1ac8f172db7 100644 --- a/integration-tests/models/test_llava_next.py +++ b/integration-tests/models/test_llava_next.py @@ -1,12 +1,4 @@ import pytest -import base64 - - -# TODO fix the server parsser to count inline image tokens correctly -def get_chicken(): - with open("integration-tests/images/chicken_on_money.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" @pytest.fixture(scope="module") @@ -29,8 +21,7 @@ async def flash_llava_next(flash_llava_next_handle): @pytest.mark.release @pytest.mark.asyncio @pytest.mark.private -async def test_flash_llava_next_simple(flash_llava_next, response_snapshot): - chicken = get_chicken() +async def test_flash_llava_next_simple(flash_llava_next, response_snapshot, chicken): response = await flash_llava_next.generate( f"User:![]({chicken})Can you tell me a very short story based on the image?", max_new_tokens=10, @@ -70,9 +61,8 @@ async def test_flash_llava_next_all_params(flash_llava_next, response_snapshot): @pytest.mark.asyncio @pytest.mark.private async def test_flash_llava_next_load( - flash_llava_next, generate_load, response_snapshot + flash_llava_next, generate_load, response_snapshot, chicken ): - chicken = get_chicken() responses = await generate_load( flash_llava_next, f"User:![]({chicken})Can you tell me a very short story based on the image?", diff --git a/integration-tests/models/test_mllama.py b/integration-tests/models/test_mllama.py index 1b4264aacd1..02781707e05 100644 --- a/integration-tests/models/test_mllama.py +++ b/integration-tests/models/test_mllama.py @@ -1,5 +1,4 @@ import pytest -import base64 import asyncio @@ -15,22 +14,8 @@ async def mllama(mllama_handle): return mllama_handle.client -# TODO fix the server parsser to count inline image tokens correctly -def get_chicken(): - with open("integration-tests/images/chicken_on_money.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - -def get_cow_beach(): - with open("integration-tests/images/cow_beach.png", "rb") as image_file: - encoded_string = base64.b64encode(image_file.read()) - return f"data:image/png;base64,{encoded_string.decode('utf-8')}" - - @pytest.mark.asyncio async def test_mllama_simpl(mllama, response_snapshot): - # chicken = get_chicken() response = await mllama.chat( max_tokens=10, temperature=0.0, diff --git a/launcher/src/main.rs b/launcher/src/main.rs index 42962dffb7b..71bbcbd8cd9 100644 --- a/launcher/src/main.rs +++ b/launcher/src/main.rs @@ -68,7 +68,7 @@ fn get_config( fn resolve_attention(config: &Option, lora_adapters: &Option) -> (String, String) { let compute_capability = gpu::get_cuda_capability(); - let mut prefix_caching: Option = std::env::var("USE_PREFIX_CACHING").ok(); + let mut prefix_caching: Option = std::env::var("PREFIX_CACHING").ok(); let mut attention: 
Option = std::env::var("ATTENTION").ok(); if let Some(config) = config { if prefix_caching.is_none() { @@ -94,7 +94,7 @@ fn resolve_attention(config: &Option, lora_adapters: &Option) -> prefix_caching = Some("0".to_string()); } match config.model_type.as_deref() { - Some("gemma2") | Some("falcon") | Some("deepseek_v2") => { + Some("falcon") | Some("deepseek_v2") => { // Required because gemma2 needs bfloat16 which is not supported by // flashinfer ? if attention.is_none() { @@ -124,6 +124,10 @@ fn resolve_attention(config: &Option, lora_adapters: &Option) -> } } } + if attention == Some("paged".to_string()) && prefix_caching.is_none() { + tracing::info!("Disabling prefix caching on paged attention"); + prefix_caching = Some("0".to_string()); + } let attention = attention.unwrap_or("flashinfer".to_string()); let prefix_caching = prefix_caching.unwrap_or("true".to_string()); @@ -303,6 +307,9 @@ impl std::fmt::Display for Dtype { #[derive(Clone, Copy, Debug, ValueEnum)] enum KVCacheDtype { + #[clap(name = "fp8_e4m3fn")] + Fp8e4m3fn, + #[clap(name = "fp8_e5m2")] Fp8e5m2, } @@ -310,6 +317,9 @@ enum KVCacheDtype { impl std::fmt::Display for KVCacheDtype { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { + KVCacheDtype::Fp8e4m3fn => { + write!(f, "fp8_e4m3fn") + } KVCacheDtype::Fp8e5m2 => { write!(f, "fp8_e5m2") } @@ -420,7 +430,7 @@ struct Args { /// Specify the dtype for the key-value cache. When this option is not provided, /// the dtype of the model is used (typically `float16` or `bfloat16`). Currently - /// the only supported value is `fp8_e5m2` on CUDA. + /// the only supported value are `fp8_e4m3fn` and `fp8_e5m2` on CUDA. #[clap(long, env, value_enum)] kv_cache_dtype: Option, @@ -1094,6 +1104,8 @@ fn log_lines(mut bufread: BufReader) { } } } + } else { + break; } } } @@ -1497,6 +1509,10 @@ fn spawn_webserver( router_args.push(revision.to_string()) } + if args.trust_remote_code { + router_args.push("--trust-remote-code".to_string()); + } + if args.json_output { router_args.push("--json-output".to_string()); } @@ -1678,7 +1694,7 @@ fn main() -> Result<(), LauncherError> { }; let (prefix_caching, attention) = resolve_attention(&config, &args.lora_adapters); tracing::info!("Using attention {attention} - Prefix caching {prefix_caching}"); - std::env::set_var("USE_PREFIX_CACHING", prefix_caching); + std::env::set_var("PREFIX_CACHING", prefix_caching); std::env::set_var("ATTENTION", attention); let max_input_tokens = { @@ -1729,12 +1745,6 @@ fn main() -> Result<(), LauncherError> { "`max_input_tokens must be < `max_total_tokens`".to_string(), )); } - if max_input_tokens as u32 > max_batch_prefill_tokens { - return Err(LauncherError::ArgumentValidation(format!( - "`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {} and {}", - max_batch_prefill_tokens, max_input_tokens - ))); - } if matches!(args.quantize, Some(Quantization::Bitsandbytes)) { tracing::warn!("Bitsandbytes is deprecated, use `eetq` instead, which provides better latencies overall and is drop-in in most cases."); @@ -1788,12 +1798,6 @@ fn main() -> Result<(), LauncherError> { } if let Some(ref max_batch_total_tokens) = args.max_batch_total_tokens { - if max_batch_prefill_tokens > *max_batch_total_tokens { - return Err(LauncherError::ArgumentValidation(format!( - "`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. 
Given: {} and {}", - max_batch_prefill_tokens, max_batch_total_tokens - ))); - } if max_total_tokens as u32 > *max_batch_total_tokens { return Err(LauncherError::ArgumentValidation(format!( "`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {} and {}", diff --git a/nix/impure-shell.nix b/nix/impure-shell.nix index abed544a155..92e14bc3cb0 100644 --- a/nix/impure-shell.nix +++ b/nix/impure-shell.nix @@ -1,7 +1,12 @@ { + lib, mkShell, black, + cmake, isort, + ninja, + which, + cudaPackages, openssl, pkg-config, protobuf, @@ -11,14 +16,17 @@ ruff, rust-bin, server, + + # Enable dependencies for building CUDA packages. Useful for e.g. + # developing marlin/moe-kernels in-place. + withCuda ? false, }: mkShell { - buildInputs = + nativeBuildInputs = [ black isort - openssl.dev pkg-config (rust-bin.stable.latest.default.override { extensions = [ @@ -31,6 +39,19 @@ mkShell { redocly ruff ] + ++ (lib.optionals withCuda [ + cmake + ninja + which + + # For most Torch-based extensions, setting CUDA_HOME is enough, but + # some custom CMake builds (e.g. vLLM) also need to have nvcc in PATH. + cudaPackages.cuda_nvcc + ]); + buildInputs = + [ + openssl.dev + ] ++ (with python3.pkgs; [ venvShellHook docker @@ -40,10 +61,29 @@ mkShell { pytest pytest-asyncio syrupy - ]); + ]) + ++ (lib.optionals withCuda ( + with cudaPackages; + [ + cuda_cccl + cuda_cudart + cuda_nvrtc + cuda_nvtx + cuda_profiler_api + cudnn + libcublas + libcusolver + libcusparse + ] + )); inputsFrom = [ server ]; + env = lib.optionalAttrs withCuda { + CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}"; + TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" python3.pkgs.torch.cudaCapabilities; + }; + venvDir = "./.venv"; postVenvCreation = '' @@ -51,6 +91,7 @@ mkShell { ( cd server ; python -m pip install --no-dependencies -e . ) ( cd clients/python ; python -m pip install --no-dependencies -e . ) ''; + postShellHook = '' unset SOURCE_DATE_EPOCH export PATH=$PATH:~/.cargo/bin diff --git a/proto/v3/generate.proto b/proto/v3/generate.proto index 34894bdaba4..c91e7cc43b2 100644 --- a/proto/v3/generate.proto +++ b/proto/v3/generate.proto @@ -34,6 +34,10 @@ message InfoResponse { string device_type = 3; optional uint32 window_size = 4; uint32 speculate = 5; + bool support_chunking = 6; + bool use_prefix_caching = 7; + string attention_impl = 8; + uint32 block_size = 9; } /// Empty request @@ -135,10 +139,14 @@ message Request { repeated uint32 slots = 10; /// LORA adapter index optional string adapter_id = 11; - /// Prefix length that can be retrieved from the KV cache. - uint32 prefix_len = 12; + /// Tokens that can be retrieved from the KV cache. 
+ /// This value is set for the first prefill and never reset + uint32 cache_len = 12; /// Context truncation bool add_special_tokens = 13; + /// Chunk of tokens that must be computed for the first prefill + /// This value is set for the first prefill and never reset + optional uint32 chunk_len = 14; } message Batch { @@ -163,6 +171,8 @@ message CachedBatch { uint32 size = 3; /// Maximum number of tokens this batch will grow to uint32 max_tokens = 4; + /// Number of tokens in the next forward + uint32 current_tokens = 5; } enum FinishReason { @@ -220,6 +230,8 @@ message FilterBatchResponse { message PrefillRequest { /// Batch Batch batch = 1; + /// Optional cached batch + CachedBatch cached_batch = 2; } message PrefillResponse { @@ -233,6 +245,8 @@ message PrefillResponse { uint64 decode_ns = 4; /// Total elapsed time in nanoseconds uint64 total_ns = 5; + /// Concatenate elapsed time in nanoseconds + optional uint64 concat_ns = 6; } message DecodeRequest { diff --git a/router/src/config.rs b/router/src/config.rs index 1a20c40bb67..7139b923768 100644 --- a/router/src/config.rs +++ b/router/src/config.rs @@ -150,6 +150,7 @@ pub enum Config { Idefics2(Idefics2), Ssm, GptBigcode, + Granite, Santacoder, Bloom, Mpt, diff --git a/router/src/lib.rs b/router/src/lib.rs index b29c9395d91..7c40c7e3dd6 100644 --- a/router/src/lib.rs +++ b/router/src/lib.rs @@ -8,6 +8,7 @@ pub mod validation; mod kserve; pub mod logging; +mod sagemaker; pub mod usage_stats; mod vertex; @@ -18,45 +19,6 @@ use tracing::warn; use utoipa::ToSchema; use validation::Validation; -#[derive(PartialEq)] -pub enum Attention { - Paged, - FlashDecoding, - FlashInfer, -} - -impl Attention { - pub fn block_size(&self) -> u32 { - match self { - Attention::FlashDecoding => 256, - Attention::FlashInfer => 1, - Attention::Paged => 16, - } - } -} - -#[derive(Debug)] -pub struct ParseError; - -impl std::fmt::Display for ParseError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Cannot parse attention value") - } -} -impl std::error::Error for ParseError {} - -impl std::str::FromStr for Attention { - type Err = ParseError; - fn from_str(s: &str) -> Result { - match s { - "paged" => Ok(Attention::Paged), - "flashdecoding" => Ok(Attention::FlashDecoding), - "flashinfer" => Ok(Attention::FlashInfer), - _ => Err(ParseError), - } - } -} - /// Hub type #[derive(Clone, Debug, Deserialize)] pub struct HubModelInfo { diff --git a/router/src/main.rs.back b/router/src/main.rs.back deleted file mode 100644 index 36879aa478b..00000000000 --- a/router/src/main.rs.back +++ /dev/null @@ -1,748 +0,0 @@ -use axum::http::HeaderValue; -use clap::Parser; -use clap::Subcommand; -use hf_hub::api::tokio::{Api, ApiBuilder, ApiRepo}; -use hf_hub::{Cache, Repo, RepoType}; -use opentelemetry::sdk::propagation::TraceContextPropagator; -use opentelemetry::sdk::trace; -use opentelemetry::sdk::trace::Sampler; -use opentelemetry::sdk::Resource; -use opentelemetry::{global, KeyValue}; -use opentelemetry_otlp::WithExportConfig; -use std::fs::File; -use std::io::BufReader; -use std::net::{IpAddr, Ipv4Addr, SocketAddr}; -use std::path::{Path, PathBuf}; -use text_generation_router::config::Config; -use text_generation_router::usage_stats; -use text_generation_router::{ - server, HubModelInfo, HubPreprocessorConfig, HubProcessorConfig, HubTokenizerConfig, -}; -use thiserror::Error; -use tokenizers::{processors::template::TemplateProcessing, Tokenizer}; -use tower_http::cors::AllowOrigin; -use tracing_subscriber::layer::SubscriberExt; -use 
tracing_subscriber::util::SubscriberInitExt; -use tracing_subscriber::{filter::LevelFilter, EnvFilter, Layer}; - -/// App Configuration -#[derive(Parser, Debug)] -#[clap(author, version, about, long_about = None)] -struct Args { - #[command(subcommand)] - command: Option, - - #[clap(default_value = "128", long, env)] - max_concurrent_requests: usize, - #[clap(default_value = "2", long, env)] - max_best_of: usize, - #[clap(default_value = "4", long, env)] - max_stop_sequences: usize, - #[clap(default_value = "5", long, env)] - max_top_n_tokens: u32, - #[clap(default_value = "1024", long, env)] - max_input_tokens: usize, - #[clap(default_value = "2048", long, env)] - max_total_tokens: usize, - #[clap(default_value = "1.2", long, env)] - waiting_served_ratio: f32, - #[clap(default_value = "4096", long, env)] - max_batch_prefill_tokens: u32, - #[clap(long, env)] - max_batch_total_tokens: Option, - #[clap(default_value = "20", long, env)] - max_waiting_tokens: usize, - #[clap(long, env)] - max_batch_size: Option, - #[clap(default_value = "0.0.0.0", long, env)] - hostname: String, - #[clap(default_value = "3000", long, short, env)] - port: u16, - #[clap(default_value = "/tmp/text-generation-server-0", long, env)] - master_shard_uds_path: String, - #[clap(default_value = "bigscience/bloom", long, env)] - tokenizer_name: String, - #[clap(long, env)] - tokenizer_config_path: Option, - #[clap(long, env)] - revision: Option, - #[clap(default_value = "2", long, env)] - validation_workers: usize, - #[clap(long, env)] - json_output: bool, - #[clap(long, env)] - otlp_endpoint: Option, - #[clap(default_value = "text-generation-inference.router", long, env)] - otlp_service_name: String, - #[clap(long, env)] - cors_allow_origin: Option>, - #[clap(long, env)] - api_key: Option, - #[clap(long, env)] - ngrok: bool, - #[clap(long, env)] - ngrok_authtoken: Option, - #[clap(long, env)] - ngrok_edge: Option, - #[clap(long, env, default_value_t = false)] - messages_api_enabled: bool, - #[clap(long, env, default_value_t = false)] - disable_grammar_support: bool, - #[clap(default_value = "4", long, env)] - max_client_batch_size: usize, - #[clap(long, env, default_value_t)] - disable_usage_stats: bool, - #[clap(long, env, default_value_t)] - disable_crash_reports: bool, -} - -#[derive(Debug, Subcommand)] -enum Commands { - PrintSchema, -} - -#[tokio::main] -async fn main() -> Result<(), RouterError> { - let args = Args::parse(); - - // Pattern match configuration - let Args { - max_concurrent_requests, - max_best_of, - max_stop_sequences, - max_top_n_tokens, - max_input_tokens, - max_total_tokens, - waiting_served_ratio, - max_batch_prefill_tokens, - max_batch_total_tokens, - max_waiting_tokens, - max_batch_size, - hostname, - port, - master_shard_uds_path, - tokenizer_name, - tokenizer_config_path, - revision, - validation_workers, - json_output, - otlp_endpoint, - otlp_service_name, - cors_allow_origin, - api_key, - ngrok, - ngrok_authtoken, - ngrok_edge, - messages_api_enabled, - disable_grammar_support, - max_client_batch_size, - disable_usage_stats, - disable_crash_reports, - command, - } = args; - - let print_schema_command = match command { - Some(Commands::PrintSchema) => true, - None => { - // only init logging if we are not running the print schema command - init_logging(otlp_endpoint, otlp_service_name, json_output); - false - } - }; - - // Validate args - if max_input_tokens >= max_total_tokens { - return Err(RouterError::ArgumentValidation( - "`max_input_tokens` must be < `max_total_tokens`".to_string(), 
- )); - } - if max_input_tokens as u32 > max_batch_prefill_tokens { - return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. Given: {max_batch_prefill_tokens} and {max_input_tokens}"))); - } - - if validation_workers == 0 { - return Err(RouterError::ArgumentValidation( - "`validation_workers` must be > 0".to_string(), - )); - } - - if let Some(ref max_batch_total_tokens) = max_batch_total_tokens { - if max_batch_prefill_tokens > *max_batch_total_tokens { - return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}"))); - } - if max_total_tokens as u32 > *max_batch_total_tokens { - return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}"))); - } - } - - // CORS allowed origins - // map to go inside the option and then map to parse from String to HeaderValue - // Finally, convert to AllowOrigin - let cors_allow_origin: Option = cors_allow_origin.map(|cors_allow_origin| { - AllowOrigin::list( - cors_allow_origin - .iter() - .map(|origin| origin.parse::().unwrap()), - ) - }); - - // Parse Huggingface hub token - let authorization_token = std::env::var("HF_TOKEN") - .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) - .ok(); - - // Tokenizer instance - // This will only be used to validate payloads - let local_path = Path::new(&tokenizer_name); - - // Shared API builder initialization - let api_builder = || { - let mut builder = ApiBuilder::new() - .with_progress(false) - .with_token(authorization_token); - - if let Ok(cache_dir) = std::env::var("HUGGINGFACE_HUB_CACHE") { - builder = builder.with_cache_dir(cache_dir.into()); - } - - builder - }; - - // Decide if we need to use the API based on the revision and local path - let use_api = revision.is_some() || !local_path.exists() || !local_path.is_dir(); - - // Initialize API if needed - #[derive(Clone)] - enum Type { - Api(Api), - Cache(Cache), - None, - } - let api = if use_api { - if std::env::var("HF_HUB_OFFLINE") == Ok("1".to_string()) { - let cache = std::env::var("HUGGINGFACE_HUB_CACHE") - .map_err(|_| ()) - .map(|cache_dir| Cache::new(cache_dir.into())) - .unwrap_or_else(|_| Cache::default()); - - tracing::warn!("Offline mode active using cache defaults"); - Type::Cache(cache) - } else { - tracing::info!("Using the Hugging Face API"); - match api_builder().build() { - Ok(api) => Type::Api(api), - Err(_) => { - tracing::warn!("Unable to build the Hugging Face API"); - Type::None - } - } - } - } else { - Type::None - }; - - // Load tokenizer and model info - let ( - tokenizer_filename, - config_filename, - tokenizer_config_filename, - preprocessor_config_filename, - processor_config_filename, - model_info, - ) = match api { - Type::None => ( - Some(local_path.join("tokenizer.json")), - Some(local_path.join("config.json")), - Some(local_path.join("tokenizer_config.json")), - Some(local_path.join("preprocessor_config.json")), - Some(local_path.join("processor_config.json")), - None, - ), - Type::Api(api) => { - let api_repo = api.repo(Repo::with_revision( - tokenizer_name.to_string(), - RepoType::Model, - revision.clone().unwrap_or_else(|| "main".to_string()), - )); - - let tokenizer_filename = match api_repo.get("tokenizer.json").await { - Ok(tokenizer_filename) => Some(tokenizer_filename), - Err(_) => get_base_tokenizer(&api, &api_repo).await, - }; - let 
config_filename = api_repo.get("config.json").await.ok(); - let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok(); - let preprocessor_config_filename = api_repo.get("preprocessor_config.json").await.ok(); - let processor_config_filename = api_repo.get("processor_config.json").await.ok(); - - let model_info = if let Some(model_info) = get_model_info(&api_repo).await { - Some(model_info) - } else { - tracing::warn!("Could not retrieve model info from the Hugging Face hub."); - None - }; - ( - tokenizer_filename, - config_filename, - tokenizer_config_filename, - preprocessor_config_filename, - processor_config_filename, - model_info, - ) - } - Type::Cache(cache) => { - let repo = cache.repo(Repo::with_revision( - tokenizer_name.to_string(), - RepoType::Model, - revision.clone().unwrap_or_else(|| "main".to_string()), - )); - ( - repo.get("tokenizer.json"), - repo.get("config.json"), - repo.get("tokenizer_config.json"), - repo.get("preprocessor_config.json"), - repo.get("processor_config.json"), - None, - ) - } - }; - let config: Option = config_filename.and_then(|filename| { - std::fs::read_to_string(filename) - .ok() - .as_ref() - .and_then(|c| { - let config: Result = serde_json::from_str(c); - if let Err(err) = &config { - tracing::warn!("Could not parse config {err:?}"); - } - config.ok() - }) - }); - let model_info = model_info.unwrap_or_else(|| HubModelInfo { - model_id: tokenizer_name.to_string(), - sha: None, - pipeline_tag: None, - }); - - // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'. - let tokenizer_config: Option = if let Some(filename) = tokenizer_config_path - { - HubTokenizerConfig::from_file(filename) - } else { - tokenizer_config_filename.and_then(HubTokenizerConfig::from_file) - }; - let tokenizer_config = tokenizer_config.unwrap_or_else(|| { - tracing::warn!("Could not find tokenizer config locally and no API specified"); - HubTokenizerConfig::default() - }); - let tokenizer_class = tokenizer_config.tokenizer_class.clone(); - - let tokenizer: Option = tokenizer_filename.and_then(|filename| { - let mut tokenizer = Tokenizer::from_file(filename).ok(); - if let Some(tokenizer) = &mut tokenizer { - if let Some(class) = &tokenizer_config.tokenizer_class { - if class == "LlamaTokenizer" || class == "LlamaTokenizerFast"{ - if let Ok(post_processor) = create_post_processor(tokenizer, &tokenizer_config) { - tracing::info!("Overriding LlamaTokenizer with TemplateProcessing to follow python override defined in https://github.com/huggingface/transformers/blob/4aa17d00690b7f82c95bb2949ea57e22c35b4336/src/transformers/models/llama/tokenization_llama_fast.py#L203-L205"); - tokenizer.with_post_processor(post_processor); - } - } - } - } - tokenizer - }); - - let preprocessor_config = - preprocessor_config_filename.and_then(HubPreprocessorConfig::from_file); - let processor_config = processor_config_filename - .and_then(HubProcessorConfig::from_file) - .unwrap_or_default(); - - tracing::info!("Using config {config:?}"); - if tokenizer.is_none() { - tracing::warn!("Could not find a fast tokenizer implementation for {tokenizer_name}"); - tracing::warn!("Rust input length validation and truncation is disabled"); - } - - // if pipeline-tag == text-generation we default to return_full_text = true - let compat_return_full_text = match &model_info.pipeline_tag { - None => { - tracing::warn!("no pipeline tag found for model {tokenizer_name}"); - true - } - Some(pipeline_tag) => pipeline_tag.as_str() == "text-generation", - }; - - // Determine 
the server port based on the feature and environment variable. - let port = if cfg!(feature = "google") { - std::env::var("AIP_HTTP_PORT") - .map(|aip_http_port| aip_http_port.parse::().unwrap_or(port)) - .unwrap_or(port) - } else { - port - }; - - let addr = match hostname.parse() { - Ok(ip) => SocketAddr::new(ip, port), - Err(_) => { - tracing::warn!("Invalid hostname, defaulting to 0.0.0.0"); - SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), port) - } - }; - - // Only send usage stats when TGI is run in container and the function returns Some - let is_container = matches!(usage_stats::is_container(), Ok(true)); - - let user_agent = if !disable_usage_stats && is_container { - let reduced_args = usage_stats::Args::new( - config.clone(), - tokenizer_class, - max_concurrent_requests, - max_best_of, - max_stop_sequences, - max_top_n_tokens, - max_input_tokens, - max_total_tokens, - waiting_served_ratio, - max_batch_prefill_tokens, - max_batch_total_tokens, - max_waiting_tokens, - max_batch_size, - revision, - validation_workers, - messages_api_enabled, - disable_grammar_support, - max_client_batch_size, - disable_usage_stats, - disable_crash_reports, - ); - Some(usage_stats::UserAgent::new(reduced_args)) - } else { - None - }; - - if let Some(ref ua) = user_agent { - let start_event = - usage_stats::UsageStatsEvent::new(ua.clone(), usage_stats::EventType::Start, None); - tokio::spawn(async move { - start_event.send().await; - }); - }; - - // Run server - let result = server::run( - master_shard_uds_path, - model_info, - compat_return_full_text, - max_concurrent_requests, - max_best_of, - max_stop_sequences, - max_top_n_tokens, - max_input_tokens, - max_total_tokens, - waiting_served_ratio, - max_batch_prefill_tokens, - max_batch_total_tokens, - max_waiting_tokens, - max_batch_size, - tokenizer, - config, - validation_workers, - addr, - cors_allow_origin, - api_key, - ngrok, - ngrok_authtoken, - ngrok_edge, - tokenizer_config, - preprocessor_config, - processor_config, - messages_api_enabled, - disable_grammar_support, - max_client_batch_size, - print_schema_command, - ) - .await; - - match result { - Ok(_) => { - if let Some(ref ua) = user_agent { - let stop_event = usage_stats::UsageStatsEvent::new( - ua.clone(), - usage_stats::EventType::Stop, - None, - ); - stop_event.send().await; - }; - Ok(()) - } - Err(e) => { - if let Some(ref ua) = user_agent { - if !disable_crash_reports { - let error_event = usage_stats::UsageStatsEvent::new( - ua.clone(), - usage_stats::EventType::Error, - Some(e.to_string()), - ); - error_event.send().await; - } else { - let unknow_error_event = usage_stats::UsageStatsEvent::new( - ua.clone(), - usage_stats::EventType::Error, - Some("unknow_error".to_string()), - ); - unknow_error_event.send().await; - } - }; - Err(RouterError::WebServer(e)) - } - } -} - -/// Init logging using env variables LOG_LEVEL and LOG_FORMAT: -/// - otlp_endpoint is an optional URL to an Open Telemetry collector -/// - otlp_service_name service name to appear in APM -/// - LOG_LEVEL may be TRACE, DEBUG, INFO, WARN or ERROR (default to INFO) -/// - LOG_FORMAT may be TEXT or JSON (default to TEXT) -/// - LOG_COLORIZE may be "false" or "true" (default to "true" or ansi supported platforms) -fn init_logging(otlp_endpoint: Option, otlp_service_name: String, json_output: bool) { - let mut layers = Vec::new(); - - // STDOUT/STDERR layer - let ansi = std::env::var("LOG_COLORIZE") != Ok("1".to_string()); - let fmt_layer = tracing_subscriber::fmt::layer() - .with_file(true) - 
.with_ansi(ansi) - .with_line_number(true); - - let fmt_layer = match json_output { - true => fmt_layer.json().flatten_event(true).boxed(), - false => fmt_layer.boxed(), - }; - layers.push(fmt_layer); - - // OpenTelemetry tracing layer - if let Some(otlp_endpoint) = otlp_endpoint { - global::set_text_map_propagator(TraceContextPropagator::new()); - - let tracer = opentelemetry_otlp::new_pipeline() - .tracing() - .with_exporter( - opentelemetry_otlp::new_exporter() - .tonic() - .with_endpoint(otlp_endpoint), - ) - .with_trace_config( - trace::config() - .with_resource(Resource::new(vec![KeyValue::new( - "service.name", - otlp_service_name, - )])) - .with_sampler(Sampler::AlwaysOn), - ) - .install_batch(opentelemetry::runtime::Tokio); - - if let Ok(tracer) = tracer { - layers.push(tracing_opentelemetry::layer().with_tracer(tracer).boxed()); - init_tracing_opentelemetry::init_propagator().unwrap(); - }; - } - - // Filter events with LOG_LEVEL - let varname = "LOG_LEVEL"; - let env_filter = if let Ok(log_level) = std::env::var(varname) { - // Override to avoid simple logs to be spammed with tokio level informations - let log_level = match &log_level[..] { - "warn" => "text_generation_launcher=warn,text_generation_router=warn", - "info" => "text_generation_launcher=info,text_generation_router=info", - "debug" => "text_generation_launcher=debug,text_generation_router=debug", - log_level => log_level, - }; - EnvFilter::builder() - .with_default_directive(LevelFilter::INFO.into()) - .parse_lossy(log_level) - } else { - EnvFilter::new("info") - }; - - tracing_subscriber::registry() - .with(env_filter) - .with(layers) - .init(); -} - -/// get model info from the Huggingface Hub -pub async fn get_model_info(api: &ApiRepo) -> Option { - let response = api.info_request().send().await.ok()?; - - if response.status().is_success() { - let hub_model_info: HubModelInfo = - serde_json::from_str(&response.text().await.ok()?).ok()?; - if let Some(sha) = &hub_model_info.sha { - tracing::info!( - "Serving revision {sha} of model {}", - hub_model_info.model_id - ); - } - Some(hub_model_info) - } else { - None - } -} - -/// get base tokenizer -pub async fn get_base_tokenizer(api: &Api, api_repo: &ApiRepo) -> Option { - let config_filename = api_repo.get("config.json").await.ok()?; - - // Open the file in read-only mode with buffer. - let file = File::open(config_filename).ok()?; - let reader = BufReader::new(file); - - // Read the JSON contents of the file as an instance of `User`. - let config: serde_json::Value = serde_json::from_reader(reader).ok()?; - - if let Some(serde_json::Value::String(base_model_id)) = config.get("base_model_name_or_path") { - let api_base_repo = api.repo(Repo::with_revision( - base_model_id.to_string(), - RepoType::Model, - "main".to_string(), - )); - - api_base_repo.get("tokenizer.json").await.ok() - } else { - None - } -} - -/// get tokenizer_config from the Huggingface Hub -pub async fn get_tokenizer_config(api_repo: &ApiRepo) -> Option { - let tokenizer_config_filename = api_repo.get("tokenizer_config.json").await.ok()?; - - // Open the file in read-only mode with buffer. - let file = File::open(tokenizer_config_filename).ok()?; - let reader = BufReader::new(file); - - // Read the JSON contents of the file as an instance of 'HubTokenizerConfig'. 
- let tokenizer_config: HubTokenizerConfig = serde_json::from_reader(reader) - .map_err(|e| { - tracing::warn!("Unable to parse tokenizer config: {}", e); - e - }) - .ok()?; - - Some(tokenizer_config) -} - -/// Create a post_processor for the LlamaTokenizer -pub fn create_post_processor( - tokenizer: &Tokenizer, - tokenizer_config: &HubTokenizerConfig, -) -> Result { - let add_bos_token = tokenizer_config.add_bos_token.unwrap_or(true); - let add_eos_token = tokenizer_config.add_eos_token.unwrap_or(false); - - let bos_token = tokenizer_config.bos_token.as_ref(); - let eos_token = tokenizer_config.eos_token.as_ref(); - - if add_bos_token && bos_token.is_none() { - panic!("add_bos_token = true but bos_token is None"); - } - - if add_eos_token && eos_token.is_none() { - panic!("add_eos_token = true but eos_token is None"); - } - - let mut single = Vec::new(); - let mut pair = Vec::new(); - let mut special_tokens = Vec::new(); - - if add_bos_token { - if let Some(bos) = bos_token { - let bos_token_id = tokenizer - .token_to_id(bos.as_str()) - .expect("Should have found the bos token id"); - special_tokens.push((bos.as_str(), bos_token_id)); - single.push(format!("{}:0", bos.as_str())); - pair.push(format!("{}:0", bos.as_str())); - } - } - - single.push("$A:0".to_string()); - pair.push("$A:0".to_string()); - - if add_eos_token { - if let Some(eos) = eos_token { - let eos_token_id = tokenizer - .token_to_id(eos.as_str()) - .expect("Should have found the eos token id"); - special_tokens.push((eos.as_str(), eos_token_id)); - single.push(format!("{}:0", eos.as_str())); - pair.push(format!("{}:0", eos.as_str())); - } - } - - if add_bos_token { - if let Some(bos) = bos_token { - pair.push(format!("{}:1", bos.as_str())); - } - } - - pair.push("$B:1".to_string()); - - if add_eos_token { - if let Some(eos) = eos_token { - pair.push(format!("{}:1", eos.as_str())); - } - } - - let post_processor = TemplateProcessing::builder() - .try_single(single)? - .try_pair(pair)? 
- .special_tokens(special_tokens) - .build()?; - - Ok(post_processor) -} - -#[derive(Debug, Error)] -enum RouterError { - #[error("Argument validation error: {0}")] - ArgumentValidation(String), - #[error("WebServer error: {0}")] - WebServer(#[from] server::WebServerError), - #[error("Tokio runtime failed to start: {0}")] - Tokio(#[from] std::io::Error), -} - -#[cfg(test)] -mod tests { - use super::*; - use text_generation_router::TokenizerConfigToken; - - #[test] - fn test_create_post_processor() { - let tokenizer_config = HubTokenizerConfig { - add_bos_token: None, - add_eos_token: None, - bos_token: Some(TokenizerConfigToken::String("".to_string())), - eos_token: Some(TokenizerConfigToken::String("".to_string())), - chat_template: None, - tokenizer_class: None, - completion_template: None, - }; - - let tokenizer = - Tokenizer::from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", None).unwrap(); - let post_processor = create_post_processor(&tokenizer, &tokenizer_config).unwrap(); - - let expected = TemplateProcessing::builder() - .try_single(":0 $A:0") - .unwrap() - .try_pair(":0 $A:0 :1 $B:1") - .unwrap() - .special_tokens(vec![("".to_string(), 1)]) - .build() - .unwrap(); - - assert_eq!(post_processor, expected); - } -} diff --git a/router/src/sagemaker.rs b/router/src/sagemaker.rs new file mode 100644 index 00000000000..750ef222bb5 --- /dev/null +++ b/router/src/sagemaker.rs @@ -0,0 +1,82 @@ +use crate::infer::Infer; +use crate::server::{chat_completions, compat_generate, completions, ComputeType}; +use crate::{ + ChatCompletion, ChatCompletionChunk, ChatRequest, Chunk, CompatGenerateRequest, + CompletionFinal, CompletionRequest, ErrorResponse, GenerateResponse, Info, StreamResponse, +}; +use axum::extract::Extension; +use axum::http::StatusCode; +use axum::response::Response; +use axum::Json; +use serde::{Deserialize, Serialize}; +use tracing::instrument; +use utoipa::ToSchema; + +#[derive(Clone, Deserialize, ToSchema)] +#[serde(untagged)] +pub(crate) enum SagemakerRequest { + Generate(CompatGenerateRequest), + Chat(ChatRequest), + Completion(CompletionRequest), +} + +// Used for OpenAPI specs +#[allow(dead_code)] +#[derive(Serialize, ToSchema)] +#[serde(untagged)] +pub(crate) enum SagemakerResponse { + Generate(GenerateResponse), + Chat(ChatCompletion), + Completion(CompletionFinal), +} + +// Used for OpenAPI specs +#[allow(dead_code)] +#[derive(Serialize, ToSchema)] +#[serde(untagged)] +pub(crate) enum SagemakerStreamResponse { + Generate(StreamResponse), + Chat(ChatCompletionChunk), + Completion(Chunk), +} + +/// Generate tokens from Sagemaker request +#[utoipa::path( +post, +tag = "Text Generation Inference", +path = "/invocations", +request_body = SagemakerRequest, +responses( +(status = 200, description = "Generated Chat Completion", +content( +("application/json" = SagemakerResponse), +("text/event-stream" = SagemakerStreamResponse), +)), +(status = 424, description = "Generation Error", body = ErrorResponse, +example = json ! ({"error": "Request failed during generation", "error_type": "generation"})), +(status = 429, description = "Model is overloaded", body = ErrorResponse, +example = json ! ({"error": "Model is overloaded", "error_type": "overloaded"})), +(status = 422, description = "Input validation error", body = ErrorResponse, +example = json ! ({"error": "Input validation error", "error_type": "validation"})), +(status = 500, description = "Incomplete generation", body = ErrorResponse, +example = json ! 
({"error": "Incomplete generation", "error_type": "incomplete_generation"})), +) +)] +#[instrument(skip_all)] +pub(crate) async fn sagemaker_compatibility( + default_return_full_text: Extension<bool>, + infer: Extension<Infer>, + compute_type: Extension<ComputeType>, + info: Extension<Info>, + Json(req): Json<SagemakerRequest>, +) -> Result<Response, (StatusCode, Json<ErrorResponse>)> { + match req { + SagemakerRequest::Generate(req) => { + compat_generate(default_return_full_text, infer, compute_type, Json(req)).await + } + SagemakerRequest::Chat(req) => chat_completions(infer, compute_type, info, Json(req)).await, + SagemakerRequest::Completion(req) => { + completions(infer, compute_type, info, Json(req)).await + } + } +} diff --git a/router/src/server.rs b/router/src/server.rs index 3a45851b2f1..bedcc7560db 100644 --- a/router/src/server.rs +++ b/router/src/server.rs @@ -8,6 +8,10 @@ use crate::kserve::{ kserve_model_metadata, kserve_model_metadata_ready, }; use crate::logging::trace_context_middleware; +use crate::sagemaker::{ + sagemaker_compatibility, SagemakerRequest, SagemakerResponse, SagemakerStreamResponse, + __path_sagemaker_compatibility, +}; use crate::validation::ValidationError; use crate::vertex::vertex_compatibility; use crate::ChatTokenizeResponse; @@ -85,7 +89,7 @@ example = json ! ({"error": "Incomplete generation"})), ) )] #[instrument(skip(infer, req))] -async fn compat_generate( +pub(crate) async fn compat_generate( Extension(default_return_full_text): Extension<bool>, infer: Extension<Infer>, compute_type: Extension<ComputeType>, @@ -694,7 +698,7 @@ time_per_token, seed, ) )] -async fn completions( +pub(crate) async fn completions( Extension(infer): Extension<Infer>, Extension(compute_type): Extension<ComputeType>, Extension(info): Extension<Info>, @@ -1223,7 +1227,7 @@ time_per_token, seed, ) )] -async fn chat_completions( +pub(crate) async fn chat_completions( Extension(infer): Extension<Infer>, Extension(compute_type): Extension<ComputeType>, Extension(info): Extension<Info>, @@ -1539,11 +1543,13 @@ completions, tokenize, metrics, openai_get_model_info, +sagemaker_compatibility, ), components( schemas( Info, CompatGenerateRequest, +SagemakerRequest, GenerateRequest, GrammarType, ChatRequest, @@ -1566,6 +1572,8 @@ ChatCompletionTopLogprob, ChatCompletion, CompletionRequest, CompletionComplete, +SagemakerResponse, +SagemakerStreamResponse, Chunk, Completion, CompletionFinal, @@ -1627,13 +1635,13 @@ pub async fn run( tokenizer_name: String, tokenizer_config_path: Option<String>, revision: Option<String>, + trust_remote_code: bool, hostname: String, port: u16, cors_allow_origin: Option<Vec<String>>, ngrok: bool, _ngrok_authtoken: Option<String>, _ngrok_edge: Option<String>, - messages_api_enabled: bool, disable_grammar_support: bool, max_client_batch_size: usize, usage_stats_level: usage_stats::UsageStatsLevel, @@ -1787,10 +1795,13 @@ pub async fn run( let auto = transformers.getattr("AutoTokenizer")?; let from_pretrained = auto.getattr("from_pretrained")?; let args = (tokenizer_name.to_string(),); - let kwargs = [( - "revision", - revision.clone().unwrap_or_else(|| "main".to_string()), - )] + let kwargs = [ + ( + "revision", + (revision.clone().unwrap_or_else(|| "main".to_string())).into_py(py), + ), + ("trust_remote_code", trust_remote_code.into_py(py)), + ] .into_py_dict_bound(py); let tokenizer = from_pretrained.call(args, Some(&kwargs))?; let save = tokenizer.getattr("save_pretrained")?; @@ -1862,7 +1873,6 @@ pub async fn run( // max_batch_size, revision.clone(), validation_workers, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats_level, @@ -1904,7 +1914,6 @@ pub async fn run( ngrok, _ngrok_authtoken, _ngrok_edge, - messages_api_enabled,
disable_grammar_support, max_client_batch_size, model_info, @@ -1964,7 +1973,6 @@ async fn start( ngrok: bool, _ngrok_authtoken: Option, _ngrok_edge: Option, - messages_api_enabled: bool, disable_grammar_support: bool, max_client_batch_size: usize, model_info: HubModelInfo, @@ -2279,6 +2287,7 @@ async fn start( .route("/v1/chat/completions", post(chat_completions)) .route("/v1/completions", post(completions)) .route("/vertex", post(vertex_compatibility)) + .route("/invocations", post(sagemaker_compatibility)) .route("/tokenize", post(tokenize)); if let Some(api_key) = api_key { @@ -2314,13 +2323,6 @@ async fn start( .route("/metrics", get(metrics)) .route("/v1/models", get(openai_get_model_info)); - // Conditional AWS Sagemaker route - let aws_sagemaker_route = if messages_api_enabled { - Router::new().route("/invocations", post(chat_completions)) // Use 'chat_completions' for OAI_ENABLED - } else { - Router::new().route("/invocations", post(compat_generate)) // Use 'compat_generate' otherwise - }; - let compute_type = ComputeType(std::env::var("COMPUTE_TYPE").unwrap_or("gpu+optimized".to_string())); @@ -2328,8 +2330,7 @@ async fn start( let mut app = Router::new() .merge(swagger_ui) .merge(base_routes) - .merge(info_routes) - .merge(aws_sagemaker_route); + .merge(info_routes); #[cfg(feature = "google")] { diff --git a/router/src/usage_stats.rs b/router/src/usage_stats.rs index 0282ac63492..e9d98327ef6 100644 --- a/router/src/usage_stats.rs +++ b/router/src/usage_stats.rs @@ -93,7 +93,6 @@ pub struct Args { // max_batch_size: Option, revision: Option, validation_workers: usize, - messages_api_enabled: bool, disable_grammar_support: bool, max_client_batch_size: usize, usage_stats_level: UsageStatsLevel, @@ -117,7 +116,6 @@ impl Args { // max_batch_size: Option, revision: Option, validation_workers: usize, - messages_api_enabled: bool, disable_grammar_support: bool, max_client_batch_size: usize, usage_stats_level: UsageStatsLevel, @@ -138,7 +136,6 @@ impl Args { // max_batch_size, revision, validation_workers, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats_level, diff --git a/router/src/vertex.rs b/router/src/vertex.rs index 03aed63c0ea..4cf5f5c73e4 100644 --- a/router/src/vertex.rs +++ b/router/src/vertex.rs @@ -1,9 +1,6 @@ use crate::infer::Infer; use crate::server::{generate_internal, ComputeType}; -use crate::{ - ChatRequest, ErrorResponse, GenerateParameters, GenerateRequest, GrammarType, Message, - StreamOptions, Tool, ToolChoice, -}; +use crate::{ChatRequest, ErrorResponse, GenerateParameters, GenerateRequest}; use axum::extract::Extension; use axum::http::{HeaderMap, StatusCode}; use axum::response::{IntoResponse, Response}; @@ -22,162 +19,12 @@ pub(crate) struct GenerateVertexInstance { pub parameters: Option, } -#[derive(Clone, Deserialize, ToSchema)] -#[cfg_attr(test, derive(Debug, PartialEq))] -pub(crate) struct VertexChat { - messages: Vec, - // Messages is ignored there. - #[serde(default)] - parameters: VertexParameters, -} - -#[derive(Clone, Deserialize, ToSchema, Serialize, Default)] -#[cfg_attr(test, derive(Debug, PartialEq))] -pub(crate) struct VertexParameters { - #[schema(example = "mistralai/Mistral-7B-Instruct-v0.2")] - /// [UNUSED] ID of the model to use. See the model endpoint compatibility table for details on which models work with the Chat API. - pub model: Option, - - /// Number between -2.0 and 2.0. 
Positive values penalize new tokens based on their existing frequency in the text so far, - /// decreasing the model's likelihood to repeat the same line verbatim. - #[serde(default)] - #[schema(example = "1.0")] - pub frequency_penalty: Option, - - /// UNUSED - /// Modify the likelihood of specified tokens appearing in the completion. Accepts a JSON object that maps tokens - /// (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically, - /// the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, - /// but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should - /// result in a ban or exclusive selection of the relevant token. - #[serde(default)] - pub logit_bias: Option>, - - /// Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each - /// output token returned in the content of message. - #[serde(default)] - #[schema(example = "false")] - pub logprobs: Option, - - /// An integer between 0 and 5 specifying the number of most likely tokens to return at each token position, each with - /// an associated log probability. logprobs must be set to true if this parameter is used. - #[serde(default)] - #[schema(example = "5")] - pub top_logprobs: Option, - - /// The maximum number of tokens that can be generated in the chat completion. - #[serde(default)] - #[schema(example = "32")] - pub max_tokens: Option, - - /// UNUSED - /// How many chat completion choices to generate for each input message. Note that you will be charged based on the - /// number of generated tokens across all of the choices. Keep n as 1 to minimize costs. - #[serde(default)] - #[schema(nullable = true, example = "2")] - pub n: Option, - - /// Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, - /// increasing the model's likelihood to talk about new topics - #[serde(default)] - #[schema(nullable = true, example = 0.1)] - pub presence_penalty: Option, - - /// Up to 4 sequences where the API will stop generating further tokens. - #[serde(default)] - #[schema(nullable = true, example = "null")] - pub stop: Option>, - - #[serde(default = "bool::default")] - pub stream: bool, - - #[schema(nullable = true, example = 42)] - pub seed: Option, - - /// What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while - /// lower values like 0.2 will make it more focused and deterministic. - /// - /// We generally recommend altering this or `top_p` but not both. - #[serde(default)] - #[schema(nullable = true, example = 1.0)] - pub temperature: Option, - - /// An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the - /// tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered. - #[serde(default)] - #[schema(nullable = true, example = 0.95)] - pub top_p: Option, - - /// A list of tools the model may call. Currently, only functions are supported as a tool. Use this to provide a list of - /// functions the model may generate JSON inputs for. 
- #[serde(default)] - #[schema(nullable = true, example = "null")] - pub tools: Option>, - - /// A prompt to be appended before the tools - #[serde(default)] - #[schema( - nullable = true, - example = "Given the functions available, please respond with a JSON for a function call with its proper arguments that best answers the given prompt. Respond in the format {name: function name, parameters: dictionary of argument name and its value}.Do not use variables." - )] - pub tool_prompt: Option, - - /// A specific tool to use. If not provided, the model will default to use any of the tools provided in the tools parameter. - #[serde(default)] - #[schema(nullable = true, example = "null")] - pub tool_choice: ToolChoice, - - /// Response format constraints for the generation. - /// - /// NOTE: A request can use `response_format` OR `tools` but not both. - #[serde(default)] - #[schema(nullable = true, default = "null", example = "null")] - pub response_format: Option, - - /// A guideline to be used in the chat_template - #[serde(default)] - #[schema(nullable = true, default = "null", example = "null")] - pub guideline: Option, - - /// Options for streaming response. Only set this when you set stream: true. - #[serde(default)] - #[schema(nullable = true, example = "null")] - pub stream_options: Option, -} - -impl From for ChatRequest { - fn from(val: VertexChat) -> Self { - Self { - messages: val.messages, - frequency_penalty: val.parameters.frequency_penalty, - guideline: val.parameters.guideline, - logit_bias: val.parameters.logit_bias, - logprobs: val.parameters.logprobs, - max_tokens: val.parameters.max_tokens, - model: val.parameters.model, - n: val.parameters.n, - presence_penalty: val.parameters.presence_penalty, - response_format: val.parameters.response_format, - seed: val.parameters.seed, - stop: val.parameters.stop, - stream_options: val.parameters.stream_options, - stream: val.parameters.stream, - temperature: val.parameters.temperature, - tool_choice: val.parameters.tool_choice, - tool_prompt: val.parameters.tool_prompt, - tools: val.parameters.tools, - top_logprobs: val.parameters.top_logprobs, - top_p: val.parameters.top_p, - } - } -} - #[derive(Clone, Deserialize, ToSchema)] #[cfg_attr(test, derive(Debug, PartialEq))] #[serde(untagged)] pub(crate) enum VertexInstance { Generate(GenerateVertexInstance), - Chat(VertexChat), + Chat(ChatRequest), } #[derive(Deserialize, ToSchema)] @@ -263,9 +110,8 @@ pub(crate) async fn vertex_compatibility( }, }, VertexInstance::Chat(instance) => { - let chat_request: ChatRequest = instance.into(); let (generate_request, _using_tools): (GenerateRequest, bool) = - chat_request.try_into_generate(&infer)?; + instance.try_into_generate(&infer)?; generate_request } }; @@ -311,34 +157,14 @@ mod tests { #[test] fn vertex_deserialization() { - let string = serde_json::json!({ - - "messages": [{"role": "user", "content": "What's Deep Learning?"}], - "parameters": { - "max_tokens": 128, - "top_p": 0.95, - "temperature": 0.7 - } - }); - - let _request: VertexChat = serde_json::from_value(string).expect("Can deserialize"); - - let string = serde_json::json!({ - "messages": [{"role": "user", "content": "What's Deep Learning?"}], - }); - - let _request: VertexChat = serde_json::from_value(string).expect("Can deserialize"); - let string = serde_json::json!({ "instances": [ { "messages": [{"role": "user", "content": "What's Deep Learning?"}], - "parameters": { - "max_tokens": 128, - "top_p": 0.95, - "temperature": 0.7 - } + "max_tokens": 128, + "top_p": 0.95, + 
"temperature": 0.7 } ] @@ -347,18 +173,16 @@ mod tests { assert_eq!( request, VertexRequest { - instances: vec![VertexInstance::Chat(VertexChat { + instances: vec![VertexInstance::Chat(ChatRequest { messages: vec![Message { role: "user".to_string(), content: MessageContent::SingleText("What's Deep Learning?".to_string()), name: None, },], - parameters: VertexParameters { - max_tokens: Some(128), - top_p: Some(0.95), - temperature: Some(0.7), - ..Default::default() - } + max_tokens: Some(128), + top_p: Some(0.95), + temperature: Some(0.7), + ..Default::default() })] } ); diff --git a/server/Makefile b/server/Makefile index 9338b299090..18424dd6d7e 100644 --- a/server/Makefile +++ b/server/Makefile @@ -31,7 +31,7 @@ install: install-cuda echo "Installed server" install-cuda: install-server install-flash-attention-v2-cuda install-vllm-cuda install-flash-attention install-fbgemm - pip install -e ".[bnb]" + pip install -e ".[bnb,marlin,moe]" pip install nvidia-nccl-cu12==2.22.3 install-rocm: install-server install-flash-attention-v2-rocm install-vllm-rocm diff --git a/server/poetry.lock b/server/poetry.lock index 08f74999f04..1293e883656 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. [[package]] name = "accelerate" @@ -32,113 +32,113 @@ testing = ["bitsandbytes", "datasets", "deepspeed", "evaluate", "parameterized", [[package]] name = "aiohappyeyeballs" -version = "2.4.0" +version = "2.4.3" description = "Happy Eyeballs for asyncio" optional = true python-versions = ">=3.8" files = [ - {file = "aiohappyeyeballs-2.4.0-py3-none-any.whl", hash = "sha256:7ce92076e249169a13c2f49320d1967425eaf1f407522d707d59cac7628d62bd"}, - {file = "aiohappyeyeballs-2.4.0.tar.gz", hash = "sha256:55a1714f084e63d49639800f95716da97a1f173d46a16dfcfda0016abb93b6b2"}, + {file = "aiohappyeyeballs-2.4.3-py3-none-any.whl", hash = "sha256:8a7a83727b2756f394ab2895ea0765a0a8c475e3c71e98d43d76f22b4b435572"}, + {file = "aiohappyeyeballs-2.4.3.tar.gz", hash = "sha256:75cf88a15106a5002a8eb1dab212525c00d1f4c0fa96e551c9fbe6f09a621586"}, ] [[package]] name = "aiohttp" -version = "3.10.6" +version = "3.10.10" description = "Async http client/server framework (asyncio)" optional = true python-versions = ">=3.8" files = [ - {file = "aiohttp-3.10.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:682836fc672972cc3101cc9e30d49c5f7e8f1d010478d46119fe725a4545acfd"}, - {file = "aiohttp-3.10.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:289fa8a20018d0d5aa9e4b35d899bd51bcb80f0d5f365d9a23e30dac3b79159b"}, - {file = "aiohttp-3.10.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8617c96a20dd57e7e9d398ff9d04f3d11c4d28b1767273a5b1a018ada5a654d3"}, - {file = "aiohttp-3.10.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdbeff1b062751c2a2a55b171f7050fb7073633c699299d042e962aacdbe1a07"}, - {file = "aiohttp-3.10.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7ea35d849cdd4a9268f910bff4497baebbc1aa3f2f625fd8ccd9ac99c860c621"}, - {file = "aiohttp-3.10.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:473961b3252f3b949bb84873d6e268fb6d8aa0ccc6eb7404fa58c76a326bb8e1"}, - {file = "aiohttp-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d2665c5df629eb2f981dab244c01bfa6cdc185f4ffa026639286c4d56fafb54"}, - {file 
= "aiohttp-3.10.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:25d92f794f1332f656e3765841fc2b7ad5c26c3f3d01e8949eeb3495691cf9f4"}, - {file = "aiohttp-3.10.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9bd6b2033993d5ae80883bb29b83fb2b432270bbe067c2f53cc73bb57c46065f"}, - {file = "aiohttp-3.10.6-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:d7f408c43f5e75ea1edc152fb375e8f46ef916f545fb66d4aebcbcfad05e2796"}, - {file = "aiohttp-3.10.6-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:cf8b8560aa965f87bf9c13bf9fed7025993a155ca0ce8422da74bf46d18c2f5f"}, - {file = "aiohttp-3.10.6-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14477c4e52e2f17437b99893fd220ffe7d7ee41df5ebf931a92b8ca82e6fd094"}, - {file = "aiohttp-3.10.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:fb138fbf9f53928e779650f5ed26d0ea1ed8b2cab67f0ea5d63afa09fdc07593"}, - {file = "aiohttp-3.10.6-cp310-cp310-win32.whl", hash = "sha256:9843d683b8756971797be171ead21511d2215a2d6e3c899c6e3107fbbe826791"}, - {file = "aiohttp-3.10.6-cp310-cp310-win_amd64.whl", hash = "sha256:f8b8e49fe02f744d38352daca1dbef462c3874900bd8166516f6ea8e82b5aacf"}, - {file = "aiohttp-3.10.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f52e54fd776ad0da1006708762213b079b154644db54bcfc62f06eaa5b896402"}, - {file = "aiohttp-3.10.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:995ab1a238fd0d19dc65f2d222e5eb064e409665c6426a3e51d5101c1979ee84"}, - {file = "aiohttp-3.10.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0749c4d5a08a802dd66ecdf59b2df4d76b900004017468a7bb736c3b5a3dd902"}, - {file = "aiohttp-3.10.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e05b39158f2af0e2438cc2075cfc271f4ace0c3cc4a81ec95b27a0432e161951"}, - {file = "aiohttp-3.10.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a9f196c970db2dcde4f24317e06615363349dc357cf4d7a3b0716c20ac6d7bcd"}, - {file = "aiohttp-3.10.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:47647c8af04a70e07a2462931b0eba63146a13affa697afb4ecbab9d03a480ce"}, - {file = "aiohttp-3.10.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:669c0efe7e99f6d94d63274c06344bd0e9c8daf184ce5602a29bc39e00a18720"}, - {file = "aiohttp-3.10.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9721cdd83a994225352ca84cd537760d41a9da3c0eacb3ff534747ab8fba6d0"}, - {file = "aiohttp-3.10.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0b82c8ebed66ce182893e7c0b6b60ba2ace45b1df104feb52380edae266a4850"}, - {file = "aiohttp-3.10.6-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:b169f8e755e541b72e714b89a831b315bbe70db44e33fead28516c9e13d5f931"}, - {file = "aiohttp-3.10.6-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:0be3115753baf8b4153e64f9aa7bf6c0c64af57979aa900c31f496301b374570"}, - {file = "aiohttp-3.10.6-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e1f80cd17d81a404b6e70ef22bfe1870bafc511728397634ad5f5efc8698df56"}, - {file = "aiohttp-3.10.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6419728b08fb6380c66a470d2319cafcec554c81780e2114b7e150329b9a9a7f"}, - {file = "aiohttp-3.10.6-cp311-cp311-win32.whl", hash = "sha256:bd294dcdc1afdc510bb51d35444003f14e327572877d016d576ac3b9a5888a27"}, - {file = "aiohttp-3.10.6-cp311-cp311-win_amd64.whl", hash = "sha256:bf861da9a43d282d6dd9dcd64c23a0fccf2c5aa5cd7c32024513c8c79fb69de3"}, - {file = 
"aiohttp-3.10.6-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:2708baccdc62f4b1251e59c2aac725936a900081f079b88843dabcab0feeeb27"}, - {file = "aiohttp-3.10.6-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:7475da7a5e2ccf1a1c86c8fee241e277f4874c96564d06f726d8df8e77683ef7"}, - {file = "aiohttp-3.10.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:02108326574ff60267b7b35b17ac5c0bbd0008ccb942ce4c48b657bb90f0b8aa"}, - {file = "aiohttp-3.10.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:029a019627b37fa9eac5c75cc54a6bb722c4ebbf5a54d8c8c0fb4dd8facf2702"}, - {file = "aiohttp-3.10.6-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8a637d387db6fdad95e293fab5433b775fd104ae6348d2388beaaa60d08b38c4"}, - {file = "aiohttp-3.10.6-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dc1a16f3fc1944c61290d33c88dc3f09ba62d159b284c38c5331868425aca426"}, - {file = "aiohttp-3.10.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81b292f37969f9cc54f4643f0be7dacabf3612b3b4a65413661cf6c350226787"}, - {file = "aiohttp-3.10.6-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0754690a3a26e819173a34093798c155bafb21c3c640bff13be1afa1e9d421f9"}, - {file = "aiohttp-3.10.6-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:164ecd32e65467d86843dbb121a6666c3deb23b460e3f8aefdcaacae79eb718a"}, - {file = "aiohttp-3.10.6-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:438c5863feb761f7ca3270d48c292c334814459f61cc12bab5ba5b702d7c9e56"}, - {file = "aiohttp-3.10.6-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:ba18573bb1de1063d222f41de64a0d3741223982dcea863b3f74646faf618ec7"}, - {file = "aiohttp-3.10.6-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:c82a94ddec996413a905f622f3da02c4359952aab8d817c01cf9915419525e95"}, - {file = "aiohttp-3.10.6-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:92351aa5363fc3c1f872ca763f86730ced32b01607f0c9662b1fa711087968d0"}, - {file = "aiohttp-3.10.6-cp312-cp312-win32.whl", hash = "sha256:3e15e33bfc73fa97c228f72e05e8795e163a693fd5323549f49367c76a6e5883"}, - {file = "aiohttp-3.10.6-cp312-cp312-win_amd64.whl", hash = "sha256:fe517113fe4d35d9072b826c3e147d63c5f808ca8167d450b4f96c520c8a1d8d"}, - {file = "aiohttp-3.10.6-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:482f74057ea13d387a7549d7a7ecb60e45146d15f3e58a2d93a0ad2d5a8457cd"}, - {file = "aiohttp-3.10.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:03fa40d1450ee5196e843315ddf74a51afc7e83d489dbfc380eecefea74158b1"}, - {file = "aiohttp-3.10.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1e52e59ed5f4cc3a3acfe2a610f8891f216f486de54d95d6600a2c9ba1581f4d"}, - {file = "aiohttp-3.10.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2b3935a22c9e41a8000d90588bed96cf395ef572dbb409be44c6219c61d900d"}, - {file = "aiohttp-3.10.6-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4bef1480ee50f75abcfcb4b11c12de1005968ca9d0172aec4a5057ba9f2b644f"}, - {file = "aiohttp-3.10.6-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:671745ea7db19693ce867359d503772177f0b20fa8f6ee1e74e00449f4c4151d"}, - {file = "aiohttp-3.10.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b50b367308ca8c12e0b50cba5773bc9abe64c428d3fd2bbf5cd25aab37c77bf"}, - {file = 
"aiohttp-3.10.6-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a504d7cdb431a777d05a124fd0b21efb94498efa743103ea01b1e3136d2e4fb"}, - {file = "aiohttp-3.10.6-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:66bc81361131763660b969132a22edce2c4d184978ba39614e8f8f95db5c95f8"}, - {file = "aiohttp-3.10.6-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:27cf19a38506e2e9f12fc17e55f118f04897b0a78537055d93a9de4bf3022e3d"}, - {file = "aiohttp-3.10.6-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3468b39f977a11271517c6925b226720e148311039a380cc9117b1e2258a721f"}, - {file = "aiohttp-3.10.6-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:9d26da22a793dfd424be1050712a70c0afd96345245c29aced1e35dbace03413"}, - {file = "aiohttp-3.10.6-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:844d48ff9173d0b941abed8b2ea6a412f82b56d9ab1edb918c74000c15839362"}, - {file = "aiohttp-3.10.6-cp313-cp313-win32.whl", hash = "sha256:2dd56e3c43660ed3bea67fd4c5025f1ac1f9ecf6f0b991a6e5efe2e678c490c5"}, - {file = "aiohttp-3.10.6-cp313-cp313-win_amd64.whl", hash = "sha256:c91781d969fbced1993537f45efe1213bd6fccb4b37bfae2a026e20d6fbed206"}, - {file = "aiohttp-3.10.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:4407a80bca3e694f2d2a523058e20e1f9f98a416619e04f6dc09dc910352ac8b"}, - {file = "aiohttp-3.10.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1cb045ec5961f51af3e2c08cd6fe523f07cc6e345033adee711c49b7b91bb954"}, - {file = "aiohttp-3.10.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4fabdcdc781a36b8fd7b2ca9dea8172f29a99e11d00ca0f83ffeb50958da84a1"}, - {file = "aiohttp-3.10.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79a9f42efcc2681790595ab3d03c0e52d01edc23a0973ea09f0dc8d295e12b8e"}, - {file = "aiohttp-3.10.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cca776a440795db437d82c07455761c85bbcf3956221c3c23b8c93176c278ce7"}, - {file = "aiohttp-3.10.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5582de171f0898139cf51dd9fcdc79b848e28d9abd68e837f0803fc9f30807b1"}, - {file = "aiohttp-3.10.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:370e2d47575c53c817ee42a18acc34aad8da4dbdaac0a6c836d58878955f1477"}, - {file = "aiohttp-3.10.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:444d1704e2af6b30766debed9be8a795958029e552fe77551355badb1944012c"}, - {file = "aiohttp-3.10.6-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:40271a2a375812967401c9ca8077de9368e09a43a964f4dce0ff603301ec9358"}, - {file = "aiohttp-3.10.6-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:f3af26f86863fad12e25395805bb0babbd49d512806af91ec9708a272b696248"}, - {file = "aiohttp-3.10.6-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4752df44df48fd42b80f51d6a97553b482cda1274d9dc5df214a3a1aa5d8f018"}, - {file = "aiohttp-3.10.6-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:2cd5290ab66cfca2f90045db2cc6434c1f4f9fbf97c9f1c316e785033782e7d2"}, - {file = "aiohttp-3.10.6-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:3427031064b0d5c95647e6369c4aa3c556402f324a3e18107cb09517abe5f962"}, - {file = "aiohttp-3.10.6-cp38-cp38-win32.whl", hash = "sha256:614fc21e86adc28e4165a6391f851a6da6e9cbd7bb232d0df7718b453a89ee98"}, - {file = "aiohttp-3.10.6-cp38-cp38-win_amd64.whl", hash = "sha256:58c5d7318a136a3874c78717dd6de57519bc64f6363c5827c2b1cb775bea71dd"}, - {file = 
"aiohttp-3.10.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5db26bbca8e7968c4c977a0c640e0b9ce7224e1f4dcafa57870dc6ee28e27de6"}, - {file = "aiohttp-3.10.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3fb4216e3ec0dbc01db5ba802f02ed78ad8f07121be54eb9e918448cc3f61b7c"}, - {file = "aiohttp-3.10.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a976ef488f26e224079deb3d424f29144c6d5ba4ded313198169a8af8f47fb82"}, - {file = "aiohttp-3.10.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a86610174de8a85a920e956e2d4f9945e7da89f29a00e95ac62a4a414c4ef4e"}, - {file = "aiohttp-3.10.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:217791c6a399cc4f2e6577bb44344cba1f5714a2aebf6a0bea04cfa956658284"}, - {file = "aiohttp-3.10.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ba3662d41abe2eab0eeec7ee56f33ef4e0b34858f38abf24377687f9e1fb00a5"}, - {file = "aiohttp-3.10.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4dfa5ad4bce9ca30a76117fbaa1c1decf41ebb6c18a4e098df44298941566f9"}, - {file = "aiohttp-3.10.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e0009258e97502936d3bd5bf2ced15769629097d0abb81e6495fba1047824fe0"}, - {file = "aiohttp-3.10.6-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0a75d5c9fb4f06c41d029ae70ad943c3a844c40c0a769d12be4b99b04f473d3d"}, - {file = "aiohttp-3.10.6-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:8198b7c002aae2b40b2d16bfe724b9a90bcbc9b78b2566fc96131ef4e382574d"}, - {file = "aiohttp-3.10.6-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:4611db8c907f90fe86be112efdc2398cd7b4c8eeded5a4f0314b70fdea8feab0"}, - {file = "aiohttp-3.10.6-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ff99ae06eef85c7a565854826114ced72765832ee16c7e3e766c5e4c5b98d20e"}, - {file = "aiohttp-3.10.6-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7641920bdcc7cd2d3ddfb8bb9133a6c9536b09dbd49490b79e125180b2d25b93"}, - {file = "aiohttp-3.10.6-cp39-cp39-win32.whl", hash = "sha256:e2e7d5591ea868d5ec82b90bbeb366a198715672841d46281b623e23079593db"}, - {file = "aiohttp-3.10.6-cp39-cp39-win_amd64.whl", hash = "sha256:b504c08c45623bf5c7ca41be380156d925f00199b3970efd758aef4a77645feb"}, - {file = "aiohttp-3.10.6.tar.gz", hash = "sha256:d2578ef941be0c2ba58f6f421a703527d08427237ed45ecb091fed6f83305336"}, + {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:be7443669ae9c016b71f402e43208e13ddf00912f47f623ee5994e12fc7d4b3f"}, + {file = "aiohttp-3.10.10-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7b06b7843929e41a94ea09eb1ce3927865387e3e23ebe108e0d0d09b08d25be9"}, + {file = "aiohttp-3.10.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:333cf6cf8e65f6a1e06e9eb3e643a0c515bb850d470902274239fea02033e9a8"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:274cfa632350225ce3fdeb318c23b4a10ec25c0e2c880eff951a3842cf358ac1"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d9e5e4a85bdb56d224f412d9c98ae4cbd032cc4f3161818f692cd81766eee65a"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b606353da03edcc71130b52388d25f9a30a126e04caef1fd637e31683033abd"}, + {file = "aiohttp-3.10.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ab5a5a0c7a7991d90446a198689c0535be89bbd6b410a1f9a66688f0880ec026"}, + {file = 
"aiohttp-3.10.10-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:578a4b875af3e0daaf1ac6fa983d93e0bbfec3ead753b6d6f33d467100cdc67b"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:8105fd8a890df77b76dd3054cddf01a879fc13e8af576805d667e0fa0224c35d"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3bcd391d083f636c06a68715e69467963d1f9600f85ef556ea82e9ef25f043f7"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fbc6264158392bad9df19537e872d476f7c57adf718944cc1e4495cbabf38e2a"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e48d5021a84d341bcaf95c8460b152cfbad770d28e5fe14a768988c461b821bc"}, + {file = "aiohttp-3.10.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2609e9ab08474702cc67b7702dbb8a80e392c54613ebe80db7e8dbdb79837c68"}, + {file = "aiohttp-3.10.10-cp310-cp310-win32.whl", hash = "sha256:84afcdea18eda514c25bc68b9af2a2b1adea7c08899175a51fe7c4fb6d551257"}, + {file = "aiohttp-3.10.10-cp310-cp310-win_amd64.whl", hash = "sha256:9c72109213eb9d3874f7ac8c0c5fa90e072d678e117d9061c06e30c85b4cf0e6"}, + {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c30a0eafc89d28e7f959281b58198a9fa5e99405f716c0289b7892ca345fe45f"}, + {file = "aiohttp-3.10.10-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:258c5dd01afc10015866114e210fb7365f0d02d9d059c3c3415382ab633fcbcb"}, + {file = "aiohttp-3.10.10-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:15ecd889a709b0080f02721255b3f80bb261c2293d3c748151274dfea93ac871"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3935f82f6f4a3820270842e90456ebad3af15810cf65932bd24da4463bc0a4c"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:413251f6fcf552a33c981c4709a6bba37b12710982fec8e558ae944bfb2abd38"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1720b4f14c78a3089562b8875b53e36b51c97c51adc53325a69b79b4b48ebcb"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:679abe5d3858b33c2cf74faec299fda60ea9de62916e8b67e625d65bf069a3b7"}, + {file = "aiohttp-3.10.10-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:79019094f87c9fb44f8d769e41dbb664d6e8fcfd62f665ccce36762deaa0e911"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fe2fb38c2ed905a2582948e2de560675e9dfbee94c6d5ccdb1301c6d0a5bf092"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:a3f00003de6eba42d6e94fabb4125600d6e484846dbf90ea8e48a800430cc142"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1bbb122c557a16fafc10354b9d99ebf2f2808a660d78202f10ba9d50786384b9"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:30ca7c3b94708a9d7ae76ff281b2f47d8eaf2579cd05971b5dc681db8caac6e1"}, + {file = "aiohttp-3.10.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:df9270660711670e68803107d55c2b5949c2e0f2e4896da176e1ecfc068b974a"}, + {file = "aiohttp-3.10.10-cp311-cp311-win32.whl", hash = "sha256:aafc8ee9b742ce75044ae9a4d3e60e3d918d15a4c2e08a6c3c3e38fa59b92d94"}, + {file = "aiohttp-3.10.10-cp311-cp311-win_amd64.whl", hash = 
"sha256:362f641f9071e5f3ee6f8e7d37d5ed0d95aae656adf4ef578313ee585b585959"}, + {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9294bbb581f92770e6ed5c19559e1e99255e4ca604a22c5c6397b2f9dd3ee42c"}, + {file = "aiohttp-3.10.10-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a8fa23fe62c436ccf23ff930149c047f060c7126eae3ccea005f0483f27b2e28"}, + {file = "aiohttp-3.10.10-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5c6a5b8c7926ba5d8545c7dd22961a107526562da31a7a32fa2456baf040939f"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:007ec22fbc573e5eb2fb7dec4198ef8f6bf2fe4ce20020798b2eb5d0abda6138"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9627cc1a10c8c409b5822a92d57a77f383b554463d1884008e051c32ab1b3742"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:50edbcad60d8f0e3eccc68da67f37268b5144ecc34d59f27a02f9611c1d4eec7"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a45d85cf20b5e0d0aa5a8dca27cce8eddef3292bc29d72dcad1641f4ed50aa16"}, + {file = "aiohttp-3.10.10-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b00807e2605f16e1e198f33a53ce3c4523114059b0c09c337209ae55e3823a8"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f2d4324a98062be0525d16f768a03e0bbb3b9fe301ceee99611dc9a7953124e6"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:438cd072f75bb6612f2aca29f8bd7cdf6e35e8f160bc312e49fbecab77c99e3a"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:baa42524a82f75303f714108fea528ccacf0386af429b69fff141ffef1c534f9"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a7d8d14fe962153fc681f6366bdec33d4356f98a3e3567782aac1b6e0e40109a"}, + {file = "aiohttp-3.10.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c1277cd707c465cd09572a774559a3cc7c7a28802eb3a2a9472588f062097205"}, + {file = "aiohttp-3.10.10-cp312-cp312-win32.whl", hash = "sha256:59bb3c54aa420521dc4ce3cc2c3fe2ad82adf7b09403fa1f48ae45c0cbde6628"}, + {file = "aiohttp-3.10.10-cp312-cp312-win_amd64.whl", hash = "sha256:0e1b370d8007c4ae31ee6db7f9a2fe801a42b146cec80a86766e7ad5c4a259cf"}, + {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ad7593bb24b2ab09e65e8a1d385606f0f47c65b5a2ae6c551db67d6653e78c28"}, + {file = "aiohttp-3.10.10-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1eb89d3d29adaf533588f209768a9c02e44e4baf832b08118749c5fad191781d"}, + {file = "aiohttp-3.10.10-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3fe407bf93533a6fa82dece0e74dbcaaf5d684e5a51862887f9eaebe6372cd79"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50aed5155f819873d23520919e16703fc8925e509abbb1a1491b0087d1cd969e"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f05e9727ce409358baa615dbeb9b969db94324a79b5a5cea45d39bdb01d82e6"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3dffb610a30d643983aeb185ce134f97f290f8935f0abccdd32c77bed9388b42"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:aa6658732517ddabe22c9036479eabce6036655ba87a0224c612e1ae6af2087e"}, + {file = "aiohttp-3.10.10-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:741a46d58677d8c733175d7e5aa618d277cd9d880301a380fd296975a9cdd7bc"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:e00e3505cd80440f6c98c6d69269dcc2a119f86ad0a9fd70bccc59504bebd68a"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ffe595f10566f8276b76dc3a11ae4bb7eba1aac8ddd75811736a15b0d5311414"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdfcf6443637c148c4e1a20c48c566aa694fa5e288d34b20fcdc58507882fed3"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d183cf9c797a5291e8301790ed6d053480ed94070637bfaad914dd38b0981f67"}, + {file = "aiohttp-3.10.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:77abf6665ae54000b98b3c742bc6ea1d1fb31c394bcabf8b5d2c1ac3ebfe7f3b"}, + {file = "aiohttp-3.10.10-cp313-cp313-win32.whl", hash = "sha256:4470c73c12cd9109db8277287d11f9dd98f77fc54155fc71a7738a83ffcc8ea8"}, + {file = "aiohttp-3.10.10-cp313-cp313-win_amd64.whl", hash = "sha256:486f7aabfa292719a2753c016cc3a8f8172965cabb3ea2e7f7436c7f5a22a151"}, + {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:1b66ccafef7336a1e1f0e389901f60c1d920102315a56df85e49552308fc0486"}, + {file = "aiohttp-3.10.10-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:acd48d5b80ee80f9432a165c0ac8cbf9253eaddb6113269a5e18699b33958dbb"}, + {file = "aiohttp-3.10.10-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3455522392fb15ff549d92fbf4b73b559d5e43dc522588f7eb3e54c3f38beee7"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45c3b868724137f713a38376fef8120c166d1eadd50da1855c112fe97954aed8"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:da1dee8948d2137bb51fbb8a53cce6b1bcc86003c6b42565f008438b806cccd8"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c5ce2ce7c997e1971b7184ee37deb6ea9922ef5163c6ee5aa3c274b05f9e12fa"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28529e08fde6f12eba8677f5a8608500ed33c086f974de68cc65ab218713a59d"}, + {file = "aiohttp-3.10.10-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f7db54c7914cc99d901d93a34704833568d86c20925b2762f9fa779f9cd2e70f"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:03a42ac7895406220124c88911ebee31ba8b2d24c98507f4a8bf826b2937c7f2"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:7e338c0523d024fad378b376a79faff37fafb3c001872a618cde1d322400a572"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:038f514fe39e235e9fef6717fbf944057bfa24f9b3db9ee551a7ecf584b5b480"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:64f6c17757251e2b8d885d728b6433d9d970573586a78b78ba8929b0f41d045a"}, + {file = "aiohttp-3.10.10-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:93429602396f3383a797a2a70e5f1de5df8e35535d7806c9f91df06f297e109b"}, + {file = "aiohttp-3.10.10-cp38-cp38-win32.whl", hash = "sha256:c823bc3971c44ab93e611ab1a46b1eafeae474c0c844aff4b7474287b75fe49c"}, + {file = "aiohttp-3.10.10-cp38-cp38-win_amd64.whl", hash = 
"sha256:54ca74df1be3c7ca1cf7f4c971c79c2daf48d9aa65dea1a662ae18926f5bc8ce"}, + {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:01948b1d570f83ee7bbf5a60ea2375a89dfb09fd419170e7f5af029510033d24"}, + {file = "aiohttp-3.10.10-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9fc1500fd2a952c5c8e3b29aaf7e3cc6e27e9cfc0a8819b3bce48cc1b849e4cc"}, + {file = "aiohttp-3.10.10-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f614ab0c76397661b90b6851a030004dac502e48260ea10f2441abd2207fbcc7"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:00819de9e45d42584bed046314c40ea7e9aea95411b38971082cad449392b08c"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05646ebe6b94cc93407b3bf34b9eb26c20722384d068eb7339de802154d61bc5"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:998f3bd3cfc95e9424a6acd7840cbdd39e45bc09ef87533c006f94ac47296090"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9010c31cd6fa59438da4e58a7f19e4753f7f264300cd152e7f90d4602449762"}, + {file = "aiohttp-3.10.10-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ea7ffc6d6d6f8a11e6f40091a1040995cdff02cfc9ba4c2f30a516cb2633554"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:ef9c33cc5cbca35808f6c74be11eb7f5f6b14d2311be84a15b594bd3e58b5527"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ce0cdc074d540265bfeb31336e678b4e37316849d13b308607efa527e981f5c2"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:597a079284b7ee65ee102bc3a6ea226a37d2b96d0418cc9047490f231dc09fe8"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:7789050d9e5d0c309c706953e5e8876e38662d57d45f936902e176d19f1c58ab"}, + {file = "aiohttp-3.10.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:e7f8b04d83483577fd9200461b057c9f14ced334dcb053090cea1da9c8321a91"}, + {file = "aiohttp-3.10.10-cp39-cp39-win32.whl", hash = "sha256:c02a30b904282777d872266b87b20ed8cc0d1501855e27f831320f471d54d983"}, + {file = "aiohttp-3.10.10-cp39-cp39-win_amd64.whl", hash = "sha256:edfe3341033a6b53a5c522c802deb2079eee5cbfbb0af032a55064bd65c73a23"}, + {file = "aiohttp-3.10.10.tar.gz", hash = "sha256:0631dd7c9f0822cc61c88586ca76d5b5ada26538097d0f1df510b082bad3411a"}, ] [package.dependencies] @@ -240,101 +240,116 @@ files = [ [[package]] name = "charset-normalizer" -version = "3.3.2" +version = "3.4.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
optional = false python-versions = ">=3.7.0" files = [ - {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, - {file = 
"charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, - {file = 
"charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = 
"sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, - {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"}, + {file = 
"charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = 
"sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = "sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"}, + {file = 
"charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"}, + {file = "charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"}, + {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, ] [[package]] @@ -353,13 +368,13 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} [[package]] name = "cloudpickle" -version = "3.0.0" +version = "3.1.0" description = "Pickler class to extend the standard pickle.Pickler functionality" optional = true python-versions = ">=3.8" files = [ - {file = "cloudpickle-3.0.0-py3-none-any.whl", hash = "sha256:246ee7d0c295602a036e86369c77fecda4ab17b506496730f2f576d9016fd9c7"}, - {file = "cloudpickle-3.0.0.tar.gz", hash = "sha256:996d9a482c6fb4f33c1a35335cf8afd065d2a56e973270364840712d9131a882"}, + {file = "cloudpickle-3.1.0-py3-none-any.whl", hash = "sha256:fe11acda67f61aaaec473e3afe030feb131d78a43461b718185363384f1ba12e"}, + {file = "cloudpickle-3.1.0.tar.gz", hash = "sha256:81a929b6e3c7335c863c771d673d105f02efdb89dfaba0c90495d1c64796601b"}, ] [[package]] @@ -665,61 +680,70 @@ testing = ["protobuf (>=4.21.9)"] [[package]] name = "grpcio" -version = "1.66.1" +version = "1.67.0" description = "HTTP/2-based RPC framework" optional = false python-versions = ">=3.8" files = [ - {file = "grpcio-1.66.1-cp310-cp310-linux_armv7l.whl", hash = "sha256:4877ba180591acdf127afe21ec1c7ff8a5ecf0fe2600f0d3c50e8c4a1cbc6492"}, - {file = "grpcio-1.66.1-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:3750c5a00bd644c75f4507f77a804d0189d97a107eb1481945a0cf3af3e7a5ac"}, - {file = "grpcio-1.66.1-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:a013c5fbb12bfb5f927444b477a26f1080755a931d5d362e6a9a720ca7dbae60"}, - {file = "grpcio-1.66.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b1b24c23d51a1e8790b25514157d43f0a4dce1ac12b3f0b8e9f66a5e2c4c132f"}, - {file = "grpcio-1.66.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7ffb8ea674d68de4cac6f57d2498fef477cef582f1fa849e9f844863af50083"}, - {file = "grpcio-1.66.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:307b1d538140f19ccbd3aed7a93d8f71103c5d525f3c96f8616111614b14bf2a"}, - {file = "grpcio-1.66.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1c17ebcec157cfb8dd445890a03e20caf6209a5bd4ac5b040ae9dbc59eef091d"}, - {file = "grpcio-1.66.1-cp310-cp310-win32.whl", hash = "sha256:ef82d361ed5849d34cf09105d00b94b6728d289d6b9235513cb2fcc79f7c432c"}, - {file = "grpcio-1.66.1-cp310-cp310-win_amd64.whl", hash = "sha256:292a846b92cdcd40ecca46e694997dd6b9be6c4c01a94a0dfb3fcb75d20da858"}, - {file = "grpcio-1.66.1-cp311-cp311-linux_armv7l.whl", hash = "sha256:c30aeceeaff11cd5ddbc348f37c58bcb96da8d5aa93fed78ab329de5f37a0d7a"}, - {file = "grpcio-1.66.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:8a1e224ce6f740dbb6b24c58f885422deebd7eb724aff0671a847f8951857c26"}, - {file = "grpcio-1.66.1-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:a66fe4dc35d2330c185cfbb42959f57ad36f257e0cc4557d11d9f0a3f14311df"}, - {file = "grpcio-1.66.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3ba04659e4fce609de2658fe4dbf7d6ed21987a94460f5f92df7579fd5d0e22"}, - {file = 
"grpcio-1.66.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4573608e23f7e091acfbe3e84ac2045680b69751d8d67685ffa193a4429fedb1"}, - {file = "grpcio-1.66.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7e06aa1f764ec8265b19d8f00140b8c4b6ca179a6dc67aa9413867c47e1fb04e"}, - {file = "grpcio-1.66.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3885f037eb11f1cacc41f207b705f38a44b69478086f40608959bf5ad85826dd"}, - {file = "grpcio-1.66.1-cp311-cp311-win32.whl", hash = "sha256:97ae7edd3f3f91480e48ede5d3e7d431ad6005bfdbd65c1b56913799ec79e791"}, - {file = "grpcio-1.66.1-cp311-cp311-win_amd64.whl", hash = "sha256:cfd349de4158d797db2bd82d2020554a121674e98fbe6b15328456b3bf2495bb"}, - {file = "grpcio-1.66.1-cp312-cp312-linux_armv7l.whl", hash = "sha256:a92c4f58c01c77205df6ff999faa008540475c39b835277fb8883b11cada127a"}, - {file = "grpcio-1.66.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:fdb14bad0835914f325349ed34a51940bc2ad965142eb3090081593c6e347be9"}, - {file = "grpcio-1.66.1-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:f03a5884c56256e08fd9e262e11b5cfacf1af96e2ce78dc095d2c41ccae2c80d"}, - {file = "grpcio-1.66.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2ca2559692d8e7e245d456877a85ee41525f3ed425aa97eb7a70fc9a79df91a0"}, - {file = "grpcio-1.66.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84ca1be089fb4446490dd1135828bd42a7c7f8421e74fa581611f7afdf7ab761"}, - {file = "grpcio-1.66.1-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:d639c939ad7c440c7b2819a28d559179a4508783f7e5b991166f8d7a34b52815"}, - {file = "grpcio-1.66.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:b9feb4e5ec8dc2d15709f4d5fc367794d69277f5d680baf1910fc9915c633524"}, - {file = "grpcio-1.66.1-cp312-cp312-win32.whl", hash = "sha256:7101db1bd4cd9b880294dec41a93fcdce465bdbb602cd8dc5bd2d6362b618759"}, - {file = "grpcio-1.66.1-cp312-cp312-win_amd64.whl", hash = "sha256:b0aa03d240b5539648d996cc60438f128c7f46050989e35b25f5c18286c86734"}, - {file = "grpcio-1.66.1-cp38-cp38-linux_armv7l.whl", hash = "sha256:ecfe735e7a59e5a98208447293ff8580e9db1e890e232b8b292dc8bd15afc0d2"}, - {file = "grpcio-1.66.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:4825a3aa5648010842e1c9d35a082187746aa0cdbf1b7a2a930595a94fb10fce"}, - {file = "grpcio-1.66.1-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:f517fd7259fe823ef3bd21e508b653d5492e706e9f0ef82c16ce3347a8a5620c"}, - {file = "grpcio-1.66.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1fe60d0772831d96d263b53d83fb9a3d050a94b0e94b6d004a5ad111faa5b5b"}, - {file = "grpcio-1.66.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31a049daa428f928f21090403e5d18ea02670e3d5d172581670be006100db9ef"}, - {file = "grpcio-1.66.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:6f914386e52cbdeb5d2a7ce3bf1fdfacbe9d818dd81b6099a05b741aaf3848bb"}, - {file = "grpcio-1.66.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bff2096bdba686019fb32d2dde45b95981f0d1490e054400f70fc9a8af34b49d"}, - {file = "grpcio-1.66.1-cp38-cp38-win32.whl", hash = "sha256:aa8ba945c96e73de29d25331b26f3e416e0c0f621e984a3ebdb2d0d0b596a3b3"}, - {file = "grpcio-1.66.1-cp38-cp38-win_amd64.whl", hash = "sha256:161d5c535c2bdf61b95080e7f0f017a1dfcb812bf54093e71e5562b16225b4ce"}, - {file = "grpcio-1.66.1-cp39-cp39-linux_armv7l.whl", hash = "sha256:d0cd7050397b3609ea51727b1811e663ffda8bda39c6a5bb69525ef12414b503"}, - {file = 
"grpcio-1.66.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:0e6c9b42ded5d02b6b1fea3a25f036a2236eeb75d0579bfd43c0018c88bf0a3e"}, - {file = "grpcio-1.66.1-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:c9f80f9fad93a8cf71c7f161778ba47fd730d13a343a46258065c4deb4b550c0"}, - {file = "grpcio-1.66.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5dd67ed9da78e5121efc5c510f0122a972216808d6de70953a740560c572eb44"}, - {file = "grpcio-1.66.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48b0d92d45ce3be2084b92fb5bae2f64c208fea8ceed7fccf6a7b524d3c4942e"}, - {file = "grpcio-1.66.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4d813316d1a752be6f5c4360c49f55b06d4fe212d7df03253dfdae90c8a402bb"}, - {file = "grpcio-1.66.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:9c9bebc6627873ec27a70fc800f6083a13c70b23a5564788754b9ee52c5aef6c"}, - {file = "grpcio-1.66.1-cp39-cp39-win32.whl", hash = "sha256:30a1c2cf9390c894c90bbc70147f2372130ad189cffef161f0432d0157973f45"}, - {file = "grpcio-1.66.1-cp39-cp39-win_amd64.whl", hash = "sha256:17663598aadbedc3cacd7bbde432f541c8e07d2496564e22b214b22c7523dac8"}, - {file = "grpcio-1.66.1.tar.gz", hash = "sha256:35334f9c9745add3e357e3372756fd32d925bd52c41da97f4dfdafbde0bf0ee2"}, + {file = "grpcio-1.67.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:bd79929b3bb96b54df1296cd3bf4d2b770bd1df6c2bdf549b49bab286b925cdc"}, + {file = "grpcio-1.67.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:16724ffc956ea42967f5758c2f043faef43cb7e48a51948ab593570570d1e68b"}, + {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:2b7183c80b602b0ad816315d66f2fb7887614ead950416d60913a9a71c12560d"}, + {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:efe32b45dd6d118f5ea2e5deaed417d8a14976325c93812dd831908522b402c9"}, + {file = "grpcio-1.67.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe89295219b9c9e47780a0f1c75ca44211e706d1c598242249fe717af3385ec8"}, + {file = "grpcio-1.67.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:aa8d025fae1595a207b4e47c2e087cb88d47008494db258ac561c00877d4c8f8"}, + {file = "grpcio-1.67.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f95e15db43e75a534420e04822df91f645664bf4ad21dfaad7d51773c80e6bb4"}, + {file = "grpcio-1.67.0-cp310-cp310-win32.whl", hash = "sha256:a6b9a5c18863fd4b6624a42e2712103fb0f57799a3b29651c0e5b8119a519d65"}, + {file = "grpcio-1.67.0-cp310-cp310-win_amd64.whl", hash = "sha256:b6eb68493a05d38b426604e1dc93bfc0137c4157f7ab4fac5771fd9a104bbaa6"}, + {file = "grpcio-1.67.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:e91d154689639932305b6ea6f45c6e46bb51ecc8ea77c10ef25aa77f75443ad4"}, + {file = "grpcio-1.67.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:cb204a742997277da678611a809a8409657b1398aaeebf73b3d9563b7d154c13"}, + {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_aarch64.whl", hash = "sha256:ae6de510f670137e755eb2a74b04d1041e7210af2444103c8c95f193340d17ee"}, + {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74b900566bdf68241118f2918d312d3bf554b2ce0b12b90178091ea7d0a17b3d"}, + {file = "grpcio-1.67.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4e95e43447a02aa603abcc6b5e727d093d161a869c83b073f50b9390ecf0fa8"}, + {file = "grpcio-1.67.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:0bb94e66cd8f0baf29bd3184b6aa09aeb1a660f9ec3d85da615c5003154bc2bf"}, + {file = 
"grpcio-1.67.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:82e5bd4b67b17c8c597273663794a6a46a45e44165b960517fe6d8a2f7f16d23"}, + {file = "grpcio-1.67.0-cp311-cp311-win32.whl", hash = "sha256:7fc1d2b9fd549264ae585026b266ac2db53735510a207381be509c315b4af4e8"}, + {file = "grpcio-1.67.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac11ecb34a86b831239cc38245403a8de25037b448464f95c3315819e7519772"}, + {file = "grpcio-1.67.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:227316b5631260e0bef8a3ce04fa7db4cc81756fea1258b007950b6efc90c05d"}, + {file = "grpcio-1.67.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:d90cfdafcf4b45a7a076e3e2a58e7bc3d59c698c4f6470b0bb13a4d869cf2273"}, + {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_aarch64.whl", hash = "sha256:77196216d5dd6f99af1c51e235af2dd339159f657280e65ce7e12c1a8feffd1d"}, + {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15c05a26a0f7047f720da41dc49406b395c1470eef44ff7e2c506a47ac2c0591"}, + {file = "grpcio-1.67.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3840994689cc8cbb73d60485c594424ad8adb56c71a30d8948d6453083624b52"}, + {file = "grpcio-1.67.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:5a1e03c3102b6451028d5dc9f8591131d6ab3c8a0e023d94c28cb930ed4b5f81"}, + {file = "grpcio-1.67.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:682968427a63d898759474e3b3178d42546e878fdce034fd7474ef75143b64e3"}, + {file = "grpcio-1.67.0-cp312-cp312-win32.whl", hash = "sha256:d01793653248f49cf47e5695e0a79805b1d9d4eacef85b310118ba1dfcd1b955"}, + {file = "grpcio-1.67.0-cp312-cp312-win_amd64.whl", hash = "sha256:985b2686f786f3e20326c4367eebdaed3e7aa65848260ff0c6644f817042cb15"}, + {file = "grpcio-1.67.0-cp313-cp313-linux_armv7l.whl", hash = "sha256:8c9a35b8bc50db35ab8e3e02a4f2a35cfba46c8705c3911c34ce343bd777813a"}, + {file = "grpcio-1.67.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:42199e704095b62688998c2d84c89e59a26a7d5d32eed86d43dc90e7a3bd04aa"}, + {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_aarch64.whl", hash = "sha256:c4c425f440fb81f8d0237c07b9322fc0fb6ee2b29fbef5f62a322ff8fcce240d"}, + {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:323741b6699cd2b04a71cb38f502db98f90532e8a40cb675393d248126a268af"}, + {file = "grpcio-1.67.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:662c8e105c5e5cee0317d500eb186ed7a93229586e431c1bf0c9236c2407352c"}, + {file = "grpcio-1.67.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:f6bd2ab135c64a4d1e9e44679a616c9bc944547357c830fafea5c3caa3de5153"}, + {file = "grpcio-1.67.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:2f55c1e0e2ae9bdd23b3c63459ee4c06d223b68aeb1961d83c48fb63dc29bc03"}, + {file = "grpcio-1.67.0-cp313-cp313-win32.whl", hash = "sha256:fd6bc27861e460fe28e94226e3673d46e294ca4673d46b224428d197c5935e69"}, + {file = "grpcio-1.67.0-cp313-cp313-win_amd64.whl", hash = "sha256:cf51d28063338608cd8d3cd64677e922134837902b70ce00dad7f116e3998210"}, + {file = "grpcio-1.67.0-cp38-cp38-linux_armv7l.whl", hash = "sha256:7f200aca719c1c5dc72ab68be3479b9dafccdf03df530d137632c534bb6f1ee3"}, + {file = "grpcio-1.67.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:0892dd200ece4822d72dd0952f7112c542a487fc48fe77568deaaa399c1e717d"}, + {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_aarch64.whl", hash = "sha256:f4d613fbf868b2e2444f490d18af472ccb47660ea3df52f068c9c8801e1f3e85"}, + {file = 
"grpcio-1.67.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0c69bf11894cad9da00047f46584d5758d6ebc9b5950c0dc96fec7e0bce5cde9"}, + {file = "grpcio-1.67.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9bca3ca0c5e74dea44bf57d27e15a3a3996ce7e5780d61b7c72386356d231db"}, + {file = "grpcio-1.67.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:014dfc020e28a0d9be7e93a91f85ff9f4a87158b7df9952fe23cc42d29d31e1e"}, + {file = "grpcio-1.67.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d4ea4509d42c6797539e9ec7496c15473177ce9abc89bc5c71e7abe50fc25737"}, + {file = "grpcio-1.67.0-cp38-cp38-win32.whl", hash = "sha256:9d75641a2fca9ae1ae86454fd25d4c298ea8cc195dbc962852234d54a07060ad"}, + {file = "grpcio-1.67.0-cp38-cp38-win_amd64.whl", hash = "sha256:cff8e54d6a463883cda2fab94d2062aad2f5edd7f06ae3ed030f2a74756db365"}, + {file = "grpcio-1.67.0-cp39-cp39-linux_armv7l.whl", hash = "sha256:62492bd534979e6d7127b8a6b29093161a742dee3875873e01964049d5250a74"}, + {file = "grpcio-1.67.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eef1dce9d1a46119fd09f9a992cf6ab9d9178b696382439446ca5f399d7b96fe"}, + {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_aarch64.whl", hash = "sha256:f623c57a5321461c84498a99dddf9d13dac0e40ee056d884d6ec4ebcab647a78"}, + {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54d16383044e681f8beb50f905249e4e7261dd169d4aaf6e52eab67b01cbbbe2"}, + {file = "grpcio-1.67.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b2a44e572fb762c668e4812156b81835f7aba8a721b027e2d4bb29fb50ff4d33"}, + {file = "grpcio-1.67.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:391df8b0faac84d42f5b8dfc65f5152c48ed914e13c522fd05f2aca211f8bfad"}, + {file = "grpcio-1.67.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:cfd9306511fdfc623a1ba1dc3bc07fbd24e6cfbe3c28b4d1e05177baa2f99617"}, + {file = "grpcio-1.67.0-cp39-cp39-win32.whl", hash = "sha256:30d47dbacfd20cbd0c8be9bfa52fdb833b395d4ec32fe5cff7220afc05d08571"}, + {file = "grpcio-1.67.0-cp39-cp39-win_amd64.whl", hash = "sha256:f55f077685f61f0fbd06ea355142b71e47e4a26d2d678b3ba27248abfe67163a"}, + {file = "grpcio-1.67.0.tar.gz", hash = "sha256:e090b2553e0da1c875449c8e75073dd4415dd71c9bde6a406240fdf4c0ee467c"}, ] [package.extras] -protobuf = ["grpcio-tools (>=1.66.1)"] +protobuf = ["grpcio-tools (>=1.67.0)"] [[package]] name = "grpcio-reflection" @@ -1018,13 +1042,13 @@ format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339- [[package]] name = "jsonschema-specifications" -version = "2023.12.1" +version = "2024.10.1" description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" optional = true -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "jsonschema_specifications-2023.12.1-py3-none-any.whl", hash = "sha256:87e4fdf3a94858b8a2ba2778d9ba57d8a9cafca7c7489c46ba0d30a8bc6a9c3c"}, - {file = "jsonschema_specifications-2023.12.1.tar.gz", hash = "sha256:48a76787b3e70f5ed53f1160d2b81f586e4ca6d1548c5de7085d1682674764cc"}, + {file = "jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf"}, + {file = "jsonschema_specifications-2024.10.1.tar.gz", hash = "sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272"}, ] [package.dependencies] @@ -1121,81 +1145,82 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] [[package]] name = "markupsafe" -version = "2.1.5" 
+version = "3.0.2" description = "Safely add untrusted strings to HTML/XML markup." optional = true -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"}, - {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"}, - {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = 
"sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"}, - {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"}, - {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"}, - {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"}, - {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"}, - {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_i686.whl", hash = 
"sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win32.whl", hash = "sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50"}, + {file = "MarkupSafe-3.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win32.whl", hash = "sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d"}, + {file = "MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48"}, + {file = "MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30"}, + {file = 
"MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1"}, + {file = "MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6"}, + {file = "MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = 
"sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win32.whl", hash = "sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f"}, + {file = "MarkupSafe-3.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a"}, + {file = "markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0"}, ] [[package]] name = "marlin-kernels" -version = "0.2.0" +version = "0.3.0" description = "Marlin quantization kernels" optional = true python-versions = ">=3.7" files = [ - {file = "marlin_kernels-0.2.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:9a5afcf19b0f5917e43353cc19873fb3c4d4d0b924e2a95a37884f9ce208d0bd"}, + {file = "marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", hash = "sha256:a2086b9e98d22071f52c5b4b4b98b1b4a988565258905173fa74c5a9eddd1a0a"}, ] [package.dependencies] @@ -1203,16 +1228,16 @@ torch = "*" [package.source] type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl" [[package]] name = "marlin-kernels" -version = "0.2.0" +version = "0.3.0" description = "Marlin quantization kernels" optional = true python-versions = ">=3.7" files = [ - {file = "marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:1e64fcc7ebadfaffa60091ee9201ae3daaf5c1be3be60c8c054143a3dcb72d5d"}, + {file = "marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", hash = "sha256:f39a6946d8247629446ec170832d832c7038c363f1d8803211fe67249c2d804d"}, ] [package.dependencies] @@ -1220,16 +1245,16 @@ torch = "*" [package.source] type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl" [[package]] name = "marlin-kernels" -version = "0.2.0" +version = "0.3.0" description = "Marlin quantization kernels" optional = true python-versions = ">=3.7" files = [ - {file = "marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:e75f3ce9b1c13a4ed43a380d88e1d34d297259452db037ec1973ec33dc2eb78e"}, + {file = 
"marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", hash = "sha256:07fd869d5289777fa866107dae676523e18b1f6ba4afce79946ddc58a6870169"}, ] [package.dependencies] @@ -1237,16 +1262,16 @@ torch = "*" [package.source] type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl" [[package]] name = "marlin-kernels" -version = "0.2.0" +version = "0.3.0" description = "Marlin quantization kernels" optional = true python-versions = ">=3.7" files = [ - {file = "marlin_kernels-0.2.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:2f99a27f70b391887ee6adffeeee7c3f4df7fac37393f9fb16d4cace2b3f6457"}, + {file = "marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", hash = "sha256:0dedaa418225d490a5f1d8f85dbc75e439a8c43a8870e4ef32945bf61672d7dc"}, ] [package.dependencies] @@ -1254,7 +1279,7 @@ torch = "*" [package.source] type = "url" -url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" +url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl" [[package]] name = "mdurl" @@ -1598,46 +1623,50 @@ files = [ [[package]] name = "nvidia-cublas-cu12" -version = "12.1.3.1" +version = "12.4.5.8" description = "CUBLAS native runtime libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl", hash = "sha256:ee53ccca76a6fc08fb9701aa95b6ceb242cdaab118c3bb152af4e579af792728"}, - {file = "nvidia_cublas_cu12-12.1.3.1-py3-none-win_amd64.whl", hash = "sha256:2b964d60e8cf11b5e1073d179d85fa340c120e99b3067558f3cf98dd69d02906"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0f8aa1706812e00b9f19dfe0cdb3999b092ccb8ca168c0db5b8ea712456fd9b3"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl", hash = "sha256:2fc8da60df463fdefa81e323eef2e36489e1c94335b5358bcb38360adf75ac9b"}, + {file = "nvidia_cublas_cu12-12.4.5.8-py3-none-win_amd64.whl", hash = "sha256:5a796786da89203a0657eda402bcdcec6180254a8ac22d72213abc42069522dc"}, ] [[package]] name = "nvidia-cuda-cupti-cu12" -version = "12.1.105" +version = "12.4.127" description = "CUDA profiling tools runtime libs." 
optional = true python-versions = ">=3" files = [ - {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:e54fde3983165c624cb79254ae9818a456eb6e87a7fd4d56a2352c24ee542d7e"}, - {file = "nvidia_cuda_cupti_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:bea8236d13a0ac7190bd2919c3e8e6ce1e402104276e6f9694479e48bb0eb2a4"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:79279b35cf6f91da114182a5ce1864997fd52294a87a16179ce275773799458a"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:9dec60f5ac126f7bb551c055072b69d85392b13311fcc1bcda2202d172df30fb"}, + {file = "nvidia_cuda_cupti_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:5688d203301ab051449a2b1cb6690fbe90d2b372f411521c86018b950f3d7922"}, ] [[package]] name = "nvidia-cuda-nvrtc-cu12" -version = "12.1.105" +version = "12.4.127" description = "NVRTC native runtime libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:339b385f50c309763ca65456ec75e17bbefcbbf2893f462cb8b90584cd27a1c2"}, - {file = "nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:0a98a522d9ff138b96c010a65e145dc1b4850e9ecb75a0172371793752fd46ed"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0eedf14185e04b76aa05b1fea04133e59f465b6f960c0cbf4e37c3cb6b0ea198"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a178759ebb095827bd30ef56598ec182b85547f1508941a3d560eb7ea1fbf338"}, + {file = "nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:a961b2f1d5f17b14867c619ceb99ef6fcec12e46612711bcec78eb05068a60ec"}, ] [[package]] name = "nvidia-cuda-runtime-cu12" -version = "12.1.105" +version = "12.4.127" description = "CUDA Runtime native Libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:6e258468ddf5796e25f1dc591a31029fa317d97a0a94ed93468fc86301d61e40"}, - {file = "nvidia_cuda_runtime_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:dfb46ef84d73fababab44cf03e3b83f80700d27ca300e537f85f636fac474344"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:961fe0e2e716a2a1d967aab7caee97512f71767f852f67432d572e36cb3a11f3"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:64403288fa2136ee8e467cdc9c9427e0434110899d07c779f25b5c068934faa5"}, + {file = "nvidia_cuda_runtime_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:09c2e35f48359752dfa822c09918211844a3d93c100a715d79b59591130c5e1e"}, ] [[package]] @@ -1656,35 +1685,41 @@ nvidia-cublas-cu12 = "*" [[package]] name = "nvidia-cufft-cu12" -version = "11.0.2.54" +version = "11.2.1.3" description = "CUFFT native runtime libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl", hash = "sha256:794e3948a1aa71fd817c3775866943936774d1c14e7628c74f6f7417224cdf56"}, - {file = "nvidia_cufft_cu12-11.0.2.54-py3-none-win_amd64.whl", hash = "sha256:d9ac353f78ff89951da4af698f80870b1534ed69993f10a4cf1d96f21357e253"}, + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:5dad8008fc7f92f5ddfa2101430917ce2ffacd86824914c82e28990ad7f00399"}, + {file = 
"nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9"}, + {file = "nvidia_cufft_cu12-11.2.1.3-py3-none-win_amd64.whl", hash = "sha256:d802f4954291101186078ccbe22fc285a902136f974d369540fd4a5333d1440b"}, ] +[package.dependencies] +nvidia-nvjitlink-cu12 = "*" + [[package]] name = "nvidia-curand-cu12" -version = "10.3.2.106" +version = "10.3.5.147" description = "CURAND native runtime libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:9d264c5036dde4e64f1de8c50ae753237c12e0b1348738169cd0f8a536c0e1e0"}, - {file = "nvidia_curand_cu12-10.3.2.106-py3-none-win_amd64.whl", hash = "sha256:75b6b0c574c0037839121317e17fd01f8a69fd2ef8e25853d826fec30bdba74a"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1f173f09e3e3c76ab084aba0de819c49e56614feae5c12f69883f4ae9bb5fad9"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a88f583d4e0bb643c49743469964103aa59f7f708d862c3ddb0fc07f851e3b8b"}, + {file = "nvidia_curand_cu12-10.3.5.147-py3-none-win_amd64.whl", hash = "sha256:f307cc191f96efe9e8f05a87096abc20d08845a841889ef78cb06924437f6771"}, ] [[package]] name = "nvidia-cusolver-cu12" -version = "11.4.5.107" +version = "11.6.1.9" description = "CUDA solver native runtime libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl", hash = "sha256:8a7ec542f0412294b15072fa7dab71d31334014a69f953004ea7a118206fe0dd"}, - {file = "nvidia_cusolver_cu12-11.4.5.107-py3-none-win_amd64.whl", hash = "sha256:74e0c3a24c78612192a74fcd90dd117f1cf21dea4822e66d89e8ea80e3cd2da5"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:d338f155f174f90724bbde3758b7ac375a70ce8e706d70b018dd3375545fc84e"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260"}, + {file = "nvidia_cusolver_cu12-11.6.1.9-py3-none-win_amd64.whl", hash = "sha256:e77314c9d7b694fcebc84f58989f3aa4fb4cb442f12ca1a9bde50f5e8f6d1b9c"}, ] [package.dependencies] @@ -1694,13 +1729,14 @@ nvidia-nvjitlink-cu12 = "*" [[package]] name = "nvidia-cusparse-cu12" -version = "12.1.0.106" +version = "12.3.1.170" description = "CUSPARSE native runtime libraries" optional = true python-versions = ">=3" files = [ - {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl", hash = "sha256:f3b50f42cf363f86ab21f720998517a659a48131e8d538dc02f8768237bd884c"}, - {file = "nvidia_cusparse_cu12-12.1.0.106-py3-none-win_amd64.whl", hash = "sha256:b798237e81b9719373e8fae8d4f091b70a0cf09d9d85c95a557e11df2d8e9a5a"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_aarch64.whl", hash = "sha256:9d32f62896231ebe0480efd8a7f702e143c98cfaa0e8a76df3386c1ba2b54df3"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1"}, + {file = "nvidia_cusparse_cu12-12.3.1.170-py3-none-win_amd64.whl", hash = "sha256:9bc90fb087bc7b4c15641521f31c0371e9a612fc2ba12c338d3ae032e6b6797f"}, ] [package.dependencies] @@ -1719,36 +1755,35 @@ files = [ [[package]] name = "nvidia-nccl-cu12" -version = "2.20.5" +version = "2.21.5" description = "NVIDIA Collective Communication Library (NCCL) Runtime" optional 
= true python-versions = ">=3" files = [ - {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_aarch64.whl", hash = "sha256:1fc150d5c3250b170b29410ba682384b14581db722b2531b0d8d33c595f33d01"}, - {file = "nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:057f6bf9685f75215d0c53bf3ac4a10b3e6578351de307abad9e18a99182af56"}, + {file = "nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8579076d30a8c24988834445f8d633c697d42397e92ffc3f63fa26766d25e0a0"}, ] [[package]] name = "nvidia-nvjitlink-cu12" -version = "12.6.68" +version = "12.4.127" description = "Nvidia JIT LTO Library" optional = true python-versions = ">=3" files = [ - {file = "nvidia_nvjitlink_cu12-12.6.68-py3-none-manylinux2014_aarch64.whl", hash = "sha256:b3fd0779845f68b92063ab1393abab1ed0a23412fc520df79a8190d098b5cd6b"}, - {file = "nvidia_nvjitlink_cu12-12.6.68-py3-none-manylinux2014_x86_64.whl", hash = "sha256:125a6c2a44e96386dda634e13d944e60b07a0402d391a070e8fb4104b34ea1ab"}, - {file = "nvidia_nvjitlink_cu12-12.6.68-py3-none-win_amd64.whl", hash = "sha256:a55744c98d70317c5e23db14866a8cc2b733f7324509e941fc96276f9f37801d"}, + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"}, + {file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1"}, ] [[package]] name = "nvidia-nvtx-cu12" -version = "12.1.105" +version = "12.4.127" description = "NVIDIA Tools Extension" optional = true python-versions = ">=3" files = [ - {file = "nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl", hash = "sha256:dc21cf308ca5691e7c04d962e213f8a4aa9bbfa23d95412f452254c2caeb09e5"}, - {file = "nvidia_nvtx_cu12-12.1.105-py3-none-win_amd64.whl", hash = "sha256:65f4d98982b31b60026e0e6de73fbdfc09d08a96f4656dd3665ca616a11e1e82"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7959ad635db13edf4fc65c06a6e9f9e55fc2f92596db928d169c0bb031e88ef3"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:781e950d9b9f60d8241ccea575b32f5105a5baf4c2351cab5256a24869f12a1a"}, + {file = "nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485"}, ] [[package]] @@ -2200,6 +2235,113 @@ files = [ [package.extras] twisted = ["twisted"] +[[package]] +name = "propcache" +version = "0.2.0" +description = "Accelerated property cache" +optional = true +python-versions = ">=3.8" +files = [ + {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c5869b8fd70b81835a6f187c5fdbe67917a04d7e52b6e7cc4e5fe39d55c39d58"}, + {file = "propcache-0.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:952e0d9d07609d9c5be361f33b0d6d650cd2bae393aabb11d9b719364521984b"}, + {file = "propcache-0.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:33ac8f098df0585c0b53009f039dfd913b38c1d2edafed0cedcc0c32a05aa110"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:97e48e8875e6c13909c800fa344cd54cc4b2b0db1d5f911f840458a500fde2c2"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388f3217649d6d59292b722d940d4d2e1e6a7003259eb835724092a1cca0203a"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:f571aea50ba5623c308aa146eb650eebf7dbe0fd8c5d946e28343cb3b5aad577"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3dfafb44f7bb35c0c06eda6b2ab4bfd58f02729e7c4045e179f9a861b07c9850"}, + {file = "propcache-0.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3ebe9a75be7ab0b7da2464a77bb27febcb4fab46a34f9288f39d74833db7f61"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d2f0d0f976985f85dfb5f3d685697ef769faa6b71993b46b295cdbbd6be8cc37"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:a3dc1a4b165283bd865e8f8cb5f0c64c05001e0718ed06250d8cac9bec115b48"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9e0f07b42d2a50c7dd2d8675d50f7343d998c64008f1da5fef888396b7f84630"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e63e3e1e0271f374ed489ff5ee73d4b6e7c60710e1f76af5f0e1a6117cd26394"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:56bb5c98f058a41bb58eead194b4db8c05b088c93d94d5161728515bd52b052b"}, + {file = "propcache-0.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7665f04d0c7f26ff8bb534e1c65068409bf4687aa2534faf7104d7182debb336"}, + {file = "propcache-0.2.0-cp310-cp310-win32.whl", hash = "sha256:7cf18abf9764746b9c8704774d8b06714bcb0a63641518a3a89c7f85cc02c2ad"}, + {file = "propcache-0.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:cfac69017ef97db2438efb854edf24f5a29fd09a536ff3a992b75990720cdc99"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:63f13bf09cc3336eb04a837490b8f332e0db41da66995c9fd1ba04552e516354"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608cce1da6f2672a56b24a015b42db4ac612ee709f3d29f27a00c943d9e851de"}, + {file = "propcache-0.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:466c219deee4536fbc83c08d09115249db301550625c7fef1c5563a584c9bc87"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc2db02409338bf36590aa985a461b2c96fce91f8e7e0f14c50c5fcc4f229016"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a6ed8db0a556343d566a5c124ee483ae113acc9a557a807d439bcecc44e7dfbb"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:91997d9cb4a325b60d4e3f20967f8eb08dfcb32b22554d5ef78e6fd1dda743a2"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c7dde9e533c0a49d802b4f3f218fa9ad0a1ce21f2c2eb80d5216565202acab4"}, + {file = "propcache-0.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffcad6c564fe6b9b8916c1aefbb37a362deebf9394bd2974e9d84232e3e08504"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:97a58a28bcf63284e8b4d7b460cbee1edaab24634e82059c7b8c09e65284f178"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:945db8ee295d3af9dbdbb698cce9bbc5c59b5c3fe328bbc4387f59a8a35f998d"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:39e104da444a34830751715f45ef9fc537475ba21b7f1f5b0f4d71a3b60d7fe2"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c5ecca8f9bab618340c8e848d340baf68bcd8ad90a8ecd7a4524a81c1764b3db"}, + {file = 
"propcache-0.2.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:c436130cc779806bdf5d5fae0d848713105472b8566b75ff70048c47d3961c5b"}, + {file = "propcache-0.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:191db28dc6dcd29d1a3e063c3be0b40688ed76434622c53a284e5427565bbd9b"}, + {file = "propcache-0.2.0-cp311-cp311-win32.whl", hash = "sha256:5f2564ec89058ee7c7989a7b719115bdfe2a2fb8e7a4543b8d1c0cc4cf6478c1"}, + {file = "propcache-0.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e2e54267980349b723cff366d1e29b138b9a60fa376664a157a342689553f71"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2ee7606193fb267be4b2e3b32714f2d58cad27217638db98a60f9efb5efeccc2"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:91ee8fc02ca52e24bcb77b234f22afc03288e1dafbb1f88fe24db308910c4ac7"}, + {file = "propcache-0.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2e900bad2a8456d00a113cad8c13343f3b1f327534e3589acc2219729237a2e8"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f52a68c21363c45297aca15561812d542f8fc683c85201df0bebe209e349f793"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e41d67757ff4fbc8ef2af99b338bfb955010444b92929e9e55a6d4dcc3c4f09"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a64e32f8bd94c105cc27f42d3b658902b5bcc947ece3c8fe7bc1b05982f60e89"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55346705687dbd7ef0d77883ab4f6fabc48232f587925bdaf95219bae072491e"}, + {file = "propcache-0.2.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:00181262b17e517df2cd85656fcd6b4e70946fe62cd625b9d74ac9977b64d8d9"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6994984550eaf25dd7fc7bd1b700ff45c894149341725bb4edc67f0ffa94efa4"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:56295eb1e5f3aecd516d91b00cfd8bf3a13991de5a479df9e27dd569ea23959c"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:439e76255daa0f8151d3cb325f6dd4a3e93043e6403e6491813bcaaaa8733887"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f6475a1b2ecb310c98c28d271a30df74f9dd436ee46d09236a6b750a7599ce57"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:3444cdba6628accf384e349014084b1cacd866fbb88433cd9d279d90a54e0b23"}, + {file = "propcache-0.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:4a9d9b4d0a9b38d1c391bb4ad24aa65f306c6f01b512e10a8a34a2dc5675d348"}, + {file = "propcache-0.2.0-cp312-cp312-win32.whl", hash = "sha256:69d3a98eebae99a420d4b28756c8ce6ea5a29291baf2dc9ff9414b42676f61d5"}, + {file = "propcache-0.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:ad9c9b99b05f163109466638bd30ada1722abb01bbb85c739c50b6dc11f92dc3"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ecddc221a077a8132cf7c747d5352a15ed763b674c0448d811f408bf803d9ad7"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0e53cb83fdd61cbd67202735e6a6687a7b491c8742dfc39c9e01e80354956763"}, + {file = "propcache-0.2.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92fe151145a990c22cbccf9ae15cae8ae9eddabfc949a219c9f667877e40853d"}, + {file = 
"propcache-0.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6a21ef516d36909931a2967621eecb256018aeb11fc48656e3257e73e2e247a"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f88a4095e913f98988f5b338c1d4d5d07dbb0b6bad19892fd447484e483ba6b"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a5b3bb545ead161be780ee85a2b54fdf7092815995661947812dde94a40f6fb"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67aeb72e0f482709991aa91345a831d0b707d16b0257e8ef88a2ad246a7280bf"}, + {file = "propcache-0.2.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c997f8c44ec9b9b0bcbf2d422cc00a1d9b9c681f56efa6ca149a941e5560da2"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:2a66df3d4992bc1d725b9aa803e8c5a66c010c65c741ad901e260ece77f58d2f"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:3ebbcf2a07621f29638799828b8d8668c421bfb94c6cb04269130d8de4fb7136"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1235c01ddaa80da8235741e80815ce381c5267f96cc49b1477fdcf8c047ef325"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3947483a381259c06921612550867b37d22e1df6d6d7e8361264b6d037595f44"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d5bed7f9805cc29c780f3aee05de3262ee7ce1f47083cfe9f77471e9d6777e83"}, + {file = "propcache-0.2.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:e4a91d44379f45f5e540971d41e4626dacd7f01004826a18cb048e7da7e96544"}, + {file = "propcache-0.2.0-cp313-cp313-win32.whl", hash = "sha256:f902804113e032e2cdf8c71015651c97af6418363bea8d78dc0911d56c335032"}, + {file = "propcache-0.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:8f188cfcc64fb1266f4684206c9de0e80f54622c3f22a910cbd200478aeae61e"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:53d1bd3f979ed529f0805dd35ddaca330f80a9a6d90bc0121d2ff398f8ed8861"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:83928404adf8fb3d26793665633ea79b7361efa0287dfbd372a7e74311d51ee6"}, + {file = "propcache-0.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:77a86c261679ea5f3896ec060be9dc8e365788248cc1e049632a1be682442063"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:218db2a3c297a3768c11a34812e63b3ac1c3234c3a086def9c0fee50d35add1f"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7735e82e3498c27bcb2d17cb65d62c14f1100b71723b68362872bca7d0913d90"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:20a617c776f520c3875cf4511e0d1db847a076d720714ae35ffe0df3e440be68"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67b69535c870670c9f9b14a75d28baa32221d06f6b6fa6f77a0a13c5a7b0a5b9"}, + {file = "propcache-0.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4569158070180c3855e9c0791c56be3ceeb192defa2cdf6a3f39e54319e56b89"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:db47514ffdbd91ccdc7e6f8407aac4ee94cc871b15b577c1c324236b013ddd04"}, + {file = 
"propcache-0.2.0-cp38-cp38-musllinux_1_2_armv7l.whl", hash = "sha256:2a60ad3e2553a74168d275a0ef35e8c0a965448ffbc3b300ab3a5bb9956c2162"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:662dd62358bdeaca0aee5761de8727cfd6861432e3bb828dc2a693aa0471a563"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:25a1f88b471b3bc911d18b935ecb7115dff3a192b6fef46f0bfaf71ff4f12418"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:f60f0ac7005b9f5a6091009b09a419ace1610e163fa5deaba5ce3484341840e7"}, + {file = "propcache-0.2.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:74acd6e291f885678631b7ebc85d2d4aec458dd849b8c841b57ef04047833bed"}, + {file = "propcache-0.2.0-cp38-cp38-win32.whl", hash = "sha256:d9b6ddac6408194e934002a69bcaadbc88c10b5f38fb9307779d1c629181815d"}, + {file = "propcache-0.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:676135dcf3262c9c5081cc8f19ad55c8a64e3f7282a21266d05544450bffc3a5"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:25c8d773a62ce0451b020c7b29a35cfbc05de8b291163a7a0f3b7904f27253e6"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:375a12d7556d462dc64d70475a9ee5982465fbb3d2b364f16b86ba9135793638"}, + {file = "propcache-0.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1ec43d76b9677637a89d6ab86e1fef70d739217fefa208c65352ecf0282be957"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f45eec587dafd4b2d41ac189c2156461ebd0c1082d2fe7013571598abb8505d1"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc092ba439d91df90aea38168e11f75c655880c12782facf5cf9c00f3d42b562"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa1076244f54bb76e65e22cb6910365779d5c3d71d1f18b275f1dfc7b0d71b4d"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:682a7c79a2fbf40f5dbb1eb6bfe2cd865376deeac65acf9beb607505dced9e12"}, + {file = "propcache-0.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e40876731f99b6f3c897b66b803c9e1c07a989b366c6b5b475fafd1f7ba3fb8"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:363ea8cd3c5cb6679f1c2f5f1f9669587361c062e4899fce56758efa928728f8"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:140fbf08ab3588b3468932974a9331aff43c0ab8a2ec2c608b6d7d1756dbb6cb"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e70fac33e8b4ac63dfc4c956fd7d85a0b1139adcfc0d964ce288b7c527537fea"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:b33d7a286c0dc1a15f5fc864cc48ae92a846df287ceac2dd499926c3801054a6"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f6d5749fdd33d90e34c2efb174c7e236829147a2713334d708746e94c4bde40d"}, + {file = "propcache-0.2.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22aa8f2272d81d9317ff5756bb108021a056805ce63dd3630e27d042c8092798"}, + {file = "propcache-0.2.0-cp39-cp39-win32.whl", hash = "sha256:73e4b40ea0eda421b115248d7e79b59214411109a5bc47d0d48e4c73e3b8fcf9"}, + {file = "propcache-0.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:9517d5e9e0731957468c29dbfd0f976736a0e55afaea843726e887f36fe017df"}, + {file = "propcache-0.2.0-py3-none-any.whl", hash = 
"sha256:2ccc28197af5313706511fab3a8b66dcd6da067a1331372c82ea1cb74285e036"}, + {file = "propcache-0.2.0.tar.gz", hash = "sha256:df81779732feb9d01e5d513fad0122efb3d53bbc75f61b2a4f29a020bc985e70"}, +] + [[package]] name = "protobuf" version = "4.25.5" @@ -2222,32 +2364,33 @@ files = [ [[package]] name = "psutil" -version = "6.0.0" +version = "6.1.0" description = "Cross-platform lib for process and system monitoring in Python." optional = true python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ - {file = "psutil-6.0.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a021da3e881cd935e64a3d0a20983bda0bb4cf80e4f74fa9bfcb1bc5785360c6"}, - {file = "psutil-6.0.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:1287c2b95f1c0a364d23bc6f2ea2365a8d4d9b726a3be7294296ff7ba97c17f0"}, - {file = "psutil-6.0.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:a9a3dbfb4de4f18174528d87cc352d1f788b7496991cca33c6996f40c9e3c92c"}, - {file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6ec7588fb3ddaec7344a825afe298db83fe01bfaaab39155fa84cf1c0d6b13c3"}, - {file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:1e7c870afcb7d91fdea2b37c24aeb08f98b6d67257a5cb0a8bc3ac68d0f1a68c"}, - {file = "psutil-6.0.0-cp27-none-win32.whl", hash = "sha256:02b69001f44cc73c1c5279d02b30a817e339ceb258ad75997325e0e6169d8b35"}, - {file = "psutil-6.0.0-cp27-none-win_amd64.whl", hash = "sha256:21f1fb635deccd510f69f485b87433460a603919b45e2a324ad65b0cc74f8fb1"}, - {file = "psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0"}, - {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0"}, - {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd"}, - {file = "psutil-6.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2e8d0054fc88153ca0544f5c4d554d42e33df2e009c4ff42284ac9ebdef4132"}, - {file = "psutil-6.0.0-cp36-cp36m-win32.whl", hash = "sha256:fc8c9510cde0146432bbdb433322861ee8c3efbf8589865c8bf8d21cb30c4d14"}, - {file = "psutil-6.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:34859b8d8f423b86e4385ff3665d3f4d94be3cdf48221fbe476e883514fdb71c"}, - {file = "psutil-6.0.0-cp37-abi3-win32.whl", hash = "sha256:a495580d6bae27291324fe60cea0b5a7c23fa36a7cd35035a16d93bdcf076b9d"}, - {file = "psutil-6.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:33ea5e1c975250a720b3a6609c490db40dae5d83a4eb315170c4fe0d8b1f34b3"}, - {file = "psutil-6.0.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ffe7fc9b6b36beadc8c322f84e1caff51e8703b88eee1da46d1e3a6ae11b4fd0"}, - {file = "psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2"}, + {file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"}, + {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"}, + {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:000d1d1ebd634b4efb383f4034437384e44a6d455260aaee2eca1e9c1b55f047"}, + {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_i686.whl", hash = 
"sha256:5cd2bcdc75b452ba2e10f0e8ecc0b57b827dd5d7aaffbc6821b2a9a242823a76"}, + {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:045f00a43c737f960d273a83973b2511430d61f283a44c96bf13a6e829ba8fdc"}, + {file = "psutil-6.1.0-cp27-none-win32.whl", hash = "sha256:9118f27452b70bb1d9ab3198c1f626c2499384935aaf55388211ad982611407e"}, + {file = "psutil-6.1.0-cp27-none-win_amd64.whl", hash = "sha256:a8506f6119cff7015678e2bce904a4da21025cc70ad283a53b099e7620061d85"}, + {file = "psutil-6.1.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6e2dcd475ce8b80522e51d923d10c7871e45f20918e027ab682f94f1c6351688"}, + {file = "psutil-6.1.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0895b8414afafc526712c498bd9de2b063deaac4021a3b3c34566283464aff8e"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dcbfce5d89f1d1f2546a2090f4fcf87c7f669d1d90aacb7d7582addece9fb38"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:498c6979f9c6637ebc3a73b3f87f9eb1ec24e1ce53a7c5173b8508981614a90b"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d905186d647b16755a800e7263d43df08b790d709d575105d419f8b6ef65423a"}, + {file = "psutil-6.1.0-cp36-cp36m-win32.whl", hash = "sha256:6d3fbbc8d23fcdcb500d2c9f94e07b1342df8ed71b948a2649b5cb060a7c94ca"}, + {file = "psutil-6.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1209036fbd0421afde505a4879dee3b2fd7b1e14fee81c0069807adcbbcca747"}, + {file = "psutil-6.1.0-cp37-abi3-win32.whl", hash = "sha256:1ad45a1f5d0b608253b11508f80940985d1d0c8f6111b5cb637533a0e6ddc13e"}, + {file = "psutil-6.1.0-cp37-abi3-win_amd64.whl", hash = "sha256:a8fb3752b491d246034fa4d279ff076501588ce8cbcdbb62c32fd7a377d996be"}, + {file = "psutil-6.1.0.tar.gz", hash = "sha256:353815f59a7f64cdaca1c0307ee13558a0512f6db064e92fe833784f08539c7a"}, ] [package.extras] -test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "wheel"] +test = ["pytest", "pytest-xdist", "setuptools"] [[package]] name = "py-cpuinfo" @@ -2696,18 +2839,19 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "rich" -version = "13.8.1" +version = "13.9.3" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false -python-versions = ">=3.7.0" +python-versions = ">=3.8.0" files = [ - {file = "rich-13.8.1-py3-none-any.whl", hash = "sha256:1760a3c0848469b97b558fc61c85233e3dafb69c7a071b4d60c38099d3cd4c06"}, - {file = "rich-13.8.1.tar.gz", hash = "sha256:8260cda28e3db6bf04d2d1ef4dbc03ba80a824c88b0e7668a0f23126a424844a"}, + {file = "rich-13.9.3-py3-none-any.whl", hash = "sha256:9836f5096eb2172c9e77df411c1b009bace4193d6a481d534fea75ebba758283"}, + {file = "rich-13.9.3.tar.gz", hash = "sha256:bc1e01b899537598cf02579d2b9f4a415104d3fc439313a7a2c165d76557a08e"}, ] [package.dependencies] markdown-it-py = ">=2.2.0" pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.11\""} [package.extras] jupyter = ["ipywidgets (>=7.5.1,<9)"] @@ -3062,13 +3206,13 @@ files = [ [[package]] name = "setuptools" -version = "75.1.0" +version = "75.2.0" description = "Easily download, build, install, 
upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-75.1.0-py3-none-any.whl", hash = "sha256:35ab7fd3bcd95e6b7fd704e4a1539513edad446c097797f2985e0e4b960772f2"}, - {file = "setuptools-75.1.0.tar.gz", hash = "sha256:d59a21b17a275fb872a9c3dae73963160ae079f1049ed956880cd7c09b120538"}, + {file = "setuptools-75.2.0-py3-none-any.whl", hash = "sha256:a7fcb66f68b4d9e8e66b42f9876150a3371558f98fa32222ffaa5bced76406f8"}, + {file = "setuptools-75.2.0.tar.gz", hash = "sha256:753bb6ebf1f465a1912e19ed1d41f403a79173a9acf66a42e7e6aec45c3c16ec"}, ] [package.extras] @@ -3093,13 +3237,13 @@ files = [ [[package]] name = "sympy" -version = "1.13.3" +version = "1.13.1" description = "Computer algebra system (CAS) in Python" optional = true python-versions = ">=3.8" files = [ - {file = "sympy-1.13.3-py3-none-any.whl", hash = "sha256:54612cf55a62755ee71824ce692986f23c88ffa77207b30c1368eda4a7060f73"}, - {file = "sympy-1.13.3.tar.gz", hash = "sha256:b27fd2c6530e0ab39e275fc9b683895367e51d5da91baa8d3d64db2565fec4d9"}, + {file = "sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8"}, + {file = "sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f"}, ] [package.dependencies] @@ -3121,111 +3265,111 @@ files = [ [[package]] name = "tokenizers" -version = "0.20.0" +version = "0.20.1" description = "" optional = false python-versions = ">=3.7" files = [ - {file = "tokenizers-0.20.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:6cff5c5e37c41bc5faa519d6f3df0679e4b37da54ea1f42121719c5e2b4905c0"}, - {file = "tokenizers-0.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:62a56bf75c27443432456f4ca5ca055befa95e25be8a28141cc495cac8ae4d6d"}, - {file = "tokenizers-0.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68cc7de6a63f09c4a86909c2597b995aa66e19df852a23aea894929c74369929"}, - {file = "tokenizers-0.20.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:053c37ecee482cc958fdee53af3c6534286a86f5d35aac476f7c246830e53ae5"}, - {file = "tokenizers-0.20.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3d7074aaabc151a6363fa03db5493fc95b423b2a1874456783989e96d541c7b6"}, - {file = "tokenizers-0.20.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a11435780f2acd89e8fefe5e81cecf01776f6edb9b3ac95bcb76baee76b30b90"}, - {file = "tokenizers-0.20.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9a81cd2712973b007d84268d45fc3f6f90a79c31dfe7f1925e6732f8d2959987"}, - {file = "tokenizers-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d7dfd796ab9d909f76fb93080e1c7c8309f196ecb316eb130718cd5e34231c69"}, - {file = "tokenizers-0.20.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:8029ad2aa8cb00605c9374566034c1cc1b15130713e0eb5afcef6cface8255c9"}, - {file = "tokenizers-0.20.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ca4d54260ebe97d59dfa9a30baa20d0c4dd9137d99a8801700055c561145c24e"}, - {file = "tokenizers-0.20.0-cp310-none-win32.whl", hash = "sha256:95ee16b57cec11b86a7940174ec5197d506439b0f415ab3859f254b1dffe9df0"}, - {file = "tokenizers-0.20.0-cp310-none-win_amd64.whl", hash = "sha256:0a61a11e93eeadbf02aea082ffc75241c4198e0608bbbac4f65a9026851dcf37"}, - {file = "tokenizers-0.20.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = 
"sha256:6636b798b3c4d6c9b1af1a918bd07c867808e5a21c64324e95318a237e6366c3"}, - {file = "tokenizers-0.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5ec603e42eaf499ffd58b9258162add948717cf21372458132f14e13a6bc7172"}, - {file = "tokenizers-0.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cce124264903a8ea6f8f48e1cc7669e5ef638c18bd4ab0a88769d5f92debdf7f"}, - {file = "tokenizers-0.20.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07bbeba0231cf8de07aa6b9e33e9779ff103d47042eeeb859a8c432e3292fb98"}, - {file = "tokenizers-0.20.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:06c0ca8397b35d38b83a44a9c6929790c1692957d88541df061cb34d82ebbf08"}, - {file = "tokenizers-0.20.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ca6557ac3b83d912dfbb1f70ab56bd4b0594043916688e906ede09f42e192401"}, - {file = "tokenizers-0.20.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2a5ad94c9e80ac6098328bee2e3264dbced4c6faa34429994d473f795ec58ef4"}, - {file = "tokenizers-0.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b5c7f906ee6bec30a9dc20268a8b80f3b9584de1c9f051671cb057dc6ce28f6"}, - {file = "tokenizers-0.20.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:31e087e9ee1b8f075b002bfee257e858dc695f955b43903e1bb4aa9f170e37fe"}, - {file = "tokenizers-0.20.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c3124fb6f3346cb3d8d775375d3b429bf4dcfc24f739822702009d20a4297990"}, - {file = "tokenizers-0.20.0-cp311-none-win32.whl", hash = "sha256:a4bb8b40ba9eefa621fdcabf04a74aa6038ae3be0c614c6458bd91a4697a452f"}, - {file = "tokenizers-0.20.0-cp311-none-win_amd64.whl", hash = "sha256:2b709d371f1fe60a28ef0c5c67815952d455ca7f34dbe7197eaaed3cc54b658e"}, - {file = "tokenizers-0.20.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:15c81a17d0d66f4987c6ca16f4bea7ec253b8c7ed1bb00fdc5d038b1bb56e714"}, - {file = "tokenizers-0.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6a531cdf1fb6dc41c984c785a3b299cb0586de0b35683842a3afbb1e5207f910"}, - {file = "tokenizers-0.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:06caabeb4587f8404e0cd9d40f458e9cba3e815c8155a38e579a74ff3e2a4301"}, - {file = "tokenizers-0.20.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8768f964f23f5b9f50546c0369c75ab3262de926983888bbe8b98be05392a79c"}, - {file = "tokenizers-0.20.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:626403860152c816f97b649fd279bd622c3d417678c93b4b1a8909b6380b69a8"}, - {file = "tokenizers-0.20.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c1b88fa9e5ff062326f4bf82681da5a96fca7104d921a6bd7b1e6fcf224af26"}, - {file = "tokenizers-0.20.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d7e559436a07dc547f22ce1101f26d8b2fad387e28ec8e7e1e3b11695d681d8"}, - {file = "tokenizers-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e48afb75e50449848964e4a67b0da01261dd3aa8df8daecf10db8fd7f5b076eb"}, - {file = "tokenizers-0.20.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:baf5d0e1ff44710a95eefc196dd87666ffc609fd447c5e5b68272a7c3d342a1d"}, - {file = "tokenizers-0.20.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e5e56df0e8ed23ba60ae3848c3f069a0710c4b197218fe4f89e27eba38510768"}, - {file = "tokenizers-0.20.0-cp312-none-win32.whl", hash = 
"sha256:ec53e5ecc142a82432f9c6c677dbbe5a2bfee92b8abf409a9ecb0d425ee0ce75"}, - {file = "tokenizers-0.20.0-cp312-none-win_amd64.whl", hash = "sha256:f18661ece72e39c0dfaa174d6223248a15b457dbd4b0fc07809b8e6d3ca1a234"}, - {file = "tokenizers-0.20.0-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:f7065b1084d8d1a03dc89d9aad69bcbc8415d4bc123c367063eb32958cd85054"}, - {file = "tokenizers-0.20.0-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:e5d4069e4714e3f7ba0a4d3d44f9d84a432cd4e4aa85c3d7dd1f51440f12e4a1"}, - {file = "tokenizers-0.20.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:799b808529e54b7e1a36350bda2aeb470e8390e484d3e98c10395cee61d4e3c6"}, - {file = "tokenizers-0.20.0-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f9baa027cc8a281ad5f7725a93c204d7a46986f88edbe8ef7357f40a23fb9c7"}, - {file = "tokenizers-0.20.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:010ec7f3f7a96adc4c2a34a3ada41fa14b4b936b5628b4ff7b33791258646c6b"}, - {file = "tokenizers-0.20.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98d88f06155335b14fd78e32ee28ca5b2eb30fced4614e06eb14ae5f7fba24ed"}, - {file = "tokenizers-0.20.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e13eb000ef540c2280758d1b9cfa5fe424b0424ae4458f440e6340a4f18b2638"}, - {file = "tokenizers-0.20.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fab3cf066ff426f7e6d70435dc28a9ff01b2747be83810e397cba106f39430b0"}, - {file = "tokenizers-0.20.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:39fa3761b30a89368f322e5daf4130dce8495b79ad831f370449cdacfb0c0d37"}, - {file = "tokenizers-0.20.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c8da0fba4d179ddf2607821575998df3c294aa59aa8df5a6646dc64bc7352bce"}, - {file = "tokenizers-0.20.0-cp37-none-win32.whl", hash = "sha256:fada996d6da8cf213f6e3c91c12297ad4f6cdf7a85c2fadcd05ec32fa6846fcd"}, - {file = "tokenizers-0.20.0-cp37-none-win_amd64.whl", hash = "sha256:7d29aad702279e0760c265fcae832e89349078e3418dd329732d4503259fd6bd"}, - {file = "tokenizers-0.20.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:099c68207f3ef0227ecb6f80ab98ea74de559f7b124adc7b17778af0250ee90a"}, - {file = "tokenizers-0.20.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:68012d8a8cddb2eab3880870d7e2086cb359c7f7a2b03f5795044f5abff4e850"}, - {file = "tokenizers-0.20.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9253bdd209c6aee168deca7d0e780581bf303e0058f268f9bb06859379de19b6"}, - {file = "tokenizers-0.20.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8f868600ddbcb0545905ed075eb7218a0756bf6c09dae7528ea2f8436ebd2c93"}, - {file = "tokenizers-0.20.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9a9643d9c8c5f99b6aba43fd10034f77cc6c22c31f496d2f0ee183047d948fa0"}, - {file = "tokenizers-0.20.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c375c6a889aeab44734028bc65cc070acf93ccb0f9368be42b67a98e1063d3f6"}, - {file = "tokenizers-0.20.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e359f852328e254f070bbd09a19a568421d23388f04aad9f2fb7da7704c7228d"}, - {file = "tokenizers-0.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d98b01a309d4387f3b1c1dd68a8b8136af50376cf146c1b7e8d8ead217a5be4b"}, - {file = "tokenizers-0.20.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:459f7537119554c2899067dec1ac74a00d02beef6558f4ee2e99513bf6d568af"}, - {file = "tokenizers-0.20.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:392b87ec89452628c045c9f2a88bc2a827f4c79e7d84bc3b72752b74c2581f70"}, - {file = "tokenizers-0.20.0-cp38-none-win32.whl", hash = "sha256:55a393f893d2ed4dd95a1553c2e42d4d4086878266f437b03590d3f81984c4fe"}, - {file = "tokenizers-0.20.0-cp38-none-win_amd64.whl", hash = "sha256:30ffe33c5c2f2aab8e9a3340d0110dd9f7ace7eec7362e20a697802306bd8068"}, - {file = "tokenizers-0.20.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:aa2d4a6fed2a7e3f860c7fc9d48764bb30f2649d83915d66150d6340e06742b8"}, - {file = "tokenizers-0.20.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b5ef0f814084a897e9071fc4a868595f018c5c92889197bdc4bf19018769b148"}, - {file = "tokenizers-0.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc1e1b791e8c3bf4c4f265f180dadaff1c957bf27129e16fdd5e5d43c2d3762c"}, - {file = "tokenizers-0.20.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2b69e55e481459c07885263743a0d3c18d52db19bae8226a19bcca4aaa213fff"}, - {file = "tokenizers-0.20.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4806b4d82e27a2512bc23057b2986bc8b85824914286975b84d8105ff40d03d9"}, - {file = "tokenizers-0.20.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9859e9ef13adf5a473ccab39d31bff9c550606ae3c784bf772b40f615742a24f"}, - {file = "tokenizers-0.20.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ef703efedf4c20488a8eb17637b55973745b27997ff87bad88ed499b397d1144"}, - {file = "tokenizers-0.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6eec0061bab94b1841ab87d10831fdf1b48ebaed60e6d66d66dbe1d873f92bf5"}, - {file = "tokenizers-0.20.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:980f3d0d7e73f845b69087f29a63c11c7eb924c4ad6b358da60f3db4cf24bdb4"}, - {file = "tokenizers-0.20.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7c157550a2f3851b29d7fdc9dc059fcf81ff0c0fc49a1e5173a89d533ed043fa"}, - {file = "tokenizers-0.20.0-cp39-none-win32.whl", hash = "sha256:8a3d2f4d08608ec4f9895ec25b4b36a97f05812543190a5f2c3cd19e8f041e5a"}, - {file = "tokenizers-0.20.0-cp39-none-win_amd64.whl", hash = "sha256:d90188d12afd0c75e537f9a1d92f9c7375650188ee4f48fdc76f9e38afbd2251"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:d68e15f1815357b059ec266062340c343ea7f98f7f330602df81ffa3474b6122"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:23f9ecec637b9bc80da5f703808d29ed5329e56b5aa8d791d1088014f48afadc"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f830b318ee599e3d0665b3e325f85bc75ee2d2ca6285f52e439dc22b64691580"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3dc750def789cb1de1b5a37657919545e1d9ffa667658b3fa9cb7862407a1b8"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e26e6c755ae884c2ea6135cd215bdd0fccafe4ee62405014b8c3cd19954e3ab9"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:a1158c7174f427182e08baa2a8ded2940f2b4a3e94969a85cc9cfd16004cbcea"}, - {file = "tokenizers-0.20.0-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:6324826287a3fc198898d3dcf758fe4a8479e42d6039f4c59e2cedd3cf92f64e"}, - 
{file = "tokenizers-0.20.0-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7d8653149405bb0c16feaf9cfee327fdb6aaef9dc2998349fec686f35e81c4e2"}, - {file = "tokenizers-0.20.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8a2dc1e402a155e97309287ca085c80eb1b7fab8ae91527d3b729181639fa51"}, - {file = "tokenizers-0.20.0-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07bef67b20aa6e5f7868c42c7c5eae4d24f856274a464ae62e47a0f2cccec3da"}, - {file = "tokenizers-0.20.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da06e397182ff53789c506c7833220c192952c57e1581a53f503d8d953e2d67e"}, - {file = "tokenizers-0.20.0-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:302f7e11a14814028b7fc88c45a41f1bbe9b5b35fd76d6869558d1d1809baa43"}, - {file = "tokenizers-0.20.0-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:055ec46e807b875589dfbe3d9259f9a6ee43394fb553b03b3d1e9541662dbf25"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e3144b8acebfa6ae062e8f45f7ed52e4b50fb6c62f93afc8871b525ab9fdcab3"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:b52aa3fd14b2a07588c00a19f66511cff5cca8f7266ca3edcdd17f3512ad159f"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b8cf52779ffc5d4d63a0170fbeb512372bad0dd014ce92bbb9149756c831124"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:983a45dd11a876124378dae71d6d9761822199b68a4c73f32873d8cdaf326a5b"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df6b819c9a19831ebec581e71a7686a54ab45d90faf3842269a10c11d746de0c"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e738cfd80795fcafcef89c5731c84b05638a4ab3f412f97d5ed7765466576eb1"}, - {file = "tokenizers-0.20.0-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:c8842c7be2fadb9c9edcee233b1b7fe7ade406c99b0973f07439985c1c1d0683"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e47a82355511c373a4a430c4909dc1e518e00031207b1fec536c49127388886b"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:9afbf359004551179a5db19424180c81276682773cff2c5d002f6eaaffe17230"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a07eaa8799a92e6af6f472c21a75bf71575de2af3c0284120b7a09297c0de2f3"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0994b2e5fc53a301071806bc4303e4bc3bdc3f490e92a21338146a36746b0872"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b6466e0355b603d10e3cc3d282d350b646341b601e50969464a54939f9848d0"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:1e86594c2a433cb1ea09cfbe596454448c566e57ee8905bd557e489d93e89986"}, - {file = "tokenizers-0.20.0-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:3e14cdef1efa96ecead6ea64a891828432c3ebba128bdc0596e3059fea104ef3"}, - {file = "tokenizers-0.20.0.tar.gz", hash = "sha256:39d7acc43f564c274085cafcd1dae9d36f332456de1a31970296a6b8da4eac8d"}, + {file = "tokenizers-0.20.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = 
"sha256:439261da7c0a5c88bda97acb284d49fbdaf67e9d3b623c0bfd107512d22787a9"}, + {file = "tokenizers-0.20.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:03dae629d99068b1ea5416d50de0fea13008f04129cc79af77a2a6392792d93c"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b61f561f329ffe4b28367798b89d60c4abf3f815d37413b6352bc6412a359867"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ec870fce1ee5248a10be69f7a8408a234d6f2109f8ea827b4f7ecdbf08c9fd15"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d388d1ea8b7447da784e32e3b86a75cce55887e3b22b31c19d0b186b1c677800"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:299c85c1d21135bc01542237979bf25c32efa0d66595dd0069ae259b97fb2dbe"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e96f6c14c9752bb82145636b614d5a78e9cde95edfbe0a85dad0dd5ddd6ec95c"}, + {file = "tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc9e95ad49c932b80abfbfeaf63b155761e695ad9f8a58c52a47d962d76e310f"}, + {file = "tokenizers-0.20.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:f22dee205329a636148c325921c73cf3e412e87d31f4d9c3153b302a0200057b"}, + {file = "tokenizers-0.20.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2ffd9a8895575ac636d44500c66dffaef133823b6b25067604fa73bbc5ec09d"}, + {file = "tokenizers-0.20.1-cp310-none-win32.whl", hash = "sha256:2847843c53f445e0f19ea842a4e48b89dd0db4e62ba6e1e47a2749d6ec11f50d"}, + {file = "tokenizers-0.20.1-cp310-none-win_amd64.whl", hash = "sha256:f9aa93eacd865f2798b9e62f7ce4533cfff4f5fbd50c02926a78e81c74e432cd"}, + {file = "tokenizers-0.20.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4a717dcb08f2dabbf27ae4b6b20cbbb2ad7ed78ce05a829fae100ff4b3c7ff15"}, + {file = "tokenizers-0.20.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3f84dad1ff1863c648d80628b1b55353d16303431283e4efbb6ab1af56a75832"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:929c8f3afa16a5130a81ab5079c589226273ec618949cce79b46d96e59a84f61"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d10766473954397e2d370f215ebed1cc46dcf6fd3906a2a116aa1d6219bfedc3"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9300fac73ddc7e4b0330acbdda4efaabf74929a4a61e119a32a181f534a11b47"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ecaf7b0e39caeb1aa6dd6e0975c405716c82c1312b55ac4f716ef563a906969"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5170be9ec942f3d1d317817ced8d749b3e1202670865e4fd465e35d8c259de83"}, + {file = "tokenizers-0.20.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ef3f1ae08fa9aea5891cbd69df29913e11d3841798e0bfb1ff78b78e4e7ea0a4"}, + {file = "tokenizers-0.20.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ee86d4095d3542d73579e953c2e5e07d9321af2ffea6ecc097d16d538a2dea16"}, + {file = "tokenizers-0.20.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:86dcd08da163912e17b27bbaba5efdc71b4fbffb841530fdb74c5707f3c49216"}, + {file = "tokenizers-0.20.1-cp311-none-win32.whl", hash = 
"sha256:9af2dc4ee97d037bc6b05fa4429ddc87532c706316c5e11ce2f0596dfcfa77af"}, + {file = "tokenizers-0.20.1-cp311-none-win_amd64.whl", hash = "sha256:899152a78b095559c287b4c6d0099469573bb2055347bb8154db106651296f39"}, + {file = "tokenizers-0.20.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:407ab666b38e02228fa785e81f7cf79ef929f104bcccf68a64525a54a93ceac9"}, + {file = "tokenizers-0.20.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2f13a2d16032ebc8bd812eb8099b035ac65887d8f0c207261472803b9633cf3e"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e98eee4dca22849fbb56a80acaa899eec5b72055d79637dd6aa15d5e4b8628c9"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:47c1bcdd61e61136087459cb9e0b069ff23b5568b008265e5cbc927eae3387ce"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:128c1110e950534426e2274837fc06b118ab5f2fa61c3436e60e0aada0ccfd67"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2e2d47a819d2954f2c1cd0ad51bb58ffac6f53a872d5d82d65d79bf76b9896d"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bdd67a0e3503a9a7cf8bc5a4a49cdde5fa5bada09a51e4c7e1c73900297539bd"}, + {file = "tokenizers-0.20.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689b93d2e26d04da337ac407acec8b5d081d8d135e3e5066a88edd5bdb5aff89"}, + {file = "tokenizers-0.20.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0c6a796ddcd9a19ad13cf146997cd5895a421fe6aec8fd970d69f9117bddb45c"}, + {file = "tokenizers-0.20.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3ea919687aa7001a8ff1ba36ac64f165c4e89035f57998fa6cedcfd877be619d"}, + {file = "tokenizers-0.20.1-cp312-none-win32.whl", hash = "sha256:6d3ac5c1f48358ffe20086bf065e843c0d0a9fce0d7f0f45d5f2f9fba3609ca5"}, + {file = "tokenizers-0.20.1-cp312-none-win_amd64.whl", hash = "sha256:b0874481aea54a178f2bccc45aa2d0c99cd3f79143a0948af6a9a21dcc49173b"}, + {file = "tokenizers-0.20.1-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:96af92e833bd44760fb17f23f402e07a66339c1dcbe17d79a9b55bb0cc4f038e"}, + {file = "tokenizers-0.20.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:65f34e5b731a262dfa562820818533c38ce32a45864437f3d9c82f26c139ca7f"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17f98fccb5c12ab1ce1f471731a9cd86df5d4bd2cf2880c5a66b229802d96145"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b8c0fc3542cf9370bf92c932eb71bdeb33d2d4aeeb4126d9fd567b60bd04cb30"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4b39356df4575d37f9b187bb623aab5abb7b62c8cb702867a1768002f814800c"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfdad27b0e50544f6b838895a373db6114b85112ba5c0cefadffa78d6daae563"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:094663dd0e85ee2e573126918747bdb40044a848fde388efb5b09d57bc74c680"}, + {file = "tokenizers-0.20.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:14e4cf033a2aa207d7ac790e91adca598b679999710a632c4a494aab0fc3a1b2"}, + {file = "tokenizers-0.20.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = 
"sha256:9310951c92c9fb91660de0c19a923c432f110dbfad1a2d429fbc44fa956bf64f"}, + {file = "tokenizers-0.20.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:05e41e302c315bd2ed86c02e917bf03a6cf7d2f652c9cee1a0eb0d0f1ca0d32c"}, + {file = "tokenizers-0.20.1-cp37-none-win32.whl", hash = "sha256:212231ab7dfcdc879baf4892ca87c726259fa7c887e1688e3f3cead384d8c305"}, + {file = "tokenizers-0.20.1-cp37-none-win_amd64.whl", hash = "sha256:896195eb9dfdc85c8c052e29947169c1fcbe75a254c4b5792cdbd451587bce85"}, + {file = "tokenizers-0.20.1-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:741fb22788482d09d68e73ece1495cfc6d9b29a06c37b3df90564a9cfa688e6d"}, + {file = "tokenizers-0.20.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:10be14ebd8082086a342d969e17fc2d6edc856c59dbdbddd25f158fa40eaf043"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:514cf279b22fa1ae0bc08e143458c74ad3b56cd078b319464959685a35c53d5e"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a647c5b7cb896d6430cf3e01b4e9a2d77f719c84cefcef825d404830c2071da2"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7cdf379219e1e1dd432091058dab325a2e6235ebb23e0aec8d0508567c90cd01"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ba72260449e16c4c2f6f3252823b059fbf2d31b32617e582003f2b18b415c39"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:910b96ed87316e4277b23c7bcaf667ce849c7cc379a453fa179e7e09290eeb25"}, + {file = "tokenizers-0.20.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e53975a6694428a0586534cc1354b2408d4e010a3103117f617cbb550299797c"}, + {file = "tokenizers-0.20.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:07c4b7be58da142b0730cc4e5fd66bb7bf6f57f4986ddda73833cd39efef8a01"}, + {file = "tokenizers-0.20.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b605c540753e62199bf15cf69c333e934077ef2350262af2ccada46026f83d1c"}, + {file = "tokenizers-0.20.1-cp38-none-win32.whl", hash = "sha256:88b3bc76ab4db1ab95ead623d49c95205411e26302cf9f74203e762ac7e85685"}, + {file = "tokenizers-0.20.1-cp38-none-win_amd64.whl", hash = "sha256:d412a74cf5b3f68a90c615611a5aa4478bb303d1c65961d22db45001df68afcb"}, + {file = "tokenizers-0.20.1-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:a25dcb2f41a0a6aac31999e6c96a75e9152fa0127af8ece46c2f784f23b8197a"}, + {file = "tokenizers-0.20.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a12c3cebb8c92e9c35a23ab10d3852aee522f385c28d0b4fe48c0b7527d59762"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:02e18da58cf115b7c40de973609c35bde95856012ba42a41ee919c77935af251"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f326a1ac51ae909b9760e34671c26cd0dfe15662f447302a9d5bb2d872bab8ab"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0b4872647ea6f25224e2833b044b0b19084e39400e8ead3cfe751238b0802140"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce6238a3311bb8e4c15b12600927d35c267b92a52c881ef5717a900ca14793f7"}, + {file = "tokenizers-0.20.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57b7a8880b208866508b06ce365dc631e7a2472a3faa24daa430d046fb56c885"}, + {file = 
"tokenizers-0.20.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a908c69c2897a68f412aa05ba38bfa87a02980df70f5a72fa8490479308b1f2d"}, + {file = "tokenizers-0.20.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:da1001aa46f4490099c82e2facc4fbc06a6a32bf7de3918ba798010954b775e0"}, + {file = "tokenizers-0.20.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:42c097390e2f0ed0a5c5d569e6669dd4e9fff7b31c6a5ce6e9c66a61687197de"}, + {file = "tokenizers-0.20.1-cp39-none-win32.whl", hash = "sha256:3d4d218573a3d8b121a1f8c801029d70444ffb6d8f129d4cca1c7b672ee4a24c"}, + {file = "tokenizers-0.20.1-cp39-none-win_amd64.whl", hash = "sha256:37d1e6f616c84fceefa7c6484a01df05caf1e207669121c66213cb5b2911d653"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48689da7a395df41114f516208d6550e3e905e1239cc5ad386686d9358e9cef0"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:712f90ea33f9bd2586b4a90d697c26d56d0a22fd3c91104c5858c4b5b6489a79"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:359eceb6a620c965988fc559cebc0a98db26713758ec4df43fb76d41486a8ed5"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0d3caf244ce89d24c87545aafc3448be15870096e796c703a0d68547187192e1"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03b03cf8b9a32254b1bf8a305fb95c6daf1baae0c1f93b27f2b08c9759f41dee"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:218e5a3561561ea0f0ef1559c6d95b825308dbec23fb55b70b92589e7ff2e1e8"}, + {file = "tokenizers-0.20.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f40df5e0294a95131cc5f0e0eb91fe86d88837abfbee46b9b3610b09860195a7"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:08aaa0d72bb65058e8c4b0455f61b840b156c557e2aca57627056624c3a93976"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:998700177b45f70afeb206ad22c08d9e5f3a80639dae1032bf41e8cbc4dada4b"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62f7fbd3c2c38b179556d879edae442b45f68312019c3a6013e56c3947a4e648"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31e87fca4f6bbf5cc67481b562147fe932f73d5602734de7dd18a8f2eee9c6dd"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:956f21d359ae29dd51ca5726d2c9a44ffafa041c623f5aa33749da87cfa809b9"}, + {file = "tokenizers-0.20.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:1fbbaf17a393c78d8aedb6a334097c91cb4119a9ced4764ab8cfdc8d254dc9f9"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:ebe63e31f9c1a970c53866d814e35ec2ec26fda03097c486f82f3891cee60830"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:81970b80b8ac126910295f8aab2d7ef962009ea39e0d86d304769493f69aaa1e"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:130e35e76f9337ed6c31be386e75d4925ea807055acf18ca1a9b0eec03d8fe23"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:cd28a8614f5c82a54ab2463554e84ad79526c5184cf4573bbac2efbbbcead457"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9041ee665d0fa7f5c4ccf0f81f5e6b7087f797f85b143c094126fc2611fec9d0"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:62eb9daea2a2c06bcd8113a5824af8ef8ee7405d3a71123ba4d52c79bb3d9f1a"}, + {file = "tokenizers-0.20.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f861889707b54a9ab1204030b65fd6c22bdd4a95205deec7994dc22a8baa2ea4"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:89d5c337d74ea6e5e7dc8af124cf177be843bbb9ca6e58c01f75ea103c12c8a9"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:0b7f515c83397e73292accdbbbedc62264e070bae9682f06061e2ddce67cacaf"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e0305fc1ec6b1e5052d30d9c1d5c807081a7bd0cae46a33d03117082e91908c"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5dc611e6ac0fa00a41de19c3bf6391a05ea201d2d22b757d63f5491ec0e67faa"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5ffe0d7f7bfcfa3b2585776ecf11da2e01c317027c8573c78ebcb8985279e23"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:e7edb8ec12c100d5458d15b1e47c0eb30ad606a05641f19af7563bc3d1608c14"}, + {file = "tokenizers-0.20.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:de291633fb9303555793cc544d4a86e858da529b7d0b752bcaf721ae1d74b2c9"}, + {file = "tokenizers-0.20.1.tar.gz", hash = "sha256:84edcc7cdeeee45ceedb65d518fffb77aec69311c9c8e30f77ad84da3025f002"}, ] [package.dependencies] @@ -3238,42 +3382,39 @@ testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests", "ruff"] [[package]] name = "tomli" -version = "2.0.1" +version = "2.0.2" description = "A lil' TOML parser" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, + {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"}, + {file = "tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"}, ] [[package]] name = "torch" -version = "2.4.1" +version = "2.5.0" description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" optional = true python-versions = ">=3.8.0" files = [ - {file = "torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:362f82e23a4cd46341daabb76fba08f04cd646df9bfaf5da50af97cb60ca4971"}, - {file = "torch-2.4.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:e8ac1985c3ff0f60d85b991954cfc2cc25f79c84545aead422763148ed2759e3"}, - {file = "torch-2.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:91e326e2ccfb1496e3bee58f70ef605aeb27bd26be07ba64f37dcaac3d070ada"}, - {file = "torch-2.4.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:d36a8ef100f5bff3e9c3cea934b9e0d7ea277cb8210c7152d34a9a6c5830eadd"}, - {file = "torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:0b5f88afdfa05a335d80351e3cea57d38e578c8689f751d35e0ff36bce872113"}, - 
{file = "torch-2.4.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:ef503165f2341942bfdf2bd520152f19540d0c0e34961232f134dc59ad435be8"}, - {file = "torch-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:092e7c2280c860eff762ac08c4bdcd53d701677851670695e0c22d6d345b269c"}, - {file = "torch-2.4.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:ddddbd8b066e743934a4200b3d54267a46db02106876d21cf31f7da7a96f98ea"}, - {file = "torch-2.4.1-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:fdc4fe11db3eb93c1115d3e973a27ac7c1a8318af8934ffa36b0370efe28e042"}, - {file = "torch-2.4.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:18835374f599207a9e82c262153c20ddf42ea49bc76b6eadad8e5f49729f6e4d"}, - {file = "torch-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:ebea70ff30544fc021d441ce6b219a88b67524f01170b1c538d7d3ebb5e7f56c"}, - {file = "torch-2.4.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:72b484d5b6cec1a735bf3fa5a1c4883d01748698c5e9cfdbeb4ffab7c7987e0d"}, - {file = "torch-2.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:c99e1db4bf0c5347107845d715b4aa1097e601bdc36343d758963055e9599d93"}, - {file = "torch-2.4.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:b57f07e92858db78c5b72857b4f0b33a65b00dc5d68e7948a8494b0314efb880"}, - {file = "torch-2.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:f18197f3f7c15cde2115892b64f17c80dbf01ed72b008020e7da339902742cf6"}, - {file = "torch-2.4.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:5fc1d4d7ed265ef853579caf272686d1ed87cebdcd04f2a498f800ffc53dab71"}, - {file = "torch-2.4.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:40f6d3fe3bae74efcf08cb7f8295eaddd8a838ce89e9d26929d4edd6d5e4329d"}, - {file = "torch-2.4.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:c9299c16c9743001ecef515536ac45900247f4338ecdf70746f2461f9e4831db"}, - {file = "torch-2.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:6bce130f2cd2d52ba4e2c6ada461808de7e5eccbac692525337cfb4c19421846"}, - {file = "torch-2.4.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:a38de2803ee6050309aac032676536c3d3b6a9804248537e38e098d0e14817ec"}, + {file = "torch-2.5.0-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:7f179373a047b947dec448243f4e6598a1c960fa3bb978a9a7eecd529fbc363f"}, + {file = "torch-2.5.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:15fbc95e38d330e5b0ef1593b7bc0a19f30e5bdad76895a5cffa1a6a044235e9"}, + {file = "torch-2.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:f499212f1cffea5d587e5f06144630ed9aa9c399bba12ec8905798d833bd1404"}, + {file = "torch-2.5.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:c54db1fade17287aabbeed685d8e8ab3a56fea9dd8d46e71ced2da367f09a49f"}, + {file = "torch-2.5.0-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:499a68a756d3b30d10f7e0f6214dc3767b130b797265db3b1c02e9094e2a07be"}, + {file = "torch-2.5.0-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:9f3df8138a1126a851440b7d5a4869bfb7c9cc43563d64fd9d96d0465b581024"}, + {file = "torch-2.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:b81da3bdb58c9de29d0e1361e52f12fcf10a89673f17a11a5c6c7da1cb1a8376"}, + {file = "torch-2.5.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:ba135923295d564355326dc409b6b7f5bd6edc80f764cdaef1fb0a1b23ff2f9c"}, + {file = "torch-2.5.0-cp312-cp312-manylinux1_x86_64.whl", hash = "sha256:2dd40c885a05ef7fe29356cca81be1435a893096ceb984441d6e2c27aff8c6f4"}, + {file = "torch-2.5.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:bc52d603d87fe1da24439c0d5fdbbb14e0ae4874451d53f0120ffb1f6c192727"}, + {file = 
"torch-2.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:ea718746469246cc63b3353afd75698a288344adb55e29b7f814a5d3c0a7c78d"}, + {file = "torch-2.5.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:6de1fd253e27e7f01f05cd7c37929ae521ca23ca4620cfc7c485299941679112"}, + {file = "torch-2.5.0-cp313-cp313-manylinux1_x86_64.whl", hash = "sha256:83dcf518685db20912b71fc49cbddcc8849438cdb0e9dcc919b02a849e2cd9e8"}, + {file = "torch-2.5.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:65e0a60894435608334d68c8811e55fd8f73e5bf8ee6f9ccedb0064486a7b418"}, + {file = "torch-2.5.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:38c21ff1bd39f076d72ab06e3c88c2ea6874f2e6f235c9450816b6c8e7627094"}, + {file = "torch-2.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:ce4baeba9804da5a346e210b3b70826f5811330c343e4fe1582200359ee77fe5"}, + {file = "torch-2.5.0-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:03e53f577a96e4d41aca472da8faa40e55df89d2273664af390ce1f570e885bd"}, ] [package.dependencies] @@ -3281,25 +3422,26 @@ filelock = "*" fsspec = "*" jinja2 = "*" networkx = "*" -nvidia-cublas-cu12 = {version = "12.1.3.1", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-cupti-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-nvrtc-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cuda-runtime-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cublas-cu12 = {version = "12.4.5.8", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-cupti-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-nvrtc-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cuda-runtime-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} nvidia-cudnn-cu12 = {version = "9.1.0.70", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cufft-cu12 = {version = "11.0.2.54", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-curand-cu12 = {version = "10.3.2.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusolver-cu12 = {version = "11.4.5.107", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-cusparse-cu12 = {version = "12.1.0.106", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nccl-cu12 = {version = "2.20.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -nvidia-nvtx-cu12 = {version = "12.1.105", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} -setuptools = "*" -sympy = "*" -triton = {version = "3.0.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""} +nvidia-cufft-cu12 = {version = "11.2.1.3", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-curand-cu12 = {version = "10.3.5.147", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusolver-cu12 = {version = "11.6.1.9", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-cusparse-cu12 = {version = "12.3.1.170", markers = "platform_system == \"Linux\" 
and platform_machine == \"x86_64\""} +nvidia-nccl-cu12 = {version = "2.21.5", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nvjitlink-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +nvidia-nvtx-cu12 = {version = "12.4.127", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\""} +setuptools = {version = "*", markers = "python_version >= \"3.12\""} +sympy = {version = "1.13.1", markers = "python_version >= \"3.9\""} +triton = {version = "3.1.0", markers = "platform_system == \"Linux\" and platform_machine == \"x86_64\" and python_version < \"3.13\""} typing-extensions = ">=4.8.0" [package.extras] opt-einsum = ["opt-einsum (>=3.3)"] -optree = ["optree (>=0.11.0)"] +optree = ["optree (>=0.12.0)"] [[package]] name = "tqdm" @@ -3323,13 +3465,13 @@ telegram = ["requests"] [[package]] name = "transformers" -version = "4.45.0" +version = "4.45.2" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.8.0" files = [ - {file = "transformers-4.45.0-py3-none-any.whl", hash = "sha256:f04a82926676056afb3bbf4df7d76ceb1fc2b2746247a87f3f9be4674adc95d7"}, - {file = "transformers-4.45.0.tar.gz", hash = "sha256:29629f87965acc7b15e458a24580832d85da18ddc119410211747fe778c200ce"}, + {file = "transformers-4.45.2-py3-none-any.whl", hash = "sha256:c551b33660cfc815bae1f9f097ecfd1e65be623f13c6ee0dda372bd881460210"}, + {file = "transformers-4.45.2.tar.gz", hash = "sha256:72bc390f6b203892561f05f86bbfaa0e234aab8e927a83e62b9d92ea7e3ae101"}, ] [package.dependencies] @@ -3392,16 +3534,16 @@ vision = ["Pillow (>=10.0.1,<=15.0)"] [[package]] name = "triton" -version = "3.0.0" +version = "3.1.0" description = "A language and compiler for custom Deep Learning operations" optional = true python-versions = "*" files = [ - {file = "triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e1efef76935b2febc365bfadf74bcb65a6f959a9872e5bddf44cc9e0adce1e1a"}, - {file = "triton-3.0.0-1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5ce8520437c602fb633f1324cc3871c47bee3b67acf9756c1a66309b60e3216c"}, - {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, - {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, - {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, + {file = "triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b0dd10a925263abbe9fa37dcde67a5e9b2383fc269fdf59f5657cac38c5d1d8"}, + {file = "triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f34f6e7885d1bf0eaaf7ba875a5f0ce6f3c13ba98f9503651c1e6dc6757ed5c"}, + {file = "triton-3.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8182f42fd8080a7d39d666814fa36c5e30cc00ea7eeeb1a2983dbb4c99a0fdc"}, + {file = "triton-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dadaca7fc24de34e180271b5cf864c16755702e9f63a16f62df714a8099126a"}, + {file = "triton-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:aafa9a20cd0d9fee523cd4504aa7131807a864cd77dcf6efe7e981f18b8c6c11"}, ] [package.dependencies] @@ -3698,108 +3840,99 @@ files = [ [[package]] name = "yarl" -version = "1.12.1" +version = "1.16.0" description = "Yet another URL library" optional = true -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "yarl-1.12.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:64c5b0f2b937fe40d0967516eee5504b23cb247b8b7ffeba7213a467d9646fdc"}, - {file = "yarl-1.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2e430ac432f969ef21770645743611c1618362309e3ad7cab45acd1ad1a540ff"}, - {file = "yarl-1.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3e26e64f42bce5ddf9002092b2c37b13071c2e6413d5c05f9fa9de58ed2f7749"}, - {file = "yarl-1.12.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0103c52f8dfe5d573c856322149ddcd6d28f51b4d4a3ee5c4b3c1b0a05c3d034"}, - {file = "yarl-1.12.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b63465b53baeaf2122a337d4ab57d6bbdd09fcadceb17a974cfa8a0300ad9c67"}, - {file = "yarl-1.12.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17d4dc4ff47893a06737b8788ed2ba2f5ac4e8bb40281c8603920f7d011d5bdd"}, - {file = "yarl-1.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a8b54949267bd5704324397efe9fbb6aa306466dee067550964e994d309db5f1"}, - {file = "yarl-1.12.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:10b690cd78cbaca2f96a7462f303fdd2b596d3978b49892e4b05a7567c591572"}, - {file = "yarl-1.12.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:c85ab016e96a975afbdb9d49ca90f3bca9920ef27c64300843fe91c3d59d8d20"}, - {file = "yarl-1.12.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c1caa5763d1770216596e0a71b5567f27aac28c95992110212c108ec74589a48"}, - {file = "yarl-1.12.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:595bbcdbfc4a9c6989d7489dca8510cba053ff46b16c84ffd95ac8e90711d419"}, - {file = "yarl-1.12.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e64f0421892a207d3780903085c1b04efeb53b16803b23d947de5a7261b71355"}, - {file = "yarl-1.12.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:319c206e83e46ec2421b25b300c8482b6fe8a018baca246be308c736d9dab267"}, - {file = "yarl-1.12.1-cp310-cp310-win32.whl", hash = "sha256:da045bd1147d12bd43fb032296640a7cc17a7f2eaba67495988362e99db24fd2"}, - {file = "yarl-1.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:aebbd47df77190ada603157f0b3670d578c110c31746ecc5875c394fdcc59a99"}, - {file = "yarl-1.12.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:28389a68981676bf74e2e199fe42f35d1aa27a9c98e3a03e6f58d2d3d054afe1"}, - {file = "yarl-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f736f54565f8dd7e3ab664fef2bc461d7593a389a7f28d4904af8d55a91bd55f"}, - {file = "yarl-1.12.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6dee0496d5f1a8f57f0f28a16f81a2033fc057a2cf9cd710742d11828f8c80e2"}, - {file = "yarl-1.12.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8981a94a27ac520a398302afb74ae2c0be1c3d2d215c75c582186a006c9e7b0"}, - {file = "yarl-1.12.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ff54340fc1129e8e181827e2234af3ff659b4f17d9bbe77f43bc19e6577fadec"}, - {file = "yarl-1.12.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:54c8cee662b5f8c30ad7eedfc26123f845f007798e4ff1001d9528fe959fd23c"}, - {file = 
"yarl-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e97a29b37830ba1262d8dfd48ddb5b28ad4d3ebecc5d93a9c7591d98641ec737"}, - {file = "yarl-1.12.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6c89894cc6f6ddd993813e79244b36b215c14f65f9e4f1660b1f2ba9e5594b95"}, - {file = "yarl-1.12.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:712ba8722c0699daf186de089ddc4677651eb9875ed7447b2ad50697522cbdd9"}, - {file = "yarl-1.12.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6e9a9f50892153bad5046c2a6df153224aa6f0573a5a8ab44fc54a1e886f6e21"}, - {file = "yarl-1.12.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1d4017e78fb22bc797c089b746230ad78ecd3cdb215bc0bd61cb72b5867da57e"}, - {file = "yarl-1.12.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:f494c01b28645c431239863cb17af8b8d15b93b0d697a0320d5dd34cd9d7c2fa"}, - {file = "yarl-1.12.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:de4544b1fb29cf14870c4e2b8a897c0242449f5dcebd3e0366aa0aa3cf58a23a"}, - {file = "yarl-1.12.1-cp311-cp311-win32.whl", hash = "sha256:7564525a4673fde53dee7d4c307a961c0951918f0b8c7f09b2c9e02067cf6504"}, - {file = "yarl-1.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:f23bb1a7a6e8e8b612a164fdd08e683bcc16c76f928d6dbb7bdbee2374fbfee6"}, - {file = "yarl-1.12.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a3e2aff8b822ab0e0bdbed9f50494b3a35629c4b9488ae391659973a37a9f53f"}, - {file = "yarl-1.12.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:22dda2799c8d39041d731e02bf7690f0ef34f1691d9ac9dfcb98dd1e94c8b058"}, - {file = "yarl-1.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:18c2a7757561f05439c243f517dbbb174cadfae3a72dee4ae7c693f5b336570f"}, - {file = "yarl-1.12.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:835010cc17d0020e7931d39e487d72c8e01c98e669b6896a8b8c9aa8ca69a949"}, - {file = "yarl-1.12.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e2254fe137c4a360b0a13173a56444f756252c9283ba4d267ca8e9081cd140ea"}, - {file = "yarl-1.12.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f6a071d2c3d39b4104f94fc08ab349e9b19b951ad4b8e3b6d7ea92d6ef7ccaf8"}, - {file = "yarl-1.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73a183042ae0918c82ce2df38c3db2409b0eeae88e3afdfc80fb67471a95b33b"}, - {file = "yarl-1.12.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:326b8a079a9afcac0575971e56dabdf7abb2ea89a893e6949b77adfeb058b50e"}, - {file = "yarl-1.12.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:126309c0f52a2219b3d1048aca00766429a1346596b186d51d9fa5d2070b7b13"}, - {file = "yarl-1.12.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:ba1c779b45a399cc25f511c681016626f69e51e45b9d350d7581998722825af9"}, - {file = "yarl-1.12.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:af1107299cef049ad00a93df4809517be432283a0847bcae48343ebe5ea340dc"}, - {file = "yarl-1.12.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:20d817c0893191b2ab0ba30b45b77761e8dfec30a029b7c7063055ca71157f84"}, - {file = "yarl-1.12.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d4f818f6371970d6a5d1e42878389bbfb69dcde631e4bbac5ec1cb11158565ca"}, - {file = "yarl-1.12.1-cp312-cp312-win32.whl", hash = "sha256:0ac33d22b2604b020569a82d5f8a03ba637ba42cc1adf31f616af70baf81710b"}, - {file = "yarl-1.12.1-cp312-cp312-win_amd64.whl", hash 
= "sha256:fd24996e12e1ba7c397c44be75ca299da14cde34d74bc5508cce233676cc68d0"}, - {file = "yarl-1.12.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dea360778e0668a7ad25d7727d03364de8a45bfd5d808f81253516b9f2217765"}, - {file = "yarl-1.12.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1f50a37aeeb5179d293465e522fd686080928c4d89e0ff215e1f963405ec4def"}, - {file = "yarl-1.12.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0274b1b7a9c9c32b7bf250583e673ff99fb9fccb389215841e2652d9982de740"}, - {file = "yarl-1.12.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4f3ab9eb8ab2d585ece959c48d234f7b39ac0ca1954a34d8b8e58a52064bdb3"}, - {file = "yarl-1.12.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8d31dd0245d88cf7239e96e8f2a99f815b06e458a5854150f8e6f0e61618d41b"}, - {file = "yarl-1.12.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a96198d5d26f40557d986c1253bfe0e02d18c9d9b93cf389daf1a3c9f7c755fa"}, - {file = "yarl-1.12.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddae504cfb556fe220efae65e35be63cd11e3c314b202723fc2119ce19f0ca2e"}, - {file = "yarl-1.12.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bce00f3b1f7f644faae89677ca68645ed5365f1c7f874fdd5ebf730a69640d38"}, - {file = "yarl-1.12.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eee5ff934b0c9f4537ff9596169d56cab1890918004791a7a06b879b3ba2a7ef"}, - {file = "yarl-1.12.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4ea99e64b2ad2635e0f0597b63f5ea6c374791ff2fa81cdd4bad8ed9f047f56f"}, - {file = "yarl-1.12.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:5c667b383529520b8dd6bd496fc318678320cb2a6062fdfe6d3618da6b8790f6"}, - {file = "yarl-1.12.1-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d920401941cb898ef089422e889759dd403309eb370d0e54f1bdf6ca07fef603"}, - {file = "yarl-1.12.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:501a1576716032cc6d48c7c47bcdc42d682273415a8f2908e7e72cb4625801f3"}, - {file = "yarl-1.12.1-cp313-cp313-win32.whl", hash = "sha256:24416bb5e221e29ddf8aac5b97e94e635ca2c5be44a1617ad6fe32556df44294"}, - {file = "yarl-1.12.1-cp313-cp313-win_amd64.whl", hash = "sha256:71af3766bb46738d12cc288d9b8de7ef6f79c31fd62757e2b8a505fe3680b27f"}, - {file = "yarl-1.12.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c924deab8105f86980983eced740433fb7554a7f66db73991affa4eda99d5402"}, - {file = "yarl-1.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5fb475a4cdde582c9528bb412b98f899680492daaba318231e96f1a0a1bb0d53"}, - {file = "yarl-1.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:36ee0115b9edca904153a66bb74a9ff1ce38caff015de94eadfb9ba8e6ecd317"}, - {file = "yarl-1.12.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2631c9d7386bd2d4ce24ecc6ebf9ae90b3efd713d588d90504eaa77fec4dba01"}, - {file = "yarl-1.12.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2376d8cf506dffd0e5f2391025ae8675b09711016656590cb03b55894161fcfa"}, - {file = "yarl-1.12.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:24197ba3114cc85ddd4091e19b2ddc62650f2e4a899e51b074dfd52d56cf8c72"}, - {file = "yarl-1.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfdf419bf5d3644f94cd7052954fc233522f5a1b371fc0b00219ebd9c14d5798"}, - {file = 
"yarl-1.12.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8112f640a4f7e7bf59f7cabf0d47a29b8977528c521d73a64d5cc9e99e48a174"}, - {file = "yarl-1.12.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:607d12f0901f6419a8adceb139847c42c83864b85371f58270e42753f9780fa6"}, - {file = "yarl-1.12.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:664380c7ed524a280b6a2d5d9126389c3e96cd6e88986cdb42ca72baa27421d6"}, - {file = "yarl-1.12.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:0d0a5e87bc48d76dfcfc16295201e9812d5f33d55b4a0b7cad1025b92bf8b91b"}, - {file = "yarl-1.12.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:eff6bac402719c14e17efe845d6b98593c56c843aca6def72080fbede755fd1f"}, - {file = "yarl-1.12.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:22839d1d1eab9e4b427828a88a22beb86f67c14d8ff81175505f1cc8493f3500"}, - {file = "yarl-1.12.1-cp38-cp38-win32.whl", hash = "sha256:717f185086bb9d817d4537dd18d5df5d657598cd00e6fc22e4d54d84de266c1d"}, - {file = "yarl-1.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:71978ba778948760cff528235c951ea0ef7a4f9c84ac5a49975f8540f76c3f73"}, - {file = "yarl-1.12.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:30ffc046ebddccb3c4cac72c1a3e1bc343492336f3ca86d24672e90ccc5e788a"}, - {file = "yarl-1.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f10954b233d4df5cc3137ffa5ced97f8894152df817e5d149bf05a0ef2ab8134"}, - {file = "yarl-1.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2e912b282466444023610e4498e3795c10e7cfd641744524876239fcf01d538d"}, - {file = "yarl-1.12.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6af871f70cfd5b528bd322c65793b5fd5659858cdfaa35fbe563fb99b667ed1f"}, - {file = "yarl-1.12.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c3e4e1f7b08d1ec6b685ccd3e2d762219c550164fbf524498532e39f9413436e"}, - {file = "yarl-1.12.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9a7ee79183f0b17dcede8b6723e7da2ded529cf159a878214be9a5d3098f5b1e"}, - {file = "yarl-1.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96c8ff1e1dd680e38af0887927cab407a4e51d84a5f02ae3d6eb87233036c763"}, - {file = "yarl-1.12.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e9905fc2dc1319e4c39837b906a024cf71b1261cc66b0cd89678f779c0c61f5"}, - {file = "yarl-1.12.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:01549468858b87d36f967c97d02e6e54106f444aeb947ed76f8f71f85ed07cec"}, - {file = "yarl-1.12.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:96b34830bd6825ca0220bf005ea99ac83eb9ce51301ddb882dcf613ae6cd95fb"}, - {file = "yarl-1.12.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:2aee7594d2c2221c717a8e394bbed4740029df4c0211ceb0f04815686e99c795"}, - {file = "yarl-1.12.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:15871130439ad10abb25a4631120d60391aa762b85fcab971411e556247210a0"}, - {file = "yarl-1.12.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:838dde2cb570cfbb4cab8a876a0974e8b90973ea40b3ac27a79b8a74c8a2db15"}, - {file = "yarl-1.12.1-cp39-cp39-win32.whl", hash = "sha256:eacbcf30efaca7dc5cb264228ffecdb95fdb1e715b1ec937c0ce6b734161e0c8"}, - {file = "yarl-1.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:76a59d1b63de859398bc7764c860a769499511463c1232155061fe0147f13e01"}, - {file = "yarl-1.12.1-py3-none-any.whl", hash = "sha256:dc3192a81ecd5ff954cecd690327badd5a84d00b877e1573f7c9097ce13e5bfb"}, - {file = 
"yarl-1.12.1.tar.gz", hash = "sha256:5b860055199aec8d6fe4dcee3c5196ce506ca198a50aab0059ffd26e8e815828"}, + {file = "yarl-1.16.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:32468f41242d72b87ab793a86d92f885355bcf35b3355aa650bfa846a5c60058"}, + {file = "yarl-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:234f3a3032b505b90e65b5bc6652c2329ea7ea8855d8de61e1642b74b4ee65d2"}, + {file = "yarl-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8a0296040e5cddf074c7f5af4a60f3fc42c0237440df7bcf5183be5f6c802ed5"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de6c14dd7c7c0badba48157474ea1f03ebee991530ba742d381b28d4f314d6f3"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b140e532fe0266003c936d017c1ac301e72ee4a3fd51784574c05f53718a55d8"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:019f5d58093402aa8f6661e60fd82a28746ad6d156f6c5336a70a39bd7b162b9"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c42998fd1cbeb53cd985bff0e4bc25fbe55fd6eb3a545a724c1012d69d5ec84"}, + {file = "yarl-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c7c30fb38c300fe8140df30a046a01769105e4cf4282567a29b5cdb635b66c4"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e49e0fd86c295e743fd5be69b8b0712f70a686bc79a16e5268386c2defacaade"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:b9ca7b9147eb1365c8bab03c003baa1300599575effad765e0b07dd3501ea9af"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:27e11db3f1e6a51081a981509f75617b09810529de508a181319193d320bc5c7"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8994c42f4ca25df5380ddf59f315c518c81df6a68fed5bb0c159c6cb6b92f120"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:542fa8e09a581bcdcbb30607c7224beff3fdfb598c798ccd28a8184ffc18b7eb"}, + {file = "yarl-1.16.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2bd6a51010c7284d191b79d3b56e51a87d8e1c03b0902362945f15c3d50ed46b"}, + {file = "yarl-1.16.0-cp310-cp310-win32.whl", hash = "sha256:178ccb856e265174a79f59721031060f885aca428983e75c06f78aa24b91d929"}, + {file = "yarl-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:fe8bba2545427418efc1929c5c42852bdb4143eb8d0a46b09de88d1fe99258e7"}, + {file = "yarl-1.16.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d8643975a0080f361639787415a038bfc32d29208a4bf6b783ab3075a20b1ef3"}, + {file = "yarl-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:676d96bafc8c2d0039cea0cd3fd44cee7aa88b8185551a2bb93354668e8315c2"}, + {file = "yarl-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d9525f03269e64310416dbe6c68d3b23e5d34aaa8f47193a1c45ac568cecbc49"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b37d5ec034e668b22cf0ce1074d6c21fd2a08b90d11b1b73139b750a8b0dd97"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4f32c4cb7386b41936894685f6e093c8dfaf0960124d91fe0ec29fe439e201d0"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b8e265a0545637492a7e12fd7038370d66c9375a61d88c5567d0e044ded9202"}, + {file = 
"yarl-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:789a3423f28a5fff46fbd04e339863c169ece97c827b44de16e1a7a42bc915d2"}, + {file = "yarl-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1d1f45e3e8d37c804dca99ab3cf4ab3ed2e7a62cd82542924b14c0a4f46d243"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:621280719c4c5dad4c1391160a9b88925bb8b0ff6a7d5af3224643024871675f"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ed097b26f18a1f5ff05f661dc36528c5f6735ba4ce8c9645e83b064665131349"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:2f1fe2b2e3ee418862f5ebc0c0083c97f6f6625781382f828f6d4e9b614eba9b"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:87dd10bc0618991c66cee0cc65fa74a45f4ecb13bceec3c62d78ad2e42b27a16"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:4199db024b58a8abb2cfcedac7b1292c3ad421684571aeb622a02f242280e8d6"}, + {file = "yarl-1.16.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:99a9dcd4b71dd5f5f949737ab3f356cfc058c709b4f49833aeffedc2652dac56"}, + {file = "yarl-1.16.0-cp311-cp311-win32.whl", hash = "sha256:a9394c65ae0ed95679717d391c862dece9afacd8fa311683fc8b4362ce8a410c"}, + {file = "yarl-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:5b9101f528ae0f8f65ac9d64dda2bb0627de8a50344b2f582779f32fda747c1d"}, + {file = "yarl-1.16.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:4ffb7c129707dd76ced0a4a4128ff452cecf0b0e929f2668ea05a371d9e5c104"}, + {file = "yarl-1.16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1a5e9d8ce1185723419c487758d81ac2bde693711947032cce600ca7c9cda7d6"}, + {file = "yarl-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d743e3118b2640cef7768ea955378c3536482d95550222f908f392167fe62059"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:26768342f256e6e3c37533bf9433f5f15f3e59e3c14b2409098291b3efaceacb"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1b0796168b953bca6600c5f97f5ed407479889a36ad7d17183366260f29a6b9"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:858728086914f3a407aa7979cab743bbda1fe2bdf39ffcd991469a370dd7414d"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5570e6d47bcb03215baf4c9ad7bf7c013e56285d9d35013541f9ac2b372593e7"}, + {file = "yarl-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:66ea8311422a7ba1fc79b4c42c2baa10566469fe5a78500d4e7754d6e6db8724"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:649bddcedee692ee8a9b7b6e38582cb4062dc4253de9711568e5620d8707c2a3"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:3a91654adb7643cb21b46f04244c5a315a440dcad63213033826549fa2435f71"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b439cae82034ade094526a8f692b9a2b5ee936452de5e4c5f0f6c48df23f8604"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:571f781ae8ac463ce30bacebfaef2c6581543776d5970b2372fbe31d7bf31a07"}, + {file = "yarl-1.16.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:aa7943f04f36d6cafc0cf53ea89824ac2c37acbdb4b316a654176ab8ffd0f968"}, + {file = 
"yarl-1.16.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1a5cf32539373ff39d97723e39a9283a7277cbf1224f7aef0c56c9598b6486c3"}, + {file = "yarl-1.16.0-cp312-cp312-win32.whl", hash = "sha256:a5b6c09b9b4253d6a208b0f4a2f9206e511ec68dce9198e0fbec4f160137aa67"}, + {file = "yarl-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:1208ca14eed2fda324042adf8d6c0adf4a31522fa95e0929027cd487875f0240"}, + {file = "yarl-1.16.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5ace0177520bd4caa99295a9b6fb831d0e9a57d8e0501a22ffaa61b4c024283"}, + {file = "yarl-1.16.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7118bdb5e3ed81acaa2095cba7ec02a0fe74b52a16ab9f9ac8e28e53ee299732"}, + {file = "yarl-1.16.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38fec8a2a94c58bd47c9a50a45d321ab2285ad133adefbbadf3012c054b7e656"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8791d66d81ee45866a7bb15a517b01a2bcf583a18ebf5d72a84e6064c417e64b"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1cf936ba67bc6c734f3aa1c01391da74ab7fc046a9f8bbfa230b8393b90cf472"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d1aab176dd55b59f77a63b27cffaca67d29987d91a5b615cbead41331e6b7428"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:995d0759004c08abd5d1b81300a91d18c8577c6389300bed1c7c11675105a44d"}, + {file = "yarl-1.16.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1bc22e00edeb068f71967ab99081e9406cd56dbed864fc3a8259442999d71552"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:35b4f7842154176523e0a63c9b871168c69b98065d05a4f637fce342a6a2693a"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:7ace71c4b7a0c41f317ae24be62bb61e9d80838d38acb20e70697c625e71f120"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8f639e3f5795a6568aa4f7d2ac6057c757dcd187593679f035adbf12b892bb00"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:e8be3aff14f0120ad049121322b107f8a759be76a6a62138322d4c8a337a9e2c"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:122d8e7986043d0549e9eb23c7fd23be078be4b70c9eb42a20052b3d3149c6f2"}, + {file = "yarl-1.16.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0fd9c227990f609c165f56b46107d0bc34553fe0387818c42c02f77974402c36"}, + {file = "yarl-1.16.0-cp313-cp313-win32.whl", hash = "sha256:595ca5e943baed31d56b33b34736461a371c6ea0038d3baec399949dd628560b"}, + {file = "yarl-1.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:921b81b8d78f0e60242fb3db615ea3f368827a76af095d5a69f1c3366db3f596"}, + {file = "yarl-1.16.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ab2b2ac232110a1fdb0d3ffcd087783edd3d4a6ced432a1bf75caf7b7be70916"}, + {file = "yarl-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7f8713717a09acbfee7c47bfc5777e685539fefdd34fa72faf504c8be2f3df4e"}, + {file = "yarl-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:cdcffe1dbcb4477d2b4202f63cd972d5baa155ff5a3d9e35801c46a415b7f71a"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a91217208306d82357c67daeef5162a41a28c8352dab7e16daa82e3718852a7"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:3ab3ed42c78275477ea8e917491365e9a9b69bb615cb46169020bd0aa5e2d6d3"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:707ae579ccb3262dfaef093e202b4c3fb23c3810e8df544b1111bd2401fd7b09"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad7a852d1cd0b8d8b37fc9d7f8581152add917a98cfe2ea6e241878795f917ae"}, + {file = "yarl-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3f1cc3d3d4dc574bebc9b387f6875e228ace5748a7c24f49d8f01ac1bc6c31b"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5ff96da263740779b0893d02b718293cc03400c3a208fc8d8cd79d9b0993e532"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:3d375a19ba2bfe320b6d873f3fb165313b002cef8b7cc0a368ad8b8a57453837"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:62c7da0ad93a07da048b500514ca47b759459ec41924143e2ddb5d7e20fd3db5"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:147b0fcd0ee33b4b5f6edfea80452d80e419e51b9a3f7a96ce98eaee145c1581"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:504e1fe1cc4f170195320eb033d2b0ccf5c6114ce5bf2f617535c01699479bca"}, + {file = "yarl-1.16.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:bdcf667a5dec12a48f669e485d70c54189f0639c2157b538a4cffd24a853624f"}, + {file = "yarl-1.16.0-cp39-cp39-win32.whl", hash = "sha256:e9951afe6557c75a71045148890052cb942689ee4c9ec29f5436240e1fcc73b7"}, + {file = "yarl-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:7d7aaa8ff95d0840e289423e7dc35696c2b058d635f945bf05b5cd633146b027"}, + {file = "yarl-1.16.0-py3-none-any.whl", hash = "sha256:e6980a558d8461230c457218bd6c92dfc1d10205548215c2c21d79dc8d0a96f3"}, + {file = "yarl-1.16.0.tar.gz", hash = "sha256:b6f687ced5510a9a2474bbae96a4352e5ace5fa34dc44a217b0537fec1db00b4"}, ] [package.dependencies] idna = ">=2.0" multidict = ">=4.0" +propcache = ">=0.2.0" [[package]] name = "zipp" @@ -3833,4 +3966,4 @@ torch = ["torch"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "be5965e4e13fed32347983dba1690661f3e59a50d59fa26d4f2a8345418dd5a1" +content-hash = "500fa44255e4a6c89a16314a931548447afe1ba71ea341a73cad6670e46ddac7" diff --git a/server/pyproject.toml b/server/pyproject.toml index 6ea4718d715..d08d0b8f488 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -41,10 +41,10 @@ py-cpuinfo = "^9.0.0" numpy = "^1.26" marlin-kernels = [ - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true }, - { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.2.0/marlin_kernels-0.2.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, + { url = 
"https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp310-cp310-linux_x86_64.whl", python = "~3.10", optional = true }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp311-cp311-linux_x86_64.whl", python = "~3.11", optional = true }, + { url = "https://github.com/danieldk/marlin-kernels/releases/download/v0.3.0/marlin_kernels-0.3.0+cu123torch2.4-cp312-cp312-linux_x86_64.whl", python = "~3.12", optional = true }, ] moe-kernels = [ { url = "https://github.com/danieldk/moe-kernels/releases/download/v0.6.0/moe_kernels-0.6.0+cu123torch2.4-cp39-cp39-linux_x86_64.whl", python = "~3.9", optional = true }, diff --git a/server/requirements_cuda.txt b/server/requirements_cuda.txt index 5de75b6b3e3..e3f6d20f8c7 100644 --- a/server/requirements_cuda.txt +++ b/server/requirements_cuda.txt @@ -1,5 +1,5 @@ certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13" +charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" @@ -10,7 +10,7 @@ googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.66.1 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.67.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" huggingface-hub==0.23.5 ; python_version >= "3.9" and python_version < "3.13" idna==3.10 ; python_version >= "3.9" and python_version < "3.13" @@ -38,14 +38,14 @@ pygments==2.18.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" -rich==13.8.1 ; python_version >= "3.9" and python_version < "3.13" +rich==13.9.3 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.1.0 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.0 ; python_version >= "3.9" and python_version < "3.13" +setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" +tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.45.0 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/requirements_intel.txt 
b/server/requirements_intel.txt index 5de75b6b3e3..e3f6d20f8c7 100644 --- a/server/requirements_intel.txt +++ b/server/requirements_intel.txt @@ -1,5 +1,5 @@ certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13" +charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" @@ -10,7 +10,7 @@ googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.66.1 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.67.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" huggingface-hub==0.23.5 ; python_version >= "3.9" and python_version < "3.13" idna==3.10 ; python_version >= "3.9" and python_version < "3.13" @@ -38,14 +38,14 @@ pygments==2.18.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" -rich==13.8.1 ; python_version >= "3.9" and python_version < "3.13" +rich==13.9.3 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.1.0 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.0 ; python_version >= "3.9" and python_version < "3.13" +setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" +tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.45.0 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/requirements_rocm.txt b/server/requirements_rocm.txt index 5de75b6b3e3..e3f6d20f8c7 100644 --- a/server/requirements_rocm.txt +++ b/server/requirements_rocm.txt @@ -1,5 +1,5 @@ certifi==2024.8.30 ; python_version >= "3.9" and python_version < "3.13" -charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "3.13" +charset-normalizer==3.4.0 ; python_version >= "3.9" and python_version < "3.13" click==8.1.7 ; python_version >= "3.9" and python_version < "3.13" colorama==0.4.6 ; python_version >= "3.9" and python_version < "3.13" and (sys_platform == "win32" or platform_system == "Windows") deprecated==1.2.14 ; python_version >= "3.9" and python_version < "3.13" @@ -10,7 +10,7 @@ googleapis-common-protos==1.65.0 ; python_version >= "3.9" and python_version < 
grpc-interceptor==0.15.4 ; python_version >= "3.9" and python_version < "3.13" grpcio-reflection==1.62.3 ; python_version >= "3.9" and python_version < "3.13" grpcio-status==1.62.3 ; python_version >= "3.9" and python_version < "3.13" -grpcio==1.66.1 ; python_version >= "3.9" and python_version < "3.13" +grpcio==1.67.0 ; python_version >= "3.9" and python_version < "3.13" hf-transfer==0.1.8 ; python_version >= "3.9" and python_version < "3.13" huggingface-hub==0.23.5 ; python_version >= "3.9" and python_version < "3.13" idna==3.10 ; python_version >= "3.9" and python_version < "3.13" @@ -38,14 +38,14 @@ pygments==2.18.0 ; python_version >= "3.9" and python_version < "3.13" pyyaml==6.0.2 ; python_version >= "3.9" and python_version < "3.13" regex==2024.9.11 ; python_version >= "3.9" and python_version < "3.13" requests==2.32.3 ; python_version >= "3.9" and python_version < "3.13" -rich==13.8.1 ; python_version >= "3.9" and python_version < "3.13" +rich==13.9.3 ; python_version >= "3.9" and python_version < "3.13" safetensors==0.4.5 ; python_version >= "3.9" and python_version < "3.13" scipy==1.13.1 ; python_version >= "3.9" and python_version < "3.13" sentencepiece==0.2.0 ; python_version >= "3.9" and python_version < "3.13" -setuptools==75.1.0 ; python_version >= "3.9" and python_version < "3.13" -tokenizers==0.20.0 ; python_version >= "3.9" and python_version < "3.13" +setuptools==75.2.0 ; python_version >= "3.9" and python_version < "3.13" +tokenizers==0.20.1 ; python_version >= "3.9" and python_version < "3.13" tqdm==4.66.5 ; python_version >= "3.9" and python_version < "3.13" -transformers==4.45.0 ; python_version >= "3.9" and python_version < "3.13" +transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" typer==0.6.1 ; python_version >= "3.9" and python_version < "3.13" typing-extensions==4.12.2 ; python_version >= "3.9" and python_version < "3.13" urllib3==2.2.3 ; python_version >= "3.9" and python_version < "3.13" diff --git a/server/tests/conftest.py b/server/tests/conftest.py index d99771f8ad0..9822279214f 100644 --- a/server/tests/conftest.py +++ b/server/tests/conftest.py @@ -2,7 +2,7 @@ import os from text_generation_server.pb import generate_pb2 -os.environ["USE_PREFIX_CACHING"] = "1" +os.environ["PREFIX_CACHING"] = "1" os.environ["ATTENTION"] = "flashinfer" diff --git a/server/text_generation_server/cli.py b/server/text_generation_server/cli.py index db390234e43..a363b33a89a 100644 --- a/server/text_generation_server/cli.py +++ b/server/text_generation_server/cli.py @@ -31,6 +31,7 @@ class Dtype(str, Enum): class KVCacheDtype(str, Enum): + fp8_e4m3fn = "fp8_e4m3fn" fp8_e5m2 = "fp8_e5m2" diff --git a/server/text_generation_server/interceptor.py b/server/text_generation_server/interceptor.py index 57df172575a..a5c023e4ecf 100644 --- a/server/text_generation_server/interceptor.py +++ b/server/text_generation_server/interceptor.py @@ -9,6 +9,9 @@ class ExceptionInterceptor(AsyncServerInterceptor): + def __init__(self, shutdown_callback): + self.shutdown_callback = shutdown_callback + async def intercept( self, method: Callable, @@ -25,7 +28,7 @@ async def intercept( # Runtime Error cannot be recovered from if isinstance(err, RuntimeError): - exit(1) + self.shutdown_callback() if torch.cuda.is_available(): torch.cuda.empty_cache() diff --git a/server/text_generation_server/layers/attention/__init__.py b/server/text_generation_server/layers/attention/__init__.py index cc7f0caaad9..ebe32042c46 100644 --- 
a/server/text_generation_server/layers/attention/__init__.py +++ b/server/text_generation_server/layers/attention/__init__.py @@ -8,39 +8,32 @@ raise ImportError("`USE_FLASH_ATTENTION` is false.") if SYSTEM == "cuda": from .cuda import ( - PREFILL_IN_KV_CACHE, SUPPORTS_WINDOWING, attention, paged_attention, - reshape_and_cache, ) elif SYSTEM == "rocm": from .rocm import ( - PREFILL_IN_KV_CACHE, SUPPORTS_WINDOWING, attention, paged_attention, - reshape_and_cache, ) elif SYSTEM == "ipex": from .ipex import ( - PREFILL_IN_KV_CACHE, SUPPORTS_WINDOWING, attention, paged_attention, - reshape_and_cache, ) else: raise ImportError(f"System {SYSTEM} doesn't support flash/paged attention") # KVCache needs `reshape_and_cache`, so ensure that it is defined already. -from .kv_cache import KVCache +from .kv_cache import KVCache, get_kv_scales __all__ = [ "attention", + "get_kv_scales", "paged_attention", - "reshape_and_cache", - "PREFILL_IN_KV_CACHE", "SUPPORTS_WINDOWING", "KVCache", "Seqlen", diff --git a/server/text_generation_server/layers/attention/common.py b/server/text_generation_server/layers/attention/common.py index c8ac0c2ab95..a3b919ee4fd 100644 --- a/server/text_generation_server/layers/attention/common.py +++ b/server/text_generation_server/layers/attention/common.py @@ -1,72 +1,52 @@ from dataclasses import dataclass -from text_generation_server.utils.import_utils import SYSTEM -from text_generation_server.models.globals import ATTENTION import torch from typing import Optional -if ATTENTION in {"flashinfer", "flashdecoding"}: - - @dataclass - class Seqlen: - input_lengths: torch.Tensor - prefix_lengths: torch.Tensor - cu_seqlen_q: Optional[torch.Tensor] - cu_seqlen_k: Optional[torch.Tensor] - max_q: int - max_k: int - - def __init__( - self, - input_lengths, - prefix_lengths, - cu_seqlen_q=None, - max_q=None, - max_k=None, - ): - self.input_lengths = input_lengths - self.prefix_lengths = prefix_lengths - device = self.input_lengths.device - shape = self.input_lengths.shape - if cu_seqlen_q is None: - cu_seqlen_q = torch.arange( - shape[0] + 1, - device=device, - dtype=torch.int32, - ) - max_q = 1 - else: - assert max_q is not None - assert max_k is not None - cu_seqlen_k = torch.zeros(shape[-1] + 1, device=device, dtype=torch.int32) - - # cuda graphs don't like this and this is necessary to clamp within mistral - # Although FA2 might not want the clamping - # cu_seqlen_k[0] = 0 - total = self.input_lengths + self.prefix_lengths - torch.cumsum(total, -1, out=cu_seqlen_k[1:]) - - self.cu_seqlen_q = cu_seqlen_q - self.cu_seqlen_k = cu_seqlen_k - self.max_q = max_q - self.max_k = max_k - - def clamp(self, max): - # Flash decoding doesn't need to clamp - return self - -else: - - @dataclass - class Seqlen: - input_lengths: torch.Tensor - prefix_lengths: torch.Tensor - cu_seqlen_q: torch.Tensor - max_q: int - max_k: int - - def clamp(self, max): - if SYSTEM == "rocm": - return self - self.input_lengths = torch.clamp(self.input_lengths, max=max) - return self +@dataclass +class Seqlen: + input_lengths: torch.Tensor + cache_lengths: torch.Tensor + cu_seqlen_q: Optional[torch.Tensor] + cu_seqlen_k: Optional[torch.Tensor] + max_q: int + max_k: int + + def __init__( + self, + input_lengths, + cache_lengths, + cu_seqlen_q=None, + max_q=None, + max_k=None, + ): + self.input_lengths = input_lengths + self.cache_lengths = cache_lengths + device = self.input_lengths.device + shape = self.input_lengths.shape + if cu_seqlen_q is None: + cu_seqlen_q = torch.arange( + shape[0] + 1, + device=device, + 
dtype=torch.int32, + ) + max_q = 1 + else: + assert max_q is not None + assert max_k is not None + cu_seqlen_k = torch.zeros(shape[-1] + 1, device=device, dtype=torch.int32) + + # cuda graphs don't like this and this is necessary to clamp within mistral + # Although FA2 might not want the clamping + # cu_seqlen_k[0] = 0 + total = self.input_lengths + self.cache_lengths + torch.cumsum(total, -1, out=cu_seqlen_k[1:]) + + self.cu_seqlen_q = cu_seqlen_q + self.cu_seqlen_k = cu_seqlen_k + self.max_q = max_q + self.max_k = max_k + + def clamp(self, max): + # Flash decoding doesn't need to clamp + return self diff --git a/server/text_generation_server/layers/attention/cuda.py b/server/text_generation_server/layers/attention/cuda.py index cd3ea369b61..d705afb0bd2 100644 --- a/server/text_generation_server/layers/attention/cuda.py +++ b/server/text_generation_server/layers/attention/cuda.py @@ -1,4 +1,5 @@ import torch +from text_generation_server.layers.attention.kv_cache import KVCache, KVScales from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.models.globals import ( ATTENTION, @@ -7,44 +8,22 @@ from text_generation_server.layers.attention import Seqlen from typing import Optional + major, minor = torch.cuda.get_device_capability() is_sm75 = major == 7 and minor == 5 _PARTITION_SIZE = 512 -try: - from vllm._C import cache_ops -except Exception as e: - raise ImportError( - f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}" - ) - - -def reshape_and_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slots: torch.Tensor, -): - if ATTENTION in {"flashdecoding", "flashinfer"}: - shape = key_cache.shape - key_cache.view(-1, shape[-2], shape[-1])[slots] = key - value_cache.view(-1, shape[-2], shape[-1])[slots] = value - else: - cache_ops.reshape_and_cache( - key, value, key_cache, value_cache, slots, "auto", 1.0 - ) - def paged_attention( query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, + kv_cache: KVCache, kv_head_mapping: torch.Tensor, softmax_scale: float, block_tables: torch.Tensor, seqlen: Seqlen, max_s: int, + *, + kv_scales: KVScales, softcap: Optional[float] = None, ): # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py @@ -70,6 +49,8 @@ def paged_attention( num_seqs, num_heads, head_size = query.shape max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE + can_scale = kv_cache.can_scale(kv_scales) + # NOTE(woosuk): We use a simple heuristic to decide whether to use # PagedAttention V1 or V2. If the number of partitions is 1, we use # V1 to avoid the overhead of reduction. Also, if the number of @@ -79,10 +60,13 @@ def paged_attention( from text_generation_server.layers.attention.flashinfer import decode_state return decode_state.get().forward( + # TODO: remove `contiguous` call once https://github.com/flashinfer-ai/flashinfer/pull/553 is merged. 
query.contiguous(), - paged_kv_cache=(key_cache, value_cache), + paged_kv_cache=(kv_cache.key, kv_cache.value), logits_soft_cap=softcap, sm_scale=softmax_scale, + k_scale=kv_scales.key_scale_cpu if can_scale else 1.0, + v_scale=kv_scales.value_scale_cpu if can_scale else 1.0, ) elif ATTENTION == "flashdecoding": max_q = 1 @@ -98,8 +82,8 @@ def paged_attention( softcap = 0.0 out = flash_attn_2_cuda.varlen_fwd( query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, None, seqlen.cu_seqlen_q, seqlen.cu_seqlen_k, @@ -123,7 +107,7 @@ def paged_attention( else: if softcap is not None: raise RuntimeError("Paged attention doesn't support softcapping") - input_lengths = seqlen.input_lengths + input_lengths = seqlen.input_lengths + seqlen.cache_lengths from vllm._C import ops out = torch.empty_like(query) @@ -135,8 +119,8 @@ def paged_attention( ops.paged_attention_v1( out, query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, kv_head_mapping, softmax_scale, block_tables, @@ -168,8 +152,8 @@ def paged_attention( max_logits, tmp_output, query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, kv_head_mapping, softmax_scale, block_tables, @@ -216,60 +200,69 @@ def paged_attention( ) from e +if ATTENTION == "flashdecoding" and not V2: + raise ValueError("Flash decoding requires Flash Attention V2") + SUPPORTS_WINDOWING = V2 -if ATTENTION == "flashinfer": - - def attention( - q: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale, - window_size_left=-1, - causal=True, - softcap=0.0, - ): + +def attention( + *, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: KVCache, + kv_scales: KVScales, + seqlen: Seqlen, + block_tables: torch.Tensor, + softmax_scale: float, + window_size_left: int = -1, + causal: bool = True, + softcap: Optional[float] = None, +): + can_scale = kv_cache.can_scale(kv_scales) + + if ATTENTION == "flashinfer": from text_generation_server.layers.attention.flashinfer import ( prefill_with_paged_kv_state, ) + if softcap is None: + softcap = 0.0 + return prefill_with_paged_kv_state.get().forward( - q.contiguous(), + # TODO: remove `contiguous` call once https://github.com/flashinfer-ai/flashinfer/pull/553 is merged. + query.contiguous(), causal=causal, - paged_kv_cache=(key_cache, value_cache), + paged_kv_cache=(kv_cache.key, kv_cache.value), logits_soft_cap=softcap, sm_scale=softmax_scale, window_left=window_size_left, + k_scale=kv_scales.key_scale_cpu if can_scale else 1.0, + v_scale=kv_scales.value_scale_cpu if can_scale else 1.0, ) -elif V2: - - def attention( - q, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale, - window_size_left=-1, - causal=True, - softcap=0.0, - ): - out = torch.empty_like(q) + # If we are using flashdecoding or paged, we always use flash-attn for + # the prefill. We have to branch on whether we use flash-attn v1 or v2. + elif V2: + out = torch.empty_like(query) if window_size_left <= 0 and window_size_left != -1: raise ValueError("`window_size_left` must be > 0 or -1") + + if softcap is None: + softcap = 0.0 + return flash_attn_2_cuda.varlen_fwd( - q, - key_cache, - value_cache, + query, + # flashdecoding: pass the KV caches, paged: pass the KV. 
+ kv_cache.key if ATTENTION == "flashdecoding" else key, + kv_cache.value if ATTENTION == "flashdecoding" else value, out, seqlen.cu_seqlen_q, seqlen.cu_seqlen_k, None, None, - block_tables, + block_tables if ATTENTION == "flashdecoding" else None, None, seqlen.max_q, seqlen.max_k, @@ -284,57 +277,45 @@ def attention( None, )[0] -else: - - def attention( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale: float, - window_size_left: int = -1, - causal: bool = True, - softcap=None, - ): + else: if window_size_left != -1: raise NotImplementedError( "window_size_left is only available with flash attn v2" ) if softcap is not None: - raise NotImplementedError("softcap is only available with flash attn v2") + raise NotImplementedError("softcap is not available in flash attn v1") # Flash attention v1 requires q, k and v to have the same number of heads - if k.shape[1] != q.shape[1]: + if key.shape[1] != query.shape[1]: # MQA expand - if k.shape[1] == 1: - k = k.expand(-1, q.shape[1], -1) + if key.shape[1] == 1: + key = key.expand(-1, query.shape[1], -1) # Grouped attention reshape else: - original_shape = k.shape - k = ( - k.unsqueeze(2) - .expand(-1, -1, q.shape[1] // k.shape[1], -1) + original_shape = key.shape + key = ( + key.unsqueeze(2) + .expand(-1, -1, query.shape[1] // key.shape[1], -1) .reshape(original_shape[0], -1, original_shape[2]) ) - if v.shape[1] != q.shape[1]: + if value.shape[1] != query.shape[1]: # MQA expand - if v.shape[1] == 1: - v = v.expand(-1, q.shape[1], -1) + if value.shape[1] == 1: + value = value.expand(-1, query.shape[1], -1) # Grouped attention reshape else: - original_shape = v.shape - v = ( - v.unsqueeze(2) - .expand(-1, -1, q.shape[1] // v.shape[1], -1) + original_shape = value.shape + value = ( + value.unsqueeze(2) + .expand(-1, -1, query.shape[1] // value.shape[1], -1) .reshape(original_shape[0], -1, original_shape[2]) ) - out = torch.empty_like(q) + out = torch.empty_like(query) flash_attn_cuda.fwd( - q, - k, - v, + query, + key, + value, out, seqlen.cu_seqlen_q, seqlen.cu_seqlen_q, @@ -351,15 +332,8 @@ def attention( return out -# Prefill in the cache with every kind of attention, unless we -# have a configuration that requires flash-attention v1, which -# does not support block tables. 
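Note on the calling convention introduced above: `attention` and `paged_attention` now take the layer's `KVCache` plus keyword-only `KVScales` instead of raw key/value cache tensors, and `Seqlen` tracks `cache_lengths` alongside `input_lengths`, building `cu_seqlen_k` as the cumulative sum of their total. A minimal sketch of a call under the new signature; the shapes and lengths are invented, and `kv_cache`/`block_tables` are assumed to have been built elsewhere, so this shows the call shape rather than a runnable end-to-end path:

    import torch
    from text_generation_server.layers.attention import Seqlen, attention
    from text_generation_server.layers.attention.kv_cache import KVScales

    # Two sequences with 5 and 3 new tokens (8 total), 32 query heads,
    # 8 KV heads, head size 128 -- all made-up shapes.
    query = torch.randn(8, 32, 128, device="cuda", dtype=torch.float16)
    key = torch.randn(8, 8, 128, device="cuda", dtype=torch.float16)
    value = torch.randn(8, 8, 128, device="cuda", dtype=torch.float16)

    # Seqlen carries both the new tokens and the tokens already in the cache;
    # cu_seqlen_k is computed internally as cumsum(input_lengths + cache_lengths) -> [0, 15, 18].
    seqlen = Seqlen(
        input_lengths=torch.tensor([5, 3], device="cuda", dtype=torch.int32),
        cache_lengths=torch.tensor([10, 0], device="cuda", dtype=torch.int32),
        cu_seqlen_q=torch.tensor([0, 5, 8], device="cuda", dtype=torch.int32),
        max_q=5,
        max_k=15,
    )

    # Unit scales mean "unscaled"; KVCache.can_scale() then returns False and the
    # FP8 scaling branches are skipped.
    kv_scales = KVScales(
        key_scale=torch.tensor(1.0, device="cuda"),
        value_scale=torch.tensor(1.0, device="cuda"),
    )

    attn_output = attention(
        query=query,
        key=key,
        value=value,
        kv_cache=kv_cache,          # KVCache constructed for this layer elsewhere (assumption)
        kv_scales=kv_scales,
        seqlen=seqlen,
        block_tables=block_tables,  # assumed to exist for the flashdecoding/paged layouts
        softmax_scale=128 ** -0.5,
    )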
-PREFILL_IN_KV_CACHE = ATTENTION != "paged" or V2 - __all__ = [ - "PREFILL_IN_KV_CACHE", "SUPPORTS_WINDOWING", "attention", "paged_attention", - "reshape_and_cache", ] diff --git a/server/text_generation_server/layers/attention/flash_attn_triton.py b/server/text_generation_server/layers/attention/flash_attn_triton.py index 3a6f9a730ce..fd180f0f19a 100644 --- a/server/text_generation_server/layers/attention/flash_attn_triton.py +++ b/server/text_generation_server/layers/attention/flash_attn_triton.py @@ -699,7 +699,6 @@ def check_args( class _attention(torch.autograd.Function): - @staticmethod def forward( ctx, diff --git a/server/text_generation_server/layers/attention/flashinfer.py b/server/text_generation_server/layers/attention/flashinfer.py index d603c6f5f03..26a72d9be71 100644 --- a/server/text_generation_server/layers/attention/flashinfer.py +++ b/server/text_generation_server/layers/attention/flashinfer.py @@ -204,6 +204,7 @@ def use_decode_state( num_kv_heads: int, head_size: int, page_size: int, + kv_cache_dtype: torch.dtype, dtype: torch.dtype, window_left: int, ): @@ -240,7 +241,7 @@ def use_decode_state( num_kv_heads=num_kv_heads, head_dim=head_size, page_size=page_size, - data_type=dtype, + data_type=kv_cache_dtype, q_data_type=dtype, window_left=window_left, ) diff --git a/server/text_generation_server/layers/attention/ipex.py b/server/text_generation_server/layers/attention/ipex.py index 131c9bb0644..677f3f5647d 100644 --- a/server/text_generation_server/layers/attention/ipex.py +++ b/server/text_generation_server/layers/attention/ipex.py @@ -1,31 +1,37 @@ import intel_extension_for_pytorch as ipex import torch +from text_generation_server.layers.attention.kv_cache import KVCache, KVScales from text_generation_server.models.flash_causal_lm import BLOCK_SIZE from text_generation_server.layers.attention import Seqlen from typing import Optional SUPPORTS_WINDOWING = False -PREFILL_IN_KV_CACHE = False def attention( - q: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, + *, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: KVCache, + kv_scales: KVScales, seqlen: Seqlen, block_tables: torch.Tensor, - softmax_scale, - window_size_left=-1, - causal=True, + softmax_scale: float, + window_size_left: int = -1, + causal: bool = True, softcap: Optional[float] = None, ): - out = torch.empty_like(q) + if softcap is not None: + raise NotImplementedError("softcap is not available in IPEX") + + out = torch.empty_like(query) # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. 
ipex.llm.functional.varlen_attention( - q.contiguous() if q.device.type == "xpu" else q, - key_cache.contiguous() if key_cache.device.type == "xpu" else key_cache, - value_cache.contiguous() if value_cache.device.type == "xpu" else value_cache, + query.contiguous() if query.device.type == "xpu" else query, + key.contiguous() if key.device.type == "xpu" else key, + value.contiguous() if value.device.type == "xpu" else value, out, seqlen.cu_seqlen_q, seqlen.cu_seqlen_q, @@ -42,39 +48,32 @@ def attention( return out -def reshape_and_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slots: torch.Tensor, -): - ipex.llm.modules.PagedAttention.reshape_and_cache( - key, value, key_cache, value_cache, slots - ) - - def paged_attention( query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, + kv_cache: KVCache, kv_head_mapping: torch.Tensor, softmax_scale: float, block_tables: torch.Tensor, seqlen: Seqlen, max_s: int, + *, + kv_scales: KVScales, softcap: Optional[float] = None, ): + if softcap is not None: + raise NotImplementedError("softcap is not available in IPEX") + out = torch.empty_like(query) + input_lengths = seqlen.input_lengths + seqlen.cache_lengths ipex.llm.modules.PagedAttention.single_query_cached_kv_attention( out, query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, kv_head_mapping, softmax_scale, block_tables, - seqlen.input_lengths, + input_lengths, BLOCK_SIZE, max_s, None, @@ -83,9 +82,7 @@ def paged_attention( __all__ = [ - "PREFILL_IN_KV_CACHE", "SUPPORTS_WINDOWING", "attention", "paged_attention", - "reshape_and_cache", ] diff --git a/server/text_generation_server/layers/attention/kv_cache.py b/server/text_generation_server/layers/attention/kv_cache.py index 3960c954985..9d739da5ee1 100644 --- a/server/text_generation_server/layers/attention/kv_cache.py +++ b/server/text_generation_server/layers/attention/kv_cache.py @@ -1,9 +1,38 @@ from typing import Tuple +from dataclasses import dataclass, field +from loguru import logger import torch + +from text_generation_server.layers.fp8 import fp8_quantize from text_generation_server.models.globals import ATTENTION, BLOCK_SIZE from text_generation_server.utils.import_utils import SYSTEM -from text_generation_server.layers.attention import reshape_and_cache +from text_generation_server.utils.log import log_once +from text_generation_server.utils.weights import Weights + + +@dataclass +class KVScales: + """ + Key-value scales for FP8 KV cache. + + This data class stores key and value scales both as a GPU tensor and + as a GPU float. This inconvenience is necessary because some functions + (e.g. scaling kernels) take scales as a GPU tensor, whereas others + (e.g. flashinfer) take scales as a CPU scalar. 
+ """ + + key_scale: torch.Tensor + value_scale: torch.Tensor + key_scale_cpu: float = field(init=False) + value_scale_cpu: float = field(init=False) + + def __post_init__(self): + if self.key_scale.numel() != 1 or self.value_scale.numel() != 1: + raise ValueError("Key and value scales must be scalar tensors.") + + self.key_scale_cpu = self.key_scale.item() + self.value_scale_cpu = self.value_scale.item() class KVCache: @@ -24,11 +53,11 @@ def __init__( ): """Construct the key-value cache for a layer.""" - if dtype == torch.float8_e5m2 and ( + if dtype in {torch.float8_e5m2, torch.float8_e4m3fn} and ( ATTENTION != "flashinfer" or SYSTEM != "cuda" ): raise ValueError( - "float8_e5m2 KV cache is currently only supported for flashinfer on CUDA" + "FP8 KV cache is currently only supported for flashinfer on CUDA" ) element_size = torch.tensor([], dtype=dtype).element_size() @@ -77,6 +106,33 @@ def __init__( ), ) + def can_scale(self, kv_scales: KVScales) -> bool: + """Check if the cache can be scaled by the given scales.""" + if kv_scales.key_scale_cpu == 1.0 and kv_scales.value_scale_cpu == 1.0: + return False + elif ( + self.dtype == torch.float8_e4m3fn + and ATTENTION == "flashinfer" + and SYSTEM == "cuda" + ): + log_once( + logger.info, + "Using FP8 KV cache scales", + ) + return True + else: + # We have scales, but not the correct FP8 cache type, so warn once. + log_once( + logger.info, + "Ignoring FP8 KV cache scales, only float8_e4m3fn KV cache on flashinfer is supported", + ) + return False + + @property + def dtype(self): + """Get the data type of the cache.""" + return self.kv_cache[0].dtype + @property def key(self): """Get the key cache.""" @@ -95,18 +151,34 @@ def store( key: torch.Tensor, value: torch.Tensor, slots: torch.Tensor, + kv_scales: KVScales, ): """Store the key and value at the given slots.""" key_cache = self.kv_cache[0] value_cache = self.kv_cache[1] + if self.can_scale(kv_scales): + if kv_scales.key_scale_cpu != 1.0: + key = fp8_quantize( + key.float(), + scale=kv_scales.key_scale, + qdtype=self.dtype, + scalar=True, + )[0] + if kv_scales.value_scale_cpu != 1.0: + value = fp8_quantize( + value.float(), + scale=kv_scales.value_scale, + qdtype=self.dtype, + scalar=True, + )[0] + if ATTENTION in {"flashdecoding", "flashinfer"}: - # TODO: add scale key = key.to(key_cache.dtype) value = value.to(value_cache.dtype) - if key_cache.dtype == torch.float8_e5m2: - # Torch index_put does not support float8_e5m2 yet, so + if key_cache.dtype in {torch.float8_e4m3fn, torch.float8_e5m2}: + # Torch index_put does not support float8_{e5m2,e4m3fn} yet, so # put as raw data instead. key_cache = key_cache.view(torch.uint8) value_cache = value_cache.view(torch.uint8) @@ -116,4 +188,59 @@ def store( key_cache.view(-1, shape[-2], shape[-1])[slots] = key value_cache.view(-1, shape[-2], shape[-1])[slots] = value else: - reshape_and_cache(key, value, key_cache, value_cache, slots) + paged_reshape_and_cache(key, value, key_cache, value_cache, slots) + + +def paged_reshape_and_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slots: torch.Tensor, +): + if SYSTEM == "cuda": + try: + from vllm._C import cache_ops + except Exception as e: + raise ImportError( + f"Could not import vllm paged attention. Make sure your installation is correct. 
Complete error: {e}" + ) + cache_ops.reshape_and_cache( + key, value, key_cache, value_cache, slots, "auto", 1.0 + ) + elif SYSTEM == "rocm": + try: + import vllm._custom_ops as ops + except Exception as e: + raise ImportError( + f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}" + ) + ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0) + elif SYSTEM == "ipex": + import intel_extension_for_pytorch as ipex + + ipex.llm.modules.PagedAttention.reshape_and_cache( + key, value, key_cache, value_cache, slots + ) + else: + raise NotImplementedError( + f"Cannot reshape and cache for paged attention, system '{SYSTEM}' not supported" + ) + + +def get_kv_scales(weights: Weights, prefix: str) -> KVScales: + """Load KV cache scales.""" + + key_scale = torch.tensor(1.0, dtype=torch.float32, device=weights.device) + value_scale = key_scale + if weights.has_tensor(f"{prefix}.k_scale") and weights.has_tensor( + f"{prefix}.v_scale" + ): + key_scale = weights.get_tensor(f"{prefix}.k_scale", to_dtype=False).float() + value_scale = weights.get_tensor(f"{prefix}.v_scale", to_dtype=False).float() + elif weights.has_tensor(f"{prefix}.kv_scale"): + # Fall back to older more coarse-grained scale when available. + key_scale = weights.get_tensor(f"{prefix}.kv_scale").float() + value_scale = key_scale + + return KVScales(key_scale=key_scale, value_scale=value_scale) diff --git a/server/text_generation_server/layers/attention/rocm.py b/server/text_generation_server/layers/attention/rocm.py index 01d4685a7ad..ea11c2c2615 100644 --- a/server/text_generation_server/layers/attention/rocm.py +++ b/server/text_generation_server/layers/attention/rocm.py @@ -1,8 +1,8 @@ import os from typing import Optional import torch +from text_generation_server.layers.attention.kv_cache import KVCache, KVScales from text_generation_server.utils.import_utils import SYSTEM -from text_generation_server.models.globals import ATTENTION from text_generation_server.layers.attention import Seqlen from text_generation_server.utils.log import log_master from loguru import logger @@ -16,8 +16,6 @@ use_triton = os.getenv("ROCM_USE_FLASH_ATTN_V2_TRITON", "").lower() in {"true", "1"} ENGINE = "triton" if use_triton else "ck" -PREFILL_IN_KV_CACHE = False - use_rocm_custom_paged_attn = os.getenv("ROCM_USE_CUSTOM_PAGED_ATTN", "1") != "0" try: if use_rocm_custom_paged_attn: @@ -29,38 +27,17 @@ ) use_rocm_custom_paged_attn = False -try: - import vllm._custom_ops as ops -except Exception as e: - raise ImportError( - f"Could not import vllm paged attention. Make sure your installation is correct. 
Complete error: {e}" - ) - - -def reshape_and_cache( - key: torch.Tensor, - value: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - slots: torch.Tensor, -): - if ATTENTION == "flashdecoding": - shape = key_cache.shape - key_cache.view(-1, shape[-2], shape[-1])[slots] = key - value_cache.view(-1, shape[-2], shape[-1])[slots] = value - else: - ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "auto", 1.0) - def paged_attention( query: torch.Tensor, - key_cache: torch.Tensor, - value_cache: torch.Tensor, + kv_cache: KVCache, kv_head_mapping: torch.Tensor, softmax_scale: float, block_tables: torch.Tensor, seqlen: Seqlen, max_s: int, + *, + kv_scales: KVScales, softcap: Optional[float] = None, ): # Adapted from: https://github.com/vllm-project/vllm/blob/f8a1e39fae05ca610be8d5a78be9d40f5274e5fc/vllm/model_executor/layers/attention.py @@ -84,10 +61,10 @@ def paged_attention( raise RuntimeError("Paged attention doesn't support softcapping") # value_cache => [num_blocks, num_heads, head_size, block_size] - block_size = value_cache.shape[3] + block_size = kv_cache.value.shape[3] num_seqs, num_heads, head_size = query.shape - num_kv_heads = key_cache.shape[1] + num_kv_heads = kv_cache.key.shape[1] gqa_ratio = num_heads // num_kv_heads use_custom = ( use_rocm_custom_paged_attn @@ -104,7 +81,7 @@ def paged_attention( _PARTITION_SIZE = _PARTITION_SIZE_CUSTOM max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE - input_lengths = seqlen.input_lengths + input_lengths = seqlen.input_lengths + seqlen.cache_lengths out = torch.empty_like(query) @@ -124,8 +101,8 @@ def paged_attention( ops.paged_attention_v1( out, query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, kv_head_mapping, softmax_scale, block_tables, @@ -158,8 +135,8 @@ def paged_attention( max_logits, tmp_output, query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, kv_head_mapping, softmax_scale, block_tables, @@ -177,8 +154,8 @@ def paged_attention( max_logits, tmp_output, query, - key_cache, - value_cache, + kv_cache.key, + kv_cache.value, num_kv_heads, softmax_scale, block_tables, @@ -227,29 +204,36 @@ def paged_attention( SUPPORTS_WINDOWING = False -if ENGINE == "ck": - - def attention( - q, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale: float, - window_size_left: int = -1, - causal: bool = True, - softcap: float = 0.0, - ): + + +def attention( + *, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: KVCache, + kv_scales: KVScales, + seqlen: Seqlen, + block_tables: torch.Tensor, + softmax_scale: float, + window_size_left: int = -1, + causal: bool = True, + softcap: Optional[float] = None, +): + if ENGINE == "ck": if window_size_left <= 0 and window_size_left != -1: raise ValueError("`window_size_left` must be > 0 or -1") - out = torch.empty_like(q) + out = torch.empty_like(query) + + if softcap is None: + softcap = 0.0 # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. 
return flash_attn_2_cuda.varlen_fwd( - q, - key_cache, - value_cache, + query, + key, + value, out, seqlen.cu_seqlen_q, seqlen.cu_seqlen_q, @@ -270,30 +254,19 @@ def attention( None, )[0] -elif ENGINE == "triton": - from .flash_attn_triton import triton_attention - - def attention( - q, - key_cache: torch.Tensor, - value_cache: torch.Tensor, - seqlen: Seqlen, - block_tables: torch.Tensor, - softmax_scale: float, - window_size_left: int = -1, - causal: bool = True, - softcap: Optional[float] = None, - ): + elif ENGINE == "triton": + from .flash_attn_triton import triton_attention + if softcap is not None: raise NotImplementedError("softcap is only available with CK flash attn") - out = torch.empty_like(q) + out = torch.empty_like(query) # We do not need to check window_size_left (not supported) here, so it is already checked ahead of time at model load. output, _ = triton_attention( - q, - key_cache, - value_cache, + query, + key, + value, out, seqlen.cu_seqlen_q, seqlen.cu_seqlen_q, @@ -304,13 +277,12 @@ def attention( ) return output -else: - raise RuntimeError(f"Unknown attention engine {ENGINE}") + else: + raise RuntimeError(f"Unknown attention engine {ENGINE}") + __all__ = [ - "PREFILL_IN_KV_CACHE", "SUPPORTS_WINDOWING", "attention", "paged_attention", - "reshape_and_cache", ] diff --git a/server/text_generation_server/layers/awq/quantize/__init__.py b/server/text_generation_server/layers/awq/quantize/__init__.py new file mode 100644 index 00000000000..3e72881bb2b --- /dev/null +++ b/server/text_generation_server/layers/awq/quantize/__init__.py @@ -0,0 +1,8 @@ +from text_generation_server.utils.import_utils import SYSTEM + +if SYSTEM == "ipex": + from .ipex import WQLinear +elif SYSTEM == "cuda": + from .cuda import WQLinear + +__all__ = ["WQLinear"] diff --git a/server/text_generation_server/layers/awq/quantize/qmodule.py b/server/text_generation_server/layers/awq/quantize/cuda.py similarity index 100% rename from server/text_generation_server/layers/awq/quantize/qmodule.py rename to server/text_generation_server/layers/awq/quantize/cuda.py diff --git a/server/text_generation_server/layers/awq/quantize/ipex.py b/server/text_generation_server/layers/awq/quantize/ipex.py new file mode 100644 index 00000000000..84cd7a2190d --- /dev/null +++ b/server/text_generation_server/layers/awq/quantize/ipex.py @@ -0,0 +1,48 @@ +from typing import Optional +import torch +import torch.nn as nn +import intel_extension_for_pytorch as ipex + + +class WQLinear(nn.Module): + def __init__( + self, w_bit, group_size, qweight, qzeros, scales, bias: Optional[torch.Tensor] + ): + super().__init__() + + if w_bit not in [4]: + raise NotImplementedError("Only 4-bit are supported for now.") + + self.in_features = qweight.shape[0] + self.out_features = qweight.shape[1] * 32 // w_bit + + self.w_bit = w_bit + self.group_size = group_size if group_size != -1 else self.in_features + # quick sanity check (make sure aligment) + assert self.in_features % self.group_size == 0 + assert self.out_features % (32 // self.w_bit) == 0 + + self.qweight = qweight + self.qzeros = qzeros + self.scales = scales + self.bias = bias + self.woq_linear = ( + ipex.llm.quantization.IPEXWeightOnlyQuantizedLinear.from_weight( + self.qweight, + self.scales, + self.qzeros, + self.in_features, + self.out_features, + bias=self.bias, + group_size=self.group_size, + quant_method=ipex.llm.quantization.QuantMethod.AWQ_GEMM, + dtype=ipex.llm.quantization.QuantDtype.INT4, + ) + ) + + @torch.no_grad() + def forward(self, x): + out_shape = x.shape[:-1] + 
(self.out_features,) + out = self.woq_linear(x.reshape(-1, x.shape[-1])) + out = out + self.bias if self.bias is not None else out + return out.reshape(out_shape) diff --git a/server/text_generation_server/layers/fp8.py b/server/text_generation_server/layers/fp8.py index 61dd5115157..a58c7f7b223 100644 --- a/server/text_generation_server/layers/fp8.py +++ b/server/text_generation_server/layers/fp8.py @@ -1,7 +1,7 @@ import torch from dataclasses import dataclass -from typing import Optional, Union, List +from typing import Optional, Tuple, Union, List from loguru import logger from text_generation_server.utils.import_utils import SYSTEM @@ -26,6 +26,12 @@ def is_fbgemm_gpu_available(): return False +try: + import marlin_kernels +except ImportError: + marlin_kernels = None + + if is_fbgemm_gpu_available(): if SYSTEM == "cuda": major, _ = torch.cuda.get_device_capability() @@ -51,27 +57,82 @@ def get_fp8_linear() -> torch.nn.Module: return Fp8Linear +def normalize_e4m3fn_to_e4m3fnuz( + weight: torch.Tensor, + weight_scale: torch.Tensor, + input_scale: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: + assert weight.dtype == torch.float8_e4m3fn + # The bits pattern 10000000(-128) represents zero in e4m3fn + # but NaN in e4m3fnuz. So here we set it to 0. + # https://onnx.ai/onnx/technical/float8.html + weight_as_int8 = weight.view(torch.int8) + ROCM_FP8_NAN_AS_INT = -128 + weight_as_int8[weight_as_int8 == ROCM_FP8_NAN_AS_INT] = 0 + weight = weight_as_int8.view(torch.float8_e4m3fnuz) + + # For the same bits representation, e4m3fnuz value is half of + # the e4m3fn value, so we should double the scaling factor to + # get the same dequantized value. + # https://onnx.ai/onnx/technical/float8.html + weight_scale = weight_scale * 2.0 + if input_scale is not None: + input_scale = input_scale * 2.0 + return weight, weight_scale, input_scale + + def fp8_quantize( - weight, scale_upper_bound=None, qdtype=torch.float8_e4m3fn, scalar=False + weight: torch.Tensor, + scale: Optional[torch.Tensor] = None, + scale_upper_bound: Optional[torch.Tensor] = None, + qdtype: torch.dtype = torch.float8_e4m3fn, + scalar: bool = False, ): - if FBGEMM_DYN_AVAILABLE and not scalar: + """ + This function returns a reciprocal of the scale, so that a tensor can be unscaled + by multiplying it with the returned scale. If a scale is given through the `scale` + argument, it must also be a reciprocal (so that scales from an FP8 checkpoint can + be used without modification). 
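A tiny numeric illustration of the reciprocal-scale contract spelled out in the `fp8_quantize` docstring above, following the pure-PyTorch fallback path (values invented for clarity):

    import torch

    finfo = torch.finfo(torch.float8_e4m3fn)                # finfo.max == 448.0
    weight = torch.tensor([[0.5, -2.0, 4.0]])
    multiplier = finfo.max / weight.abs().max()             # 448 / 4 = 112
    qweight = (weight * multiplier).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
    scale = multiplier.reciprocal()                         # this reciprocal is what fp8_quantize returns
    dequantized = qweight.to(torch.float32) * scale         # back to [[0.5, -2.0, 4.0]]
    # Because the returned scale is already the reciprocal, a scale read from an FP8
    # checkpoint can be passed back in via `fp8_quantize(weight, scale=scale)` unchanged.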
+ """ + if FBGEMM_DYN_AVAILABLE and not scalar and not scale: qweight, scale = torch.ops.fbgemm.quantize_fp8_per_row( weight, bs=None, scale_ub=scale_upper_bound, output_dtype=qdtype ) return qweight, scale + if marlin_kernels is not None: + shape = weight.shape + qweight, scale = marlin_kernels.scaled_fp8_quant( + weight.reshape(-1, shape[-1]), + dtype=qdtype, + scale=scale, + scale_ub=scale_upper_bound, + ) + + return qweight.reshape(shape), scale + # weight, scale = quant_weights(weight, torch.int8, False) finfo = torch.finfo(qdtype) - # Calculate the scale as dtype max divided by absmax - scale = finfo.max / weight.abs().max().clamp(min=1e-12, max=scale_upper_bound) - # scale and clamp the tensor to bring it to - # the representative range of float8 data type - # (as default cast is unsaturated) - qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) + + if scale is None: + # Calculate the scale as dtype max divided by absmax + scale = finfo.max / weight.abs().max().clamp(min=1e-12, max=scale_upper_bound) + # scale and clamp the tensor to bring it to + # the representative range of float8 data type + # (as default cast is unsaturated) + qweight = (weight * scale).clamp(min=finfo.min, max=finfo.max) + scale = scale.float().reciprocal() + else: + # Use reciprocal to avoid more expensive division. + qweight = (weight * scale.reciprocal()).clamp(min=finfo.min, max=finfo.max) + # Return both float8 data and the inverse scale (as float), # as both required as inputs to torch._scaled_mm qweight = qweight.to(qdtype) - scale = scale.float().reciprocal() + + if SYSTEM == "rocm": + qweight, scale, _ = normalize_e4m3fn_to_e4m3fnuz(qweight, scale) + return qweight, scale @@ -92,9 +153,17 @@ def get_weights(self, weights: "Weights", prefix: str): .reshape(-1) .expand(w.shape[0]) ) + + input_scale = None + if weights.has_tensor(f"{prefix}.input_scale"): + input_scale = weights.get_tensor( + f"{prefix}.input_scale", to_dtype=False + ).reshape(-1) + return Fp8Weight( weight=w, weight_scale=scale, + input_scale=input_scale, activation_scale_ub=self.activation_scale_ub, dtype=weights.dtype, ) @@ -125,9 +194,24 @@ def get_weights_col_packed( ) scale = scale.reshape(-1).expand(w.shape[0]) + input_scale = None + if weights.has_tensor(f"{prefix}.input_scale"): + input_scale = weights.get_tensor( + f"{prefix}.input_scale", to_dtype=False + ) + if input_scale.numel() > 1: + input_scale = weights.get_packed_sharded( + f"{prefix}.input_scale", + dim=0, + block_sizes=block_sizes, + to_dtype=False, + ) + input_scale = input_scale.reshape(-1).max() + return Fp8Weight( weight=w, weight_scale=scale, + input_scale=input_scale, activation_scale_ub=self.activation_scale_ub, dtype=weights.dtype, ) @@ -154,9 +238,22 @@ def get_multi_weights_col(self, weights: "Weights", prefixes: List[str], dim: in ] scale = torch.cat(scale, dim=0).reshape(-1) + input_scale = [ + _load_scalar_or_matrix_scale(weights, f"{p}.input_scale", shape) + for p, shape in zip(prefixes, shapes) + if weights.has_tensor(f"{p}.input_scale") + ] + assert len(input_scale) == 0 or len(input_scale) == len(prefixes) + input_scale = ( + torch.cat(input_scale, dim=0).reshape(-1).max() + if len(input_scale) != 0 + else None + ) + return Fp8Weight( weight=w, weight_scale=scale, + input_scale=input_scale, activation_scale_ub=self.activation_scale_ub, dtype=weights.dtype, ) @@ -174,9 +271,16 @@ def get_weights_row(self, weights: "Weights", prefix: str): .reshape(-1) .expand(w.shape[0]) ) + input_scale = None + if 
weights.has_tensor(f"{prefix}.input_scale"): + input_scale = weights.get_tensor( + f"{prefix}.input_scale", to_dtype=False + ).reshape(-1) + return Fp8Weight( weight=w, weight_scale=scale, + input_scale=input_scale, activation_scale_ub=self.activation_scale_ub, dtype=weights.dtype, ) @@ -191,6 +295,7 @@ class Fp8Weight(Weight): weight: torch.Tensor dtype: torch.dtype weight_scale: Optional[torch.Tensor] = None + input_scale: Optional[torch.Tensor] = None activation_scale_ub: Optional[float] = None def get_linear(self, bias: torch.Tensor): @@ -200,33 +305,50 @@ def get_linear(self, bias: torch.Tensor): # memory. Can be non-contiguous when we e.g. expand from scalars. self.weight_scale = self.weight_scale.contiguous() return get_fp8_linear().from_fp8( - self.weight, self.weight_scale, self.activation_scale_ub, bias, self.dtype + weight=self.weight, + scale=self.weight_scale, + dtype=self.dtype, + bias=bias, + input_scale=self.input_scale, + scale_upper_bound=self.activation_scale_ub, ) class Fp8Linear(torch.nn.Module): + _device_identity_cache = {} + def __init__( self, - qweight, - scale, - scale_upper_bound, - bias, - dtype, + qweight: torch.Tensor, + scale: torch.Tensor, + dtype: torch.dtype, + bias: Optional[torch.Tensor] = None, + input_scale: Optional[torch.Tensor] = None, + scale_upper_bound: Optional[float] = None, ) -> None: super().__init__() if FBGEMM_MM_AVAILABLE: log_once(logger.info, "Using FBGEMM fp8 optimized kernels") + if SYSTEM == "rocm" and qweight.dtype == torch.float8_e4m3fn: + qweight, scale, _ = normalize_e4m3fn_to_e4m3fnuz( + weight=qweight, weight_scale=scale + ) self.dtype = dtype self.qweight = qweight - self.scale = scale - self.scale_upper_bound = ( - torch.tensor( - [scale_upper_bound], dtype=torch.float32, device=qweight.device + self.scale = scale.float() + self.input_scale = input_scale.float() if input_scale is not None else None + + if FBGEMM_MM_AVAILABLE: + self.scale_upper_bound = ( + torch.tensor( + [scale_upper_bound], dtype=torch.float32, device=qweight.device + ) + if scale_upper_bound is not None + else None ) - if scale_upper_bound is not None - else None - ) + else: + self.scale_upper_bound = scale_upper_bound self.bias = bias if bias is not None else None @@ -234,22 +356,46 @@ def __init__( def from_unquant(cls, weight, bias, dtype): qweight, scale = fp8_quantize(weight, scalar=not FBGEMM_MM_AVAILABLE) return cls( - qweight=qweight, scale=scale, scale_upper_bound=None, bias=bias, dtype=dtype + qweight=qweight, + scale=scale, + dtype=dtype, + bias=bias, + input_scale=None, + scale_upper_bound=None, ) @classmethod - def from_fp8(cls, weight, scale, input_scale, bias, dtype): + def from_fp8( + cls, + weight: torch.Tensor, + scale: torch.Tensor, + dtype: torch.dtype, + bias: Optional[torch.Tensor] = None, + **kwargs, + ) -> "Fp8Linear": + input_scale = kwargs.get("input_scale", None) + scale_upper_bound = kwargs.get("scale_upper_bound", None) + if FBGEMM_DYN_AVAILABLE: # fbgemm needs float32 scales. scale = scale.float() return cls( qweight=weight, scale=scale, - scale_upper_bound=input_scale, + input_scale=input_scale, + scale_upper_bound=scale_upper_bound, bias=bias, dtype=dtype, ) + @classmethod + def get_shared_device_identity(cls, device): + # Input scaling factors are no longer optional in _scaled_mm starting + # from pytorch 2.5. 
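The ROCm handling above leans on a numeric property of the two FP8 formats; a small check of the claim behind `normalize_e4m3fn_to_e4m3fnuz` (illustrative, and assumes a PyTorch build that ships both float8 dtypes):

import torch

# Same bit pattern, half the value: e4m3fnuz uses a larger exponent bias than
# e4m3fn, so reinterpreting the bits halves the magnitude, and the (inverse)
# scale must be doubled to compensate, as normalize_e4m3fn_to_e4m3fnuz does.
x = torch.tensor([1.0, -2.0, 0.5], dtype=torch.float32).to(torch.float8_e4m3fn)
y = x.view(torch.float8_e4m3fnuz)

print(x.float())   # ->  1.0, -2.0,  0.5
print(y.float())   # ->  0.5, -1.0,  0.25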
Allocating a dummy tensor to pass as input_scale + if device not in cls._device_identity_cache: + cls._device_identity_cache[device] = torch.ones(1, device=device) + return cls._device_identity_cache[device] + def forward(self, input: torch.Tensor) -> torch.Tensor: if FBGEMM_MM_AVAILABLE: qinput, scale = fp8_quantize( @@ -266,15 +412,49 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: ) return y.to(self.dtype) - qinput, scale = fp8_quantize(input, scalar=True) - output, _ = torch._scaled_mm( - qinput, - self.qweight.t(), - out_dtype=self.dtype, - scale_a=scale, - scale_b=self.scale, - bias=self.bias, + qinput, scale = fp8_quantize( + input, + self.input_scale, + scale_upper_bound=self.scale_upper_bound, + scalar=True, ) + + per_tensor_weights = self.scale.numel() == 1 + per_tensor_activations = scale.numel() == 1 + + if SYSTEM != "rocm" or (per_tensor_weights and per_tensor_activations): + output = torch._scaled_mm( + qinput, + self.qweight.t(), + out_dtype=self.dtype, + scale_a=scale, + scale_b=self.scale, + bias=self.bias, + ) + + if isinstance(output, tuple) and len(output) == 2: + output = output[0] + else: + device_identity = None + if SYSTEM == "rocm": + device_identity = self.get_shared_device_identity(self.qweight.device) + + output = torch._scaled_mm( + qinput, + self.qweight.t(), + scale_a=device_identity, + scale_b=device_identity, + out_dtype=torch.float32, + ) + if isinstance(output, tuple) and len(output) == 2: + output = output[0] + + output = output * scale * self.scale.t() + if self.bias is not None: + output = output + self.bias + + output = output.to(dtype=self.dtype) + return output diff --git a/server/text_generation_server/layers/gptq/__init__.py b/server/text_generation_server/layers/gptq/__init__.py index 505caa59a0d..7e838035214 100644 --- a/server/text_generation_server/layers/gptq/__init__.py +++ b/server/text_generation_server/layers/gptq/__init__.py @@ -8,6 +8,11 @@ from text_generation_server.utils.log import log_once from text_generation_server.utils.weights import Weight, Weights, WeightsLoader +if SYSTEM == "ipex": + from .ipex import QuantLinear +elif SYSTEM in {"cuda", "rocm"}: + from .triton import QuantLinear + @dataclass class GPTQWeight(Weight): @@ -36,7 +41,7 @@ def get_linear(self, bias: torch.Tensor): "to use Exllama/GPTQ kernels for AWQ inference." ) try: - from text_generation_server.layers.awq.quantize.qmodule import WQLinear + from text_generation_server.layers.awq.quantize import WQLinear return WQLinear( w_bit=self.bits, @@ -60,8 +65,6 @@ def get_linear(self, bias: torch.Tensor): return ExllamaQuantLinear(self, bias) else: - from text_generation_server.layers.gptq.quant_linear import QuantLinear - return QuantLinear( self.qweight, self.qzeros, @@ -298,6 +301,7 @@ def get_weights_row(self, weights: Weights, prefix: str): self._get_gptq_params(weights) use_exllama = True + desc_act = self.desc_act if self.bits != 4: use_exllama = False @@ -321,7 +325,8 @@ def get_weights_row(self, weights: Weights, prefix: str): if g_idx is not None: if ( not torch.equal( - g_idx.cpu(), + # Remove g_idx[0] to adapt the check with TP>1. 
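Why rebasing by `g_idx[0]` makes the act-order check shard-safe under tensor parallelism — a toy illustration with made-up sizes:

import torch

# With TP > 1 each rank loads a slice of g_idx along dim 0. For a checkpoint
# without act-order the slice is still monotonically increasing but starts at
# an arbitrary group index, so comparing against [i // groupsize ...] only
# works after subtracting the first element.
groupsize = 128
in_features = 1024
full_g_idx = torch.tensor([i // groupsize for i in range(in_features)], dtype=torch.int32)

rank, world_size = 1, 2
shard = full_g_idx.chunk(world_size)[rank]        # this rank's slice
expected = torch.tensor([i // groupsize for i in range(shard.shape[0])], dtype=torch.int32)

assert not torch.equal(shard, expected)           # raw slice starts at group 4
assert torch.equal(shard - shard[0], expected)    # rebased slice passes the check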
+ (g_idx - g_idx[0]).cpu(), torch.tensor( [i // self.groupsize for i in range(g_idx.shape[0])], dtype=torch.int32, @@ -332,6 +337,7 @@ def get_weights_row(self, weights: Weights, prefix: str): # Exllama implementation does not support row tensor parallelism with act-order, as # it would require to reorder input activations that are split unto several GPUs use_exllama = False + desc_act = True from text_generation_server.layers.gptq import ( CAN_EXLLAMA, @@ -350,16 +356,16 @@ def get_weights_row(self, weights: Weights, prefix: str): else: log_once(logger.info, f"Using exllama kernels v{HAS_EXLLAMA}") - if use_exllama and self.groupsize != -1: + if not desc_act and self.groupsize != -1: qzeros = weights.get_sharded(f"{prefix}.qzeros", dim=0) scales = weights.get_sharded(f"{prefix}.scales", dim=0) + if g_idx is not None: + # qzeros, scales sharded, and g_idx must be adjusted accordingly + g_idx = g_idx - g_idx[0] else: qzeros = weights.get_tensor(f"{prefix}.qzeros") scales = weights.get_tensor(f"{prefix}.scales") - if use_exllama and g_idx is not None: - g_idx = g_idx - g_idx[0] - if self.quantize == "gptq" and self.quant_method == "awq": log_once( logger.info, "Converting AWQ model to Exllama/GPTQ packing format." @@ -392,7 +398,7 @@ def get_weights_row(self, weights: Weights, prefix: str): ) def _get_gptq_params(self, weights: Weights): - if weights._has_tensor("gptq_bits") and weights._has_tensor("gptq_groupsize"): + if weights.has_tensor("gptq_bits") and weights.has_tensor("gptq_groupsize"): self.bits = weights.get_tensor("gptq_bits").item() self.groupsize = weights.get_tensor("gptq_groupsize").item() self.desc_act = False @@ -400,7 +406,7 @@ def _get_gptq_params(self, weights: Weights): # before the `gptq_sym` setting tensor was added. self.sym = ( weights.get_tensor("gptq_sym").item() - if weights._has_tensor("gptq_sym") + if weights.has_tensor("gptq_sym") else False ) self.quant_method = "gptq" diff --git a/server/text_generation_server/layers/gptq/ipex.py b/server/text_generation_server/layers/gptq/ipex.py new file mode 100644 index 00000000000..ab9c9e24752 --- /dev/null +++ b/server/text_generation_server/layers/gptq/ipex.py @@ -0,0 +1,126 @@ +import math +import numpy as np +import torch +import torch.nn as nn + +import intel_extension_for_pytorch as ipex + + +class QuantLinear(nn.Module): + def __init__(self, qweight, qzeros, scales, g_idx, bias, bits, groupsize): + super().__init__() + self.register_buffer("qweight", qweight) + self.register_buffer("qzeros", qzeros) + self.register_buffer("scales", scales) + self.register_buffer("g_idx", g_idx) + if bias is not None: + self.register_buffer("bias", bias) + else: + self.bias = None + if bits not in [4]: + raise NotImplementedError("Only 4 bits are supported.") + self.bits = bits + self.maxq = 2**self.bits - 1 + self.groupsize = groupsize + + self.outfeatures = qweight.shape[1] + self.infeatures = qweight.shape[0] * 32 // bits + self.woq_linear = ( + ipex.llm.quantization.IPEXWeightOnlyQuantizedLinear.from_weight( + self.qweight, + self.scales, + self.qzeros, + self.infeatures, + self.outfeatures, + bias=self.bias, + group_size=self.groupsize, + g_idx=g_idx, + quant_method=ipex.llm.quantization.QuantMethod.GPTQ_GEMM, + dtype=ipex.llm.quantization.QuantDtype.INT4, + ) + ) + + @classmethod + def new(cls, bits, groupsize, infeatures, outfeatures, bias): + if bits not in [4]: + raise NotImplementedError("Only 4 bits are supported.") + + qweight = torch.zeros((infeatures // 32 * bits, outfeatures), dtype=torch.int32) + qzeros = 
torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures // 32 * bits), + dtype=torch.int32, + ) + scales = torch.zeros( + (math.ceil(infeatures / groupsize), outfeatures), dtype=torch.float16 + ) + g_idx = torch.tensor( + [i // groupsize for i in range(infeatures)], dtype=torch.int32 + ) + if bias: + bias = torch.zeros((outfeatures), dtype=torch.float16) + else: + bias = None + return cls(qweight, qzeros, scales, g_idx, bias, bits, groupsize) + + def pack(self, linear, scales, zeros, g_idx=None): + self.g_idx = g_idx.clone() if g_idx is not None else self.g_idx + + scales = scales.t().contiguous() + zeros = zeros.t().contiguous() + scale_zeros = zeros * scales + self.scales = scales.clone().half() + if linear.bias is not None: + self.bias = linear.bias.clone().half() + + intweight = [] + for idx in range(self.infeatures): + intweight.append( + torch.round( + (linear.weight.data[:, idx] + scale_zeros[self.g_idx[idx]]) + / self.scales[self.g_idx[idx]] + ).to(torch.int)[:, None] + ) + intweight = torch.cat(intweight, dim=1) + intweight = intweight.t().contiguous() + intweight = intweight.numpy().astype(np.uint32) + qweight = np.zeros( + (intweight.shape[0] // 32 * self.bits, intweight.shape[1]), dtype=np.uint32 + ) + i = 0 + row = 0 + while row < qweight.shape[0]: + if self.bits in [4]: + for j in range(i, i + (32 // self.bits)): + qweight[row] |= intweight[j] << (self.bits * (j - i)) + i += 32 // self.bits + row += 1 + else: + raise NotImplementedError("Only 4 bits are supported.") + + qweight = qweight.astype(np.int32) + self.qweight = torch.from_numpy(qweight) + + zeros -= 1 + zeros = zeros.numpy().astype(np.uint32) + qzeros = np.zeros( + (zeros.shape[0], zeros.shape[1] // 32 * self.bits), dtype=np.uint32 + ) + i = 0 + col = 0 + while col < qzeros.shape[1]: + if self.bits in [4]: + for j in range(i, i + (32 // self.bits)): + qzeros[:, col] |= zeros[:, j] << (self.bits * (j - i)) + i += 32 // self.bits + col += 1 + else: + raise NotImplementedError("Only 4 bits are supported.") + + qzeros = qzeros.astype(np.int32) + self.qzeros = torch.from_numpy(qzeros) + + def forward(self, x): + out_shape = x.shape[:-1] + (self.outfeatures,) + out = self.woq_linear(x.reshape(-1, x.shape[-1])) + out = out + self.bias if self.bias is not None else out + return out.reshape(out_shape) diff --git a/server/text_generation_server/layers/gptq/quantize.py b/server/text_generation_server/layers/gptq/quantize.py index b0086ea08b3..66fc15ec0e3 100644 --- a/server/text_generation_server/layers/gptq/quantize.py +++ b/server/text_generation_server/layers/gptq/quantize.py @@ -12,7 +12,7 @@ from accelerate import init_empty_weights from text_generation_server.utils import initialize_torch_distributed, Weights from text_generation_server.utils.hub import weight_files -from text_generation_server.layers.gptq.quant_linear import QuantLinear +from text_generation_server.layers.gptq import QuantLinear from loguru import logger from typing import Optional from text_generation_server.layers.gptq.utils import torch_snr_error diff --git a/server/text_generation_server/layers/gptq/quant_linear.py b/server/text_generation_server/layers/gptq/triton.py similarity index 100% rename from server/text_generation_server/layers/gptq/quant_linear.py rename to server/text_generation_server/layers/gptq/triton.py diff --git a/server/text_generation_server/layers/marlin/fp8.py b/server/text_generation_server/layers/marlin/fp8.py index fe55a58a33e..49f5c480f3f 100644 --- a/server/text_generation_server/layers/marlin/fp8.py +++ 
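A toy version of the nibble packing done in `QuantLinear.pack` above (pure Python, values are made up):

# With bits = 4, eight quantized values are OR-ed into one 32-bit word, lowest
# index in the lowest bits, matching `qweight[row] |= intweight[j] << (bits * (j - i))`.
bits = 4
values = [1, 2, 3, 4, 5, 6, 7, 8]          # already-quantized 4-bit integers
word = 0
for j, v in enumerate(values):
    word |= v << (bits * j)
print(hex(word))                            # 0x87654321

# Unpacking recovers the values in order:
print([(word >> (bits * j)) & 0xF for j in range(8)])   # [1, 2, 3, 4, 5, 6, 7, 8]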
b/server/text_generation_server/layers/marlin/fp8.py @@ -62,7 +62,14 @@ def from_unquant(cls, weight, bias, dtype): return cls(qweight=qweight, scales=scales.to(dtype), bias=bias) @classmethod - def from_fp8(cls, weight, scale, _input_scale, bias, dtype): + def from_fp8( + cls, + weight: torch.Tensor, + scale: torch.Tensor, + bias: torch.Tensor, + dtype: torch.dtype, + **kwargs, + ): return cls(qweight=weight, scales=scale.to(dtype), bias=bias) def forward(self, A: torch.Tensor) -> torch.Tensor: diff --git a/server/text_generation_server/layers/marlin/gptq.py b/server/text_generation_server/layers/marlin/gptq.py index 7245431f3ab..47341c0f0eb 100644 --- a/server/text_generation_server/layers/marlin/gptq.py +++ b/server/text_generation_server/layers/marlin/gptq.py @@ -231,7 +231,7 @@ def get_weights_row(self, weights: Weights, prefix: str): ) def _get_gptq_params(self, weights: Weights): - if weights._has_tensor("gptq_bits") and weights._has_tensor("gptq_groupsize"): + if weights.has_tensor("gptq_bits") and weights.has_tensor("gptq_groupsize"): self.bits = weights.get_tensor("gptq_bits").item() self.groupsize = weights.get_tensor("gptq_groupsize").item() self.desc_act = False @@ -239,7 +239,7 @@ def _get_gptq_params(self, weights: Weights): # before the `gptq_sym` setting tensor was added. self.sym = ( weights.get_tensor("gptq_sym").item() - if weights._has_tensor("gptq_sym") + if weights.has_tensor("gptq_sym") else False ) self.quant_method = "gptq" diff --git a/server/text_generation_server/models/__init__.py b/server/text_generation_server/models/__init__.py index 17eed976467..d30154083f5 100644 --- a/server/text_generation_server/models/__init__.py +++ b/server/text_generation_server/models/__init__.py @@ -195,6 +195,11 @@ class ModelType(enum.Enum): "name": "Phi 3", "url": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", } + GRANITE = { + "type": "granite", + "name": "Granite", + "url": "https://huggingface.co/ibm-granite/granite-3.0-8b-instruct", + } GEMMA = { "type": "gemma", "name": "Gemma", @@ -357,17 +362,32 @@ def get_model( compression_config = config_dict.get("compression_config", None) if quantization_config is not None and quantize is None: method = quantization_config.get("quant_method", None) + config_groups = quantization_config.get("config_groups", None) if method in {"gptq", "awq", "exl2"}: log_master(logger.info, f"Auto selecting quantization method {method}") quantize = method - elif method == "fbgemm_fp8": + elif method == "fbgemm_fp8" or method == "fp8": log_master(logger.info, "Auto selecting quantization method fp8") quantize = "fp8" + elif config_groups is not None: + # TODO: at some point we should probably fully parse the compression + # configuration to know which parameters are compressed. + for _, group in config_groups.items(): + weights_config = group.get("weights") + if weights_config is not None: + if ( + weights_config["type"] == "float" + and weights_config["num_bits"] == 8 + ): + log_master( + logger.info, "Auto selecting quantization method fp8" + ) + quantize = "fp8" + break else: log_master(logger.warning, f"Unknown quantization method {method}") elif compression_config is not None: - # TODO: at some point we should probably fully parse the compression - # configuration to know which parameters are compressed. + # `compression_config` renamed to `quantization_config`; support retained for backward compatibility. 
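For reference, a hedged sketch of the kind of `quantization_config` the new `config_groups` branch above matches; the dict below is illustrative, not copied from a specific checkpoint:

# Any quantization_config carrying config_groups with an 8-bit float weights
# entry now auto-selects quantize="fp8" in get_model.
quantization_config = {
    "config_groups": {
        "group_0": {
            "weights": {"type": "float", "num_bits": 8},
        },
    },
}

quantize = None
for _, group in quantization_config["config_groups"].items():
    weights_config = group.get("weights")
    if weights_config is not None:
        if weights_config["type"] == "float" and weights_config["num_bits"] == 8:
            quantize = "fp8"
            break
assert quantize == "fp8"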
config_groups = compression_config.get("config_groups") if config_groups is not None: for _, group in config_groups.items(): @@ -385,8 +405,11 @@ def get_model( if dtype is None: if quantize in ["awq", "exl2", "gptq", "marlin"]: - # These quantizers only work with float16 params. - dtype = torch.float16 + if SYSTEM == "ipex" and not hasattr(torch, "xpu"): + dtype = torch.bfloat16 + else: + # These quantizers only work with float16 params. + dtype = torch.float16 elif quantize == "fp8": from text_generation_server.layers.fp8 import FBGEMM_DYN_AVAILABLE @@ -406,6 +429,8 @@ def get_model( if kv_cache_dtype is None: kv_cache_dtype = dtype + elif kv_cache_dtype == "fp8_e4m3fn": + kv_cache_dtype = torch.float8_e4m3fn elif kv_cache_dtype == "fp8_e5m2": kv_cache_dtype = torch.float8_e5m2 else: @@ -842,7 +867,12 @@ def get_model( trust_remote_code=trust_remote_code, ) - elif model_type == LLAMA or model_type == BAICHUAN or model_type == PHI3: + elif ( + model_type == LLAMA + or model_type == BAICHUAN + or model_type == PHI3 + or model_type == GRANITE + ): if FLASH_ATTENTION: return FlashCausalLM( model_id=model_id, @@ -856,7 +886,9 @@ def get_model( lora_adapter_ids=lora_adapter_ids, ) elif sharded: - raise NotImplementedError(FLASH_ATT_ERROR_MESSAGE.format("Sharded Llama")) + raise NotImplementedError( + FLASH_ATT_ERROR_MESSAGE.format(f"Sharded {model_type}") + ) else: return CausalLM.fallback( model_id, diff --git a/server/text_generation_server/models/causal_lm.py b/server/text_generation_server/models/causal_lm.py index de2c065160b..bd8176be9ed 100644 --- a/server/text_generation_server/models/causal_lm.py +++ b/server/text_generation_server/models/causal_lm.py @@ -76,6 +76,7 @@ def to_pb(self) -> generate_pb2.CachedBatch: request_ids=[r.id for r in self.requests], size=len(self), max_tokens=self.max_tokens, + current_tokens=len(self.input_ids), ) @classmethod @@ -619,18 +620,11 @@ def fallback( model_id, revision=revision, torch_dtype=dtype, - device_map=( - "auto" - if device_count > 1 - else None - ), + device_map=("auto" if device_count > 1 else None), load_in_8bit=quantize == "bitsandbytes", trust_remote_code=trust_remote_code, ) - if ( - device_count == 1 - and quantize != "bitsandbytes" - ): + if device_count == 1 and quantize != "bitsandbytes": model = model.to(device) if tokenizer.pad_token_id is None: diff --git a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py index d0425fec11f..68719106fca 100644 --- a/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_cohere_modeling.py @@ -30,6 +30,7 @@ attention, Seqlen, ) +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.layers import ( TensorParallelRowLinear, @@ -38,7 +39,6 @@ SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE from text_generation_server.layers.layernorm import ( FastLayerNorm, ) @@ -228,6 +228,7 @@ def __init__( ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.use_qk_norm = config.use_qk_norm if self.use_qk_norm: @@ -290,30 +291,37 @@ def forward( self.rotary_emb(query, key, cos, sin) - kv_cache.store(key=key, value=value, slots=slots) + kv_cache.store( + key=key, + value=value, + 
slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else key, - kv_cache.value if PREFILL_IN_KV_CACHE else value, - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj( diff --git a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py index b2b0cecbcca..f70bff4f881 100644 --- a/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_dbrx_modeling.py @@ -20,6 +20,7 @@ from transformers.activations import ACT2FN from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple, Any +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.utils.import_utils import SYSTEM if SYSTEM != "ipex": @@ -29,7 +30,6 @@ paged_attention, attention, Seqlen, - PREFILL_IN_KV_CACHE, ) from text_generation_server.layers import ( FastLinear, @@ -289,6 +289,7 @@ def __init__( ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = TensorParallelRowLinear.load( config, @@ -329,30 +330,37 @@ def forward( self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - kv_cache.store(key=kv[:, 0], value=kv[:, 1], slots=slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py index af77af8e139..906a83a4151 100644 --- a/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py @@ -34,7 +34,7 @@ attention, paged_attention, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import KVCache, get_kv_scales from text_generation_server.layers.layernorm import FastRMSNorm from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer from text_generation_server.layers.rotary import PositionRotaryEmbedding, get_mscale @@ -231,6 +231,8 @@ def 
__init__( ), ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") + self.kv_a_layernorm = FastRMSNorm.load( prefix=f"{prefix}.kv_a_layernorm", weights=weights, eps=config.rms_norm_eps ) @@ -259,7 +261,7 @@ def forward( cos: torch.Tensor, sin: torch.Tensor, cu_seqlen_prefill: torch.Tensor, - kv_cache: Tuple[torch.Tensor, torch.Tensor], + kv_cache: KVCache, block_tables: torch.Tensor, slots: torch.Tensor, seqlen: Seqlen, @@ -320,30 +322,37 @@ def forward( value, (0, self.head_pad_size - self.value_head_size), value=0 ) - kv_cache.store(key=key, value=value, slots=slots) + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else key, - kv_cache.value if PREFILL_IN_KV_CACHE else value, - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) # Remove padding. diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py index 03b9b2a0201..ebf1b80eb03 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gemma2_modeling.py @@ -39,7 +39,7 @@ TensorParallelMultiAdapterLinear, TensorParallelAdapterRowLinear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, @@ -207,6 +207,7 @@ def __init__( ], process_group=weights.process_group, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") o_proj = TensorParallelRowLinear.load( config, @@ -252,19 +253,25 @@ def forward( self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - kv_cache.store(key=kv[:, 0], value=kv[:, 1], slots=slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, - causal=self.causal, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, window_size_left=self.window_size, softcap=self.softcap, ) @@ -272,14 +279,14 @@ def forward( else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, softcap=self.softcap, + kv_scales=self.kv_scales, ) return self.o_proj( diff --git a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py index f3c469012d5..ad3be80e51d 100644 --- 
a/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gemma_modeling.py @@ -29,7 +29,6 @@ paged_attention, attention, Seqlen, - PREFILL_IN_KV_CACHE, ) from text_generation_server.layers import ( TensorParallelRowLinear, @@ -38,6 +37,7 @@ SpeculativeHead, get_linear, ) +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, @@ -186,6 +186,7 @@ def __init__(self, prefix: str, config, weights, causal: bool): ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = TensorParallelRowLinear.load( config, @@ -223,31 +224,38 @@ def forward( self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - kv_cache.store(key=kv[:, 0], value=kv[:, 1], slots=slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, causal=self.causal, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py index 94a8898d0a8..906b34c12b2 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gpt2_modeling.py @@ -24,7 +24,6 @@ from torch import nn from transformers.activations import ACT2FN from typing import Optional, List, Tuple -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE from text_generation_server.layers.attention import ( paged_attention, attention, @@ -37,6 +36,7 @@ SpeculativeHead, get_linear, ) +from text_generation_server.layers.attention.kv_cache import get_kv_scales def load_qkv(config, prefix: str, weights, head_size, num_heads): @@ -194,6 +194,7 @@ def __init__( head_size=self.head_size, num_heads=self.num_heads, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = load_row( config, @@ -223,30 +224,37 @@ def forward( key = key.view(-1, self.num_heads, self.head_size) value = value.view(-1, self.num_heads, self.head_size) - kv_cache.store(key=key, value=value, slots=slots) + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else key, - kv_cache.value if PREFILL_IN_KV_CACHE else value, - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: 
attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py b/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py index f0a1270e0e9..692f8ca31be 100644 --- a/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_gptj_modeling.py @@ -24,6 +24,7 @@ from torch import nn from transformers.activations import ACT2FN from typing import Optional, List, Tuple +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.layers.attention import ( paged_attention, @@ -37,7 +38,6 @@ SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE from text_generation_server.layers.rotary import ( PositionRotaryEmbedding, ) @@ -139,6 +139,7 @@ def __init__( prefix=prefix, weights=weights, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = load_row( config, @@ -185,30 +186,37 @@ def forward( else: self.rotary_emb(query, key, cos, sin) - kv_cache.store(key=key, value=value, slots=slots) + kv_cache.store( + key=key, + value=value, + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else key, - kv_cache.value if PREFILL_IN_KV_CACHE else value, - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key, + value=value, + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py index fbe45d79873..b26dd484942 100644 --- a/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py @@ -27,7 +27,10 @@ from torch import nn from transformers.activations import ACT2FN -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE, KVCache +from text_generation_server.layers.attention import ( + KVCache, + get_kv_scales, +) from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.layers.attention import ( @@ -156,7 +159,10 @@ def __init__( device=weights.device, ) - self.softmax_scale = self.head_size**-0.5 + # `config.attention_multiplier` is used in Granite + self.softmax_scale = getattr( + config, "attention_multiplier", self.head_size**-0.5 + ) if self.num_heads % weights.process_group.size() != 0: raise ValueError( @@ -176,11 +182,13 @@ def __init__( self.query_key_value = load_attention(config, prefix, weights, index) self.index = index + self.kv_scales = get_kv_scales(weights, 
f"{prefix}") + o_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.o_proj", weights=weights, - bias=False, + bias=getattr(config, "attention_bias", False), ) self.o_proj = TensorParallelAdapterRowLinear.load( @@ -221,30 +229,37 @@ def forward( self.rotary_emb(query, torch.select(kv, dim=1, index=0), cos, sin) - kv_cache.store(key=kv[:, 0], value=kv[:, 1], slots=slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_scales=self.kv_scales, + kv_cache=kv_cache, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj( @@ -436,6 +451,11 @@ def __init__(self, index, prefix, config, weights): eps=config.rms_norm_eps, ) + # Used in Granite + # This could eventually be baked into the weights like we do for the embeddings/lm_head + # but this would mean modifying the lora code + self.residual_multiplier = getattr(config, "residual_multiplier", None) + def forward( self, hidden_states, @@ -466,13 +486,16 @@ def forward( max_s, adapter_data, ) + if self.residual_multiplier is not None: + attn_output *= self.residual_multiplier - # faster post attention rms norm normed_attn_res_output, attn_res = self.post_attention_layernorm( attn_output, res ) mlp_output = self.dense(normed_attn_res_output, adapter_data) + if self.residual_multiplier is not None: + mlp_output *= self.residual_multiplier return mlp_output, attn_res @@ -624,6 +647,11 @@ def __init__(self, prefix: str, config, weights): else: suffix = "lm_head" + # Used in Granite + embedding_multiplier = getattr(config, "embedding_multiplier", None) + if embedding_multiplier is not None: + self.embed_tokens.weight.data *= embedding_multiplier + with no_fp8(weights): self.lm_head = SpeculativeHead.load( config, @@ -631,6 +659,16 @@ def __init__(self, prefix: str, config, weights): weights=weights, ) + # Used in Granite + self.logits_scaling = getattr(config, "logits_scaling", None) + if self.logits_scaling is not None and self.lm_head.head is not None: + try: + # Scale the weights directly + self.lm_head.head.linear.weight.data /= self.logits_scaling + self.logits_scaled = True + except Exception: + self.logits_scaled = False + def forward( self, input_ids: torch.Tensor, @@ -664,4 +702,11 @@ def forward( if lm_head_indices is not None: hidden_states = hidden_states[lm_head_indices] logits, speculative_logits = self.lm_head(hidden_states) + + # Used in Granite + if self.logits_scaling is not None and not self.logits_scaled: + logits /= self.logits_scaling + if speculative_logits is not None: + speculative_logits /= self.logits_scaling + return logits, speculative_logits diff --git a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py index 8974035eafc..c66c732f21d 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py @@ 
-26,6 +26,7 @@ from transformers.configuration_utils import PretrainedConfig from typing import Optional, List, Tuple +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.layers.attention import ( paged_attention, @@ -40,7 +41,6 @@ TensorParallelMultiAdapterLinear, TensorParallelAdapterRowLinear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, @@ -159,6 +159,7 @@ def __init__(self, prefix: str, config, weights, layer_id): ], process_group=weights.process_group, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") o_proj = TensorParallelRowLinear.load( config, @@ -209,31 +210,38 @@ def forward( else: kv_to_cache = kv - kv_cache.store(key=kv_to_cache[:, 0], value=kv_to_cache[:, 1], slots=slots) + kv_cache.store( + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, window_size_left=self.max_past, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj( diff --git a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py index e7bc83208b0..a45dd1e615e 100644 --- a/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_mixtral_modeling.py @@ -38,7 +38,7 @@ attention, paged_attention, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import FastRMSNorm from text_generation_server.layers.moe import DenseMoELayer, MoELayer, SparseMoELayer from text_generation_server.layers.rotary import PositionRotaryEmbedding @@ -214,6 +214,7 @@ def __init__( ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = TensorParallelRowLinear.load( config, @@ -257,31 +258,38 @@ def forward( else: kv_to_cache = kv - kv_cache.store(key=kv_to_cache[:, 0], value=kv_to_cache[:, 1], slots=slots) + kv_cache.store( + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, 
window_size_left=self.max_past, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py index bcbea442683..2301b63cff2 100644 --- a/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_neox_modeling.py @@ -38,7 +38,7 @@ SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import ( FastLayerNorm, ) @@ -131,6 +131,7 @@ def __init__(self, config, prefix, weights): head_size=self.head_size, hidden_size=self.hidden_size, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.dense = load_row( config, prefix=f"{prefix}.dense", weights=weights, bias=True ) @@ -164,30 +165,37 @@ def forward( qkv[:, 0] = torch.cat((query_rot, query_pass), dim=-1) qkv[:, 1] = torch.cat((key_rot, key_pass), dim=-1) - kv_cache.store(key=qkv[:, 1], value=qkv[:, 2], slots=slots) + kv_cache.store( + key=qkv[:, 1], + value=qkv[:, 2], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - qkv[:, 0], - kv_cache.key if PREFILL_IN_KV_CACHE else qkv[:, 1], - kv_cache.value if PREFILL_IN_KV_CACHE else qkv[:, 2], - seqlen, - block_tables, - self.softmax_scale, + query=qkv[:, 0], + key=qkv[:, 1], + value=qkv[:, 2], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( qkv[:, 0], - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.dense(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py index cb7b6ee2e18..7382a7cb9fc 100644 --- a/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_phi_modeling.py @@ -18,7 +18,7 @@ SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import ( FastLayerNorm, ) @@ -138,6 +138,7 @@ def __init__( ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") # in llama the dense layer is called "o_proj" and has bias=False self.dense = TensorParallelRowLinear.load( @@ -187,29 +188,36 @@ def forward( ) # Reshape key and value and cache - kv_cache.store(key=kv[:, 0], value=kv[:, 1], slots=slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, 1], - 
seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_scales=self.kv_scales, + kv_cache=kv_cache, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.dense(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py index 8185885fe7d..ab2a177db6a 100644 --- a/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_qwen2_modeling.py @@ -16,7 +16,7 @@ TensorParallelEmbedding, SpeculativeHead, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.layernorm import ( FastRMSNorm, @@ -85,6 +85,8 @@ def __init__( self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") + self.o_proj = TensorParallelRowLinear.load( config, prefix=f"{prefix}.o_proj", @@ -127,31 +129,38 @@ def forward( else: kv_to_cache = kv - kv_cache.store(key=kv_to_cache[:, 0], value=kv_to_cache[:, 1], slots=slots) + kv_cache.store( + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, window_size_left=self.max_past, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py index dac8ecf9507..2dcd1bf3033 100644 --- a/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_rw_modeling.py @@ -12,7 +12,7 @@ TensorParallelRowLinear, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import FastLayerNorm from text_generation_server.layers.rotary import PositionRotaryEmbedding from text_generation_server.layers.attention import ( @@ -159,6 +159,7 @@ def __init__( weights=weights, bias=config.bias, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.dense = load_row( config, prefix=f"{prefix}.dense", weights=weights, bias=config.bias ) @@ -199,30 +200,37 @@ def forward( # Inplace rotary self.rotary_emb(query, 
torch.select(kv, dim=1, index=0), cos, sin) - kv_cache.store(key=kv[:, 0], value=kv[:, 1], slots=slots) + kv_cache.store( + key=kv[:, 0], + value=kv[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, 0], + value=kv[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.dense(attn_output.view(-1, self.num_heads * self.head_size)) @@ -277,6 +285,7 @@ def __init__( weights=weights, bias=config.bias, ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.dense = load_row( config, prefix=f"{prefix}.dense", weights=weights, bias=config.bias ) @@ -312,31 +321,36 @@ def forward( self.rotary_emb(query, torch.select(kv, dim=2, index=0), cos, sin) kv_cache.store( - key=kv[:, :, 0].contiguous(), value=kv[:, :, 1].contiguous(), slots=slots + key=kv[:, :, 0].contiguous(), + value=kv[:, :, 1].contiguous(), + slots=slots, + kv_scales=self.kv_scales, ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv[:, :, 0].contiguous(), - kv_cache.value if PREFILL_IN_KV_CACHE else kv[:, :, 1].contiguous(), - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv[:, :, 0], + value=kv[:, :, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.dense( diff --git a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py index 5972d436bd3..ed053eb661c 100644 --- a/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_santacoder_modeling.py @@ -17,7 +17,7 @@ TensorParallelEmbedding, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.gptq import GPTQWeightsLoader from text_generation_server.layers.layernorm import ( FastLayerNorm, @@ -258,6 +258,7 @@ def __init__(self, prefix, config, weights): self.c_proj = load_row( config, prefix=f"{prefix}.c_proj", weights=weights, bias=True ) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.kv_head_mapping = torch.zeros( self.num_heads, dtype=torch.int32, device=weights.device ) @@ -283,30 +284,37 @@ def forward( query = query.view(-1, self.num_heads, self.head_size) key_value = key_value.view(-1, 2, 1, self.head_size) - kv_cache.store(key=key_value[:, 0], value=key_value[:, 1], slots=slots) + kv_cache.store( + key=key_value[:, 0], + value=key_value[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention 
attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else key_value[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else key_value[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=key_value[:, 0], + value=key_value[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.c_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py index 037238b8a20..c793982d828 100644 --- a/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py +++ b/server/text_generation_server/models/custom_modeling/flash_starcoder2_modeling.py @@ -38,7 +38,7 @@ SpeculativeHead, get_linear, ) -from text_generation_server.layers.attention import PREFILL_IN_KV_CACHE +from text_generation_server.layers.attention.kv_cache import get_kv_scales from text_generation_server.layers.layernorm import ( FastLayerNorm, FastRMSNorm, @@ -189,6 +189,7 @@ def __init__( ) self.query_key_value = load_attention(config, prefix, weights) + self.kv_scales = get_kv_scales(weights, f"{prefix}") self.o_proj = TensorParallelRowLinear.load( config, @@ -232,31 +233,38 @@ def forward( else: kv_to_cache = kv - kv_cache.store(key=kv_to_cache[:, 0], value=kv_to_cache[:, 1], slots=slots) + kv_cache.store( + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + slots=slots, + kv_scales=self.kv_scales, + ) # Prefill if cu_seqlen_prefill is not None: # flash attention attn_output = attention( - query, - kv_cache.key if PREFILL_IN_KV_CACHE else kv_to_cache[:, 0], - kv_cache.value if PREFILL_IN_KV_CACHE else kv_to_cache[:, 1], - seqlen, - block_tables, - self.softmax_scale, + query=query, + key=kv_to_cache[:, 0], + value=kv_to_cache[:, 1], + kv_cache=kv_cache, + kv_scales=self.kv_scales, + seqlen=seqlen, + block_tables=block_tables, + softmax_scale=self.softmax_scale, window_size_left=self.max_past, ) # Decode else: attn_output = paged_attention( query, - kv_cache.key, - kv_cache.value, + kv_cache, self.kv_head_mapping, self.softmax_scale, block_tables, seqlen, max_s, + kv_scales=self.kv_scales, ) return self.o_proj(attn_output.view(-1, self.num_heads * self.head_size)) diff --git a/server/text_generation_server/models/custom_modeling/mllama.py b/server/text_generation_server/models/custom_modeling/mllama.py index 6e091a74fe8..be0a4b5d7c3 100644 --- a/server/text_generation_server/models/custom_modeling/mllama.py +++ b/server/text_generation_server/models/custom_modeling/mllama.py @@ -493,9 +493,14 @@ def forward( aspect_ratio_ids: torch.Tensor, attention_mask: torch.Tensor, ) -> torch.Tensor: - batch_size, num_concurrent_media, num_tiles, num_channels, height, width = ( - pixel_values.shape - ) + ( + batch_size, + num_concurrent_media, + num_tiles, + num_channels, + height, + width, + ) = pixel_values.shape pixel_values = pixel_values.reshape( batch_size * num_concurrent_media * num_tiles, num_channels, height, width diff --git a/server/text_generation_server/models/flash_causal_lm.py b/server/text_generation_server/models/flash_causal_lm.py index 33fe30a87cd..b931671cc0c 100644 --- 
a/server/text_generation_server/models/flash_causal_lm.py +++ b/server/text_generation_server/models/flash_causal_lm.py @@ -16,7 +16,17 @@ AutoTokenizer, GenerationConfig, ) -from typing import Any, ContextManager, Iterable, Optional, Tuple, List, Type, Dict +from typing import ( + Any, + ContextManager, + Iterable, + Optional, + Tuple, + List, + Type, + Dict, + Union, +) from text_generation_server.adapters import AdapterBatchData, AdapterBatchMetadata from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE @@ -24,6 +34,10 @@ from text_generation_server.utils.import_utils import SYSTEM from text_generation_server.models import Model from text_generation_server.utils.log import log_master +from text_generation_server.utils.prefill_chunking import ( + get_support_chunking, + get_max_prefill_tokens, +) from text_generation_server.utils.tokens import batch_top_tokens from text_generation_server.utils.speculate import get_speculate from text_generation_server.utils import ( @@ -60,7 +74,6 @@ tracer = trace.get_tracer(__name__) - # Will be set in init SLIDING_WINDOW: Optional[int] = None @@ -117,45 +130,48 @@ class FlashCausalLMBatch(Batch): requests_idx_mapping: Dict[int, int] # Decoder values - input_ids: torch.Tensor - position_ids: torch.Tensor + # Can be a list for easy filtering + # If `input_ids` is a list, it needs to be materialized to a tensor first + input_ids: Union[torch.Tensor, List[List[int]]] + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + position_ids: Optional[torch.Tensor] speculative_ids: Optional[torch.Tensor] - # Flash Attention values - - # tensor of length b containing the cumulative sequence lengths of the sequences in the batch, only used in prefill - cu_seqlen_prefill: Optional[torch.Tensor] - # Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers - # as we only keep SLIDING_WINDOW values instead of the whole tensor - prefill_cache_indices: Optional[torch.Tensor] - - # Paged Attention values - # Set when creating the batch - # CPU tensor of length b indicating the start of each sequence in slots - start_slots: torch.Tensor # tensor of indices of the currently used slots, length = \sum_{i=0}^{b} s_i in prefill, length = b in decode - slot_indices: torch.Tensor + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + slot_indices: Optional[torch.Tensor] # list of length b of list of length s_i // block_size block_tables: List[List[int]] # tensor of size [b, max_total_seqlen // block_size] holding the paged attention block tables for all sequences block_tables_tensor: torch.Tensor # tensor of length \sum_{i=0}^{b} max_s_i holding the paged attention slots for all sequences - slots: torch.Tensor - # size [b], containing the number of blocks that can be retrieved from the cache - prefix_lens: List[int] - prefix_lens_tensor: torch.Tensor + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + slots: Optional[torch.Tensor] - max_seqlen: int + max_input_length: int + max_current_length: int + + # Whether this batch contains at least one request that is prefilling + prefilling: bool + # Whether each request is prefilling + prefilling_mask: List[bool] # Prefill metadata tensors to efficiently compute logprobs + # tensor of length b containing the cumulative sequence lengths of the sequences in the batch, only used in prefill + cu_seqlen_prefill: Optional[torch.Tensor] + 
# Prefill cache indices is used to slice into the kv tensor before caching it into the paged attention buffers + # as we only keep SLIDING_WINDOW values instead of the whole tensor + prefill_cache_indices: Optional[torch.Tensor] + # Will be set by `generate_token` and reset after each prefill forward prefill_head_indices: Optional[torch.Tensor] + # Will be set by `generate_token` and reset after each prefill forward prefill_next_token_indices: Optional[torch.tensor] + # Will be set by `generate_token` and reset after each prefill forward prefill_cu_outlens: Optional[List[int]] - - # Prefixes - prefix_ids: List[List[int]] + # Will be set by `generate_token` and reset after each prefill forward + prefill_logprob_tokens: List[Optional[Tokens]] # All tokens all_input_ids: List[List[int]] @@ -163,7 +179,14 @@ class FlashCausalLMBatch(Batch): # Lengths of all generations present in the batch input_lengths: List[int] - input_lengths_tensor: torch.Tensor + # size [b], containing the number of blocks that can be retrieved from the cache + cache_lengths: List[int] + prompt_lengths: List[int] + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + input_lengths_tensor: Optional[torch.Tensor] + cache_lengths_tensor: Optional[torch.Tensor] + prompt_lengths_tensor: torch.Tensor + prefix_offsets: List[Optional[int]] read_offsets: List[Optional[int]] @@ -174,7 +197,8 @@ class FlashCausalLMBatch(Batch): top_n_tokens_tensor: torch.Tensor # Adapter metadata for each request - adapter_meta: AdapterBatchMetadata + # Will be set by `generate_token` and reset after each prefill forward before staying set in decode + adapter_meta: Optional[AdapterBatchMetadata] # Number of blocks in this batch num_blocks: int @@ -187,6 +211,11 @@ def to_pb(self) -> generate_pb2.CachedBatch: request_ids=[r.id for r in self.requests], size=len(self), max_tokens=self.num_blocks * BLOCK_SIZE, + current_tokens=( + sum([len(i) for i in self.input_ids]) + if isinstance(self.input_ids, list) + else len(self.input_ids) + ), ) @classmethod @@ -218,46 +247,28 @@ def from_tokenized( dtype: torch.dtype, device: torch.device, ) -> "FlashCausalLMBatch": - sliding_window = get_sliding_windows() - position_ids = [] - cu_seqlen_prefill = [0] - start_slots = [] - slot_indices = [] - prefill_cache_indices = [] + speculate = get_speculate() + cache_lengths = [] input_lengths = [] + prompt_lengths = [] prefix_offsets = [] read_offsets = [] all_input_ids = [] - prefix_ids = [] + all_postfix_ids = [] requests_idx_mapping = {} - all_prefill_logprobs = True - no_prefill_logprobs = True - prefill_head_indices = [] - prefill_next_token_indices = [] - prefill_cu_outlens = [0] - next_token_chooser_parameters = [] stopping_criterias = [] top_n_tokens = [] - adapter_indices_list = [] - adapter_set = set() - - # Cumulative length - cumulative_length = 0 - cumulative_slot_tokens = 0 - prefill_out_cumulative_length = 0 - num_blocks = 0 - max_seqlen = 0 + max_input_length = 0 + max_current_length = 0 max_length = 0 max_blocks = 0 block_tables = [] - slots = [] - prefix_lens = [] # Parse batch for i, (r, tokenized_input) in enumerate( @@ -266,38 +277,47 @@ def from_tokenized( # request id -> idx in list mapping requests_idx_mapping[r.id] = i - orig_input_length = len(tokenized_input) + prompt_length = len(tokenized_input) + prompt_lengths.append(prompt_length) + + cache_length = r.cache_len - prefix_len = r.prefix_len assert ( - prefix_len <= orig_input_length - ), f"Prefix {prefix_len} vs input {orig_input_length}" - 
if prefix_len == orig_input_length: - assert prefix_len > 0 - prefix_len -= 1 - - # Commented as it's costly. - # log_master(logger.debug, "Tokenized input ids {tokenized_input}") - prefix_ids.append(tokenized_input[:prefix_len]) - tokenized_input = tokenized_input[prefix_len:] - - input_length = len(tokenized_input) + cache_length <= prompt_length ), f"Prefix {cache_length} vs input {prompt_length}" + if cache_length == prompt_length: + assert False, "unreachable" + + # `chunk_len` is an optional field in the protobuf + # It is only set if the model supports chunking + if r.HasField("chunk_len"): + input_length = r.chunk_len + + if cache_length + input_length < prompt_length: + # FIXME: speculate is not supported for context chunking at the moment + assert speculate == 0 + assert get_support_chunking() + assert input_length > 0 + + postfix_ids = tokenized_input[ cache_length : cache_length + input_length ] + assert ( + len(postfix_ids) == input_length + ), "Rust and Python tokenizers are not aligned" + else: + # Use all the remaining ids + postfix_ids = tokenized_input[cache_length:] + input_length = len(postfix_ids) + input_lengths.append(input_length) - prefix_offsets.append(input_length - 5) - read_offsets.append(input_length) + prefix_offsets.append(prompt_length - 5) + read_offsets.append(prompt_length) + all_postfix_ids.append(postfix_ids) all_input_ids.append(tokenized_input) - # Position ids - request_position_ids = torch.arange( - prefix_len, orig_input_length, dtype=torch.int32 - ) - position_ids.append(request_position_ids) - - # Add cumulative lengths of all previous inputs - cu_seqlen_prefill.append(cumulative_length + input_length) - next_token_chooser_parameters.append(r.parameters) stopping_criteria = StoppingCriteria.from_pb( @@ -307,22 +327,13 @@ def from_tokenized( stopping_criterias.append(stopping_criteria) top_n_tokens.append(r.top_n_tokens) - ADAPTER_TO_INDEX = get_adapter_to_index() - adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0) - adapter_indices_list.append(torch.full((input_length,), adapter_index)) - adapter_set.add(adapter_index) - # Paged attention # Remove one as the first token does not have a past speculative_length = get_speculate() speculative_length = 0 if speculative_length is None else speculative_length # Tokens that need to be mapped to blocks. - block_tokens = orig_input_length + max_new_tokens - 1 + speculative_length - - # Tokens that need to be mapped to slots. We don't need slots for the - # cached prefix (if present).
- slot_tokens = input_length + max_new_tokens - 1 + speculative_length + block_tokens = prompt_length + max_new_tokens - 1 + speculative_length # blocks and slots can be empty (for example in warmup) if not r.blocks: @@ -330,77 +341,26 @@ def from_tokenized( request_blocks = [ b for b in range(num_blocks, num_blocks + needed_blocks) ] - request_slots = [ - s - for b in request_blocks - for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE) - ] else: request_blocks = r.blocks - request_slots = r.slots[ - prefix_len: #: orig_input_length + max_new_tokens + speculative_length - ] block_tables.append(request_blocks) - slots.extend(request_slots) - prefix_lens.append(prefix_len) + cache_lengths.append(cache_length) num_blocks += len(request_blocks) - start_slots.append(cumulative_slot_tokens) - - request_slot_indices = torch.arange( - cumulative_slot_tokens, - cumulative_slot_tokens + input_length, - dtype=torch.int64, - ) - slot_indices.append(request_slot_indices) - - # Create tensor to slice into the kv tensor in prefill - if sliding_window is not None: - request_prefill_cache_indices = torch.arange( - cumulative_length + max(0, input_length - sliding_window), - cumulative_length + input_length, - dtype=torch.int64, - ) - prefill_cache_indices.append(request_prefill_cache_indices) - - all_prefill_logprobs = all_prefill_logprobs and r.prefill_logprobs - no_prefill_logprobs = no_prefill_logprobs and not r.prefill_logprobs - - if r.prefill_logprobs: - prefill_head_indices.append(request_position_ids + cumulative_length) - prefill_next_token_indices.append( - prefill_out_cumulative_length + input_length - 1 - ) - prefill_cu_outlens.append(prefill_out_cumulative_length + input_length) - prefill_out_cumulative_length += input_length - else: - prefill_head_indices.append( - torch.tensor( - [cumulative_length + input_length - 1], dtype=torch.int32 - ) - ) - prefill_next_token_indices.append(prefill_out_cumulative_length) - prefill_cu_outlens.append(prefill_out_cumulative_length + 1) - prefill_out_cumulative_length += 1 # Update - cumulative_length += input_length - cumulative_slot_tokens += slot_tokens - max_seqlen = max(max_seqlen, input_length) max_blocks = max(max_blocks, len(request_blocks)) + max_input_length = max(max_input_length, input_length) + max_current_length = max(max_current_length, cache_length + input_length) max_length = max( - max_length, input_length + max_new_tokens + speculative_length + max_length, + prompt_length + max_new_tokens + speculative_length, ) - adapter_indices = torch.cat(adapter_indices_list).to( - dtype=torch.int64, device=device - ) - next_token_chooser = HeterogeneousNextTokenChooser.from_pb( next_token_chooser_parameters, dtype, device, tokenizer ) - start_slots = torch.tensor(start_slots, dtype=torch.int64) # Padded all_input_ids_tensor all_input_ids_tensor = np.zeros( @@ -414,103 +374,59 @@ def from_tokenized( all_input_ids_tensor, dtype=torch.int64, device=device ) - if len(pb.requests) > 1: - input_ids = np.concatenate(all_input_ids, dtype=np.int64) - position_ids = torch.cat(position_ids) - slot_indices = torch.cat(slot_indices) - if sliding_window is not None: - prefill_cache_indices = torch.cat(prefill_cache_indices) - else: - input_ids = all_input_ids[0] - position_ids = position_ids[0] - slot_indices = slot_indices[0] - if sliding_window is not None: - prefill_cache_indices = prefill_cache_indices[0] - - cu_seqlen_prefill = torch.tensor( - cu_seqlen_prefill, device=device, dtype=torch.int32 - ) - position_ids = position_ids.to(device) - slot_indices = 
slot_indices.to(device) - prefill_cache_indices = ( - prefill_cache_indices.to(device) if sliding_window is not None else None - ) - input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) - input_lengths_tensor = torch.tensor( - input_lengths, dtype=torch.int32, device=device - ) - - adapter_segments, adapter_segment_indices = find_segments(adapter_indices) - adapter_segments = torch.tensor( - adapter_segments, dtype=torch.int32, device=device - ) - - if all_prefill_logprobs: - prefill_head_indices = None - prefill_next_token_indices = cu_seqlen_prefill[1:] - 1 - elif no_prefill_logprobs: - prefill_head_indices = cu_seqlen_prefill[1:] - 1 - prefill_next_token_indices = None - else: - prefill_head_indices = torch.tensor( - torch.cat(prefill_head_indices), dtype=torch.int64, device=device - ) - prefill_next_token_indices = torch.tensor( - prefill_next_token_indices, dtype=torch.int64, device=device - ) top_n_tokens_tensor = torch.tensor( top_n_tokens, device=device, dtype=torch.int64 ) - slots = torch.tensor(slots, dtype=torch.int64, device=device) - block_tables_tensor = torch.zeros( (len(block_tables), max_blocks), dtype=torch.int32, device="cpu" ) for i, request_blocks in enumerate(block_tables): block_tables_tensor[i, : len(request_blocks)] = torch.tensor(request_blocks) block_tables_tensor = block_tables_tensor.to(device) - prefix_lens_tensor = torch.tensor(prefix_lens, dtype=torch.int32, device=device) + prompt_lengths_tensor = torch.tensor( + prompt_lengths, dtype=torch.int32, device=device + ) return cls( batch_id=pb.id, requests=pb.requests, requests_idx_mapping=requests_idx_mapping, - input_ids=input_ids, - position_ids=position_ids, - cu_seqlen_prefill=cu_seqlen_prefill, - prefill_cache_indices=prefill_cache_indices, - start_slots=start_slots, - slot_indices=slot_indices, + input_ids=all_postfix_ids, block_tables=block_tables, block_tables_tensor=block_tables_tensor, - slots=slots, - prefix_lens=prefix_lens, - prefix_lens_tensor=prefix_lens_tensor, - max_seqlen=max_seqlen, - prefill_head_indices=prefill_head_indices, - prefill_next_token_indices=prefill_next_token_indices, - prefill_cu_outlens=prefill_cu_outlens, + cache_lengths=cache_lengths, + max_input_length=max_input_length, + max_current_length=max_current_length, + prefilling=True, + prefilling_mask=[True] * len(pb.requests), + prefill_logprob_tokens=[None] * len(pb.requests), input_lengths=input_lengths, - input_lengths_tensor=input_lengths_tensor, + prompt_lengths=prompt_lengths, prefix_offsets=prefix_offsets, read_offsets=read_offsets, all_input_ids=all_input_ids, all_input_ids_tensor=all_input_ids_tensor, - prefix_ids=prefix_ids, next_token_chooser=next_token_chooser, stopping_criterias=stopping_criterias, top_n_tokens=top_n_tokens, top_n_tokens_tensor=top_n_tokens_tensor, num_blocks=num_blocks, max_blocks=max_blocks, - adapter_meta=AdapterBatchMetadata( - adapter_indices=adapter_indices, - adapter_set=adapter_set, - adapter_segments=adapter_segments, - segment_indices=adapter_segment_indices, - ), speculative_ids=None, + prompt_lengths_tensor=prompt_lengths_tensor, + # These values will be set by `FlashCausalLMBatch.prepare_for_prefill` + position_ids=None, + cu_seqlen_prefill=None, + prefill_cache_indices=None, + slot_indices=None, + slots=None, + prefill_head_indices=None, + prefill_next_token_indices=None, + prefill_cu_outlens=None, + cache_lengths_tensor=None, + input_lengths_tensor=None, + adapter_meta=None, ) @classmethod @@ -533,7 +449,7 @@ def filter(self, request_ids: List[int]) -> 
"FlashCausalLMBatch": if len(request_ids) == len(self): return self - device = self.input_ids.device + device = self.block_tables_tensor.device # New values after filtering requests_idx_mapping = {} @@ -548,19 +464,23 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": # Create on CPU to only move to GPU once instead of at every copy slot_indices = torch.empty(len(request_ids), dtype=torch.int64) - max_seqlen = 0 + max_input_length = 0 + max_current_length = 0 requests = [] - start_slots = [] block_tables = [] all_input_ids = [] - prefix_ids = [] + input_ids = [] + prompt_lengths = [] input_lengths = [] - prefix_lens = [] + cache_lengths = [] prefix_offsets = [] read_offsets = [] + prefilling_mask = [] + prefill_logprob_tokens = [] + stopping_criterias = [] top_n_tokens = [] adapter_set = set() @@ -577,16 +497,23 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": requests.append(self.requests[idx]) + # Prefilling + request_prefilling = self.prefilling_mask[idx] + prefilling_mask.append(request_prefilling) + # Get length request_input_length = self.input_lengths[idx] - prefix_len = self.prefix_lens[idx] - max_seqlen = max(max_seqlen, request_input_length) + request_cache_length = self.cache_lengths[idx] + max_input_length = max(max_input_length, request_input_length) + max_current_length = max( + max_current_length, request_cache_length + request_input_length + ) all_input_ids.append(self.all_input_ids[idx]) - prefix_ids.append(self.prefix_ids[idx]) + prompt_lengths.append(self.prompt_lengths[idx]) input_lengths.append(request_input_length) - prefix_lens.append(prefix_len) + cache_lengths.append(request_cache_length) prefix_offsets.append(self.prefix_offsets[idx]) read_offsets.append(self.read_offsets[idx]) @@ -594,60 +521,79 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": stopping_criterias.append(stopping_criteria) top_n_tokens.append(self.top_n_tokens[idx]) + prefill_logprob_tokens.append(self.prefill_logprob_tokens[idx]) ADAPTER_TO_INDEX = get_adapter_to_index() adapter_index = ADAPTER_TO_INDEX.get(self.requests[idx].adapter_id, 0) adapter_set.add(adapter_index) - remaining_tokens = ( - stopping_criteria.max_new_tokens - stopping_criteria.current_tokens - ) - request_block_table = self.block_tables[idx] num_blocks += len(request_block_table) block_tables.append(request_block_table) - start_slots.append(cumulative_max_length) - # Copy to tensor (CPU) - slot_indices[i] = cumulative_max_length + request_input_length - 1 + # Input ids if the request was part of a prefilling batch + # If the batch was decoding we can index into the tensor directly later + if self.prefilling: + input_ids.append(self.input_ids[idx]) + else: + # Copy to tensor (CPU) + slot_indices[i] = cumulative_max_length + + remaining_tokens = ( + stopping_criteria.max_new_tokens - stopping_criteria.current_tokens + ) - # Set slice - slot_filtering_indices[ - self.start_slots[idx] : self.start_slots[idx] - + request_input_length - + remaining_tokens - - 1 - ] = True + # Set slice + slot_filtering_indices[ + self.slot_indices[idx] : self.slot_indices[idx] + + request_input_length + + remaining_tokens + - 1 + ] = True - cumulative_max_length += request_input_length + remaining_tokens - 1 + cumulative_max_length += request_input_length + remaining_tokens - 1 max_blocks = max(max_blocks, len(request_block_table)) - # Index into tensors - input_ids = self.input_ids[indices] - position_ids = self.position_ids[indices] - adapter_indices = 
self.adapter_meta.adapter_indices[indices] all_input_ids_tensor = self.all_input_ids_tensor[indices] block_tables_tensor = self.block_tables_tensor[indices] - input_lengths_tensor = self.input_lengths_tensor[indices] - slots = self.slots[slot_filtering_indices] - prefix_lens_tensor = self.prefix_lens_tensor[indices] next_token_chooser = self.next_token_chooser.filter(indices) top_n_tokens_tensor = self.top_n_tokens_tensor[indices] speculative_ids = ( self.speculative_ids[indices] if self.speculative_ids is not None else None ) - - start_slots = torch.tensor(start_slots, dtype=torch.int64) - - # Move to GPU now that we have the whole tensor - slot_indices = slot_indices.to(device) - - adapter_segments, adapter_segment_indices = find_segments(adapter_indices) - adapter_segments = torch.tensor( - adapter_segments, dtype=torch.int32, device=device - ) - # assert sum(len(b) for b in block_tables) == (block_tables_tensor != 0).sum() + prompt_lengths_tensor = self.prompt_lengths_tensor[indices] + + if self.prefilling: + # These values will be set by `FlashCausalLMBatch.prepare_for_prefill` + position_ids = None + slot_indices = None + slots = None + cache_lengths_tensor = None + input_lengths_tensor = None + adapter_meta = None + else: + # Index into tensors + input_ids = self.input_ids[indices] + position_ids = self.position_ids[indices] + adapter_indices = self.adapter_meta.adapter_indices[indices] + input_lengths_tensor = self.input_lengths_tensor[indices] + slots = self.slots[slot_filtering_indices] + cache_lengths_tensor = self.cache_lengths_tensor[indices] + + # Move to GPU now that we have the whole tensor + slot_indices = slot_indices.to(device) + + adapter_segments, adapter_segment_indices = find_segments(adapter_indices) + adapter_segments = torch.tensor( + adapter_segments, dtype=torch.int32, device=device + ) + adapter_meta = AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_segment_indices, + ) return type(self)( batch_id=self.batch_id, @@ -657,24 +603,28 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": position_ids=position_ids, cu_seqlen_prefill=None, prefill_cache_indices=None, - start_slots=start_slots, slot_indices=slot_indices, block_tables=block_tables, block_tables_tensor=block_tables_tensor, slots=slots, - max_seqlen=max_seqlen, + max_input_length=max_input_length, + max_current_length=max_current_length, + prefilling=self.prefilling, + prefilling_mask=prefilling_mask, prefill_head_indices=None, prefill_next_token_indices=None, prefill_cu_outlens=None, + prefill_logprob_tokens=prefill_logprob_tokens, + prompt_lengths=prompt_lengths, + prompt_lengths_tensor=prompt_lengths_tensor, input_lengths=input_lengths, input_lengths_tensor=input_lengths_tensor, - prefix_lens=prefix_lens, - prefix_lens_tensor=prefix_lens_tensor, + cache_lengths=cache_lengths, + cache_lengths_tensor=cache_lengths_tensor, prefix_offsets=prefix_offsets, read_offsets=read_offsets, all_input_ids=all_input_ids, all_input_ids_tensor=all_input_ids_tensor, - prefix_ids=prefix_ids, next_token_chooser=next_token_chooser, stopping_criterias=stopping_criterias, top_n_tokens=top_n_tokens, @@ -682,12 +632,7 @@ def filter(self, request_ids: List[int]) -> "FlashCausalLMBatch": num_blocks=num_blocks, max_blocks=max_blocks, speculative_ids=speculative_ids, - adapter_meta=AdapterBatchMetadata( - adapter_indices=adapter_indices, - adapter_set=adapter_set, - adapter_segments=adapter_segments, - 
segment_indices=adapter_segment_indices, - ), + adapter_meta=adapter_meta, ) @classmethod @@ -697,74 +642,98 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch requests = [] requests_idx_mapping = {} + prefilling = False num_blocks = 0 total_batch_size = 0 total_slots = 0 max_blocks = 0 max_length = 0 - max_seqlen = 0 + max_input_length = 0 + max_current_length = 0 for b in batches: total_batch_size += len(b) - total_slots += len(b.slots) + max_blocks = max(max_blocks, b.max_blocks) + # If `b` is prefilling and was just filtered, `b.slots` is None + # `total_slots` is not used if any of the batches is prefilling + total_slots += len(b.slots) if not b.prefilling else 0 num_blocks += b.num_blocks speculative_length = ( b.speculative_ids.shape[1] if b.speculative_ids is not None else 0 ) - max_blocks = max(max_blocks, b.max_blocks) - max_seqlen = max(max_seqlen, b.max_seqlen) + max_input_length = max(max_input_length, b.max_input_length) + max_current_length = max(max_current_length, b.max_current_length) max_length = max( max_length, max( - input_length + prompt_length + stopping_criteria.max_new_tokens + speculative_length - - stopping_criteria.current_tokens - for input_length, stopping_criteria in zip( - b.input_lengths, b.stopping_criterias + for prompt_length, stopping_criteria in zip( + b.prompt_lengths, b.stopping_criterias ) ), ) + prefilling = prefilling or b.prefilling + + if prefilling: + input_ids = [] + # These values will be set by `FlashCausalLMBatch.prepare_for_prefill` + position_ids = None + slots = None + slot_indices = None + cache_lengths_tensor = None + input_lengths_tensor = None + adapter_meta = None + adapter_segment_builder = None + else: + input_ids = batches[0].input_ids.new_empty(total_batch_size) + position_ids = batches[0].position_ids.new_empty(total_batch_size) + slots = batches[0].slots.new_empty(total_slots) + slot_indices = batches[0].slot_indices.new_empty(total_batch_size) + input_lengths_tensor = batches[0].input_lengths_tensor.new_empty( + total_batch_size + ) + cache_lengths_tensor = batches[0].cache_lengths_tensor.new_empty( + total_batch_size + ) + total_indices_size = sum( + b.adapter_meta.adapter_indices.shape[0] for b in batches + ) + adapter_indices = batches[0].adapter_meta.adapter_indices.new_empty( + total_indices_size + ) + adapter_segment_builder = SegmentConcatBuilder() + adapter_set = set() - input_ids = batches[0].input_ids.new_empty(total_batch_size) - position_ids = batches[0].position_ids.new_empty(total_batch_size) - slots = batches[0].slots.new_empty(total_slots) - slot_indices = batches[0].slot_indices.new_empty(total_batch_size) - input_lengths_tensor = batches[0].input_lengths_tensor.new_empty( + prompt_lengths_tensor = batches[0].prompt_lengths_tensor.new_empty( total_batch_size ) block_tables_tensor = batches[0].block_tables_tensor.new_zeros( (total_batch_size, max_blocks) ) - prefix_lens_tensor = batches[0].prefix_lens_tensor.new_empty(total_batch_size) all_input_ids_tensor = batches[0].all_input_ids_tensor.new_zeros( (total_batch_size, max_length) ) top_n_tokens_tensor = batches[0].top_n_tokens_tensor.new_zeros( total_batch_size, ) - total_indices_size = sum( - b.adapter_meta.adapter_indices.shape[0] for b in batches - ) - adapter_indices = batches[0].adapter_meta.adapter_indices.new_empty( - total_indices_size - ) - adapter_set = set() - adapter_segment_builder = SegmentConcatBuilder() - start_slots = [] block_tables = [] - prefix_lens = [] + cache_lengths = [] all_input_ids = [] - prefix_ids 
= [] + prompt_lengths = [] input_lengths = [] prefix_offsets = [] read_offsets = [] + prefill_logprob_tokens = [] + next_token_chooser_parameters = [] fsm_grammar_states = [] stopping_criterias = [] top_n_tokens = [] + prefilling_mask = [] # Cumulative length cumulative_batch_size = 0 @@ -783,32 +752,9 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch start_index = cumulative_batch_size end_index = cumulative_batch_size + len(batch) - slots_start_index = cumulative_slots - slots_end_index = cumulative_slots + len(batch.slots) # Copy tensors (GPU) - input_ids[start_index:end_index] = batch.input_ids - position_ids[start_index:end_index] = batch.position_ids - slot_indices[start_index:end_index] = batch.slot_indices + cumulative_slots - input_lengths_tensor[start_index:end_index] = batch.input_lengths_tensor top_n_tokens_tensor[start_index:end_index] = batch.top_n_tokens_tensor - slots[slots_start_index:slots_end_index] = batch.slots - - # Copy over adapter indices - adapter_start_index = cumulative_adapter_indices_size - adapter_end_index = ( - cumulative_adapter_indices_size - + batch.adapter_meta.adapter_indices.shape[0] - ) - adapter_indices[adapter_start_index:adapter_end_index] = ( - batch.adapter_meta.adapter_indices - ) - cumulative_adapter_indices_size = adapter_end_index - adapter_set.update(batch.adapter_meta.adapter_set) - adapter_segment_builder.concat( - batch.adapter_meta.adapter_segments, batch.adapter_meta.segment_indices - ) - all_input_ids_tensor[ start_index:end_index, : batch.all_input_ids_tensor.shape[1] ] = batch.all_input_ids_tensor[:, :max_length] @@ -816,20 +762,56 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch block_tables_tensor[ start_index:end_index, : batch.block_tables_tensor.shape[1] ] = batch.block_tables_tensor[:, :max_blocks] + prompt_lengths_tensor[start_index:end_index] = batch.prompt_lengths_tensor - prefix_lens_tensor[start_index:end_index] = batch.prefix_lens_tensor + if not prefilling: + slots_start_index = cumulative_slots + slots_end_index = cumulative_slots + len(batch.slots) - start_slots.append(batch.start_slots + cumulative_slots) + input_ids[start_index:end_index] = batch.input_ids + position_ids[start_index:end_index] = batch.position_ids + slots[slots_start_index:slots_end_index] = batch.slots + slot_indices[start_index:end_index] = ( + batch.slot_indices + cumulative_slots + ) + input_lengths_tensor[start_index:end_index] = batch.input_lengths_tensor + cache_lengths_tensor[start_index:end_index] = batch.cache_lengths_tensor + + # Copy over adapter indices + adapter_start_index = cumulative_adapter_indices_size + adapter_end_index = ( + cumulative_adapter_indices_size + + batch.adapter_meta.adapter_indices.shape[0] + ) + adapter_indices[adapter_start_index:adapter_end_index] = ( + batch.adapter_meta.adapter_indices + ) + cumulative_adapter_indices_size = adapter_end_index + adapter_set.update(batch.adapter_meta.adapter_set) + adapter_segment_builder.concat( + batch.adapter_meta.adapter_segments, + batch.adapter_meta.segment_indices, + ) + # Update + cumulative_slots += len(batch.slots) + else: + if isinstance(batch.input_ids, torch.Tensor): + batch.input_ids = batch.input_ids.view(-1, 1).tolist() + input_ids.extend(batch.input_ids) + + prefilling_mask.extend(batch.prefilling_mask) block_tables.extend(batch.block_tables) - prefix_lens.extend(batch.prefix_lens) + cache_lengths.extend(batch.cache_lengths) all_input_ids.extend(batch.all_input_ids) - 
prefix_ids.extend(batch.prefix_ids) + prompt_lengths.extend(batch.prompt_lengths) input_lengths.extend(batch.input_lengths) prefix_offsets.extend(batch.prefix_offsets) read_offsets.extend(batch.read_offsets) + prefill_logprob_tokens.extend(batch.prefill_logprob_tokens) + next_token_chooser_parameters.extend([r.parameters for r in batch.requests]) fsm_grammar_states.extend(batch.next_token_chooser.fsm_grammar_states) stopping_criterias.extend(batch.stopping_criterias) @@ -838,11 +820,6 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch # Update cumulative_batch_size += len(batch) - cumulative_slots += len(batch.slots) - - start_slots = torch.concat(start_slots) - - # assert sum(len(b) for b in block_tables) == (block_tables_tensor != 0).sum() next_token_chooser = HeterogeneousNextTokenChooser.from_pb( next_token_chooser_parameters, @@ -858,7 +835,14 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch else None ) - adapter_segments, adapter_segment_indices = adapter_segment_builder.build() + if adapter_segment_builder is not None: + adapter_segments, adapter_segment_indices = adapter_segment_builder.build() + adapter_meta = AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_segment_indices, + ) return cls( batch_id=batches[0].batch_id, @@ -868,24 +852,28 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch position_ids=position_ids, cu_seqlen_prefill=None, prefill_cache_indices=None, - start_slots=start_slots, slot_indices=slot_indices, block_tables=block_tables, block_tables_tensor=block_tables_tensor, - prefix_lens=prefix_lens, - prefix_lens_tensor=prefix_lens_tensor, + cache_lengths=cache_lengths, + cache_lengths_tensor=cache_lengths_tensor, slots=slots, - max_seqlen=max_seqlen, + max_input_length=max_input_length, + max_current_length=max_current_length, + prefilling=prefilling, + prefilling_mask=prefilling_mask, prefill_head_indices=None, prefill_next_token_indices=None, prefill_cu_outlens=None, + prefill_logprob_tokens=prefill_logprob_tokens, + prompt_lengths=prompt_lengths, + prompt_lengths_tensor=prompt_lengths_tensor, input_lengths=input_lengths, input_lengths_tensor=input_lengths_tensor, prefix_offsets=prefix_offsets, read_offsets=read_offsets, all_input_ids=all_input_ids, all_input_ids_tensor=all_input_ids_tensor, - prefix_ids=prefix_ids, next_token_chooser=next_token_chooser, stopping_criterias=stopping_criterias, top_n_tokens=top_n_tokens, @@ -893,12 +881,195 @@ def concatenate(cls, batches: List["FlashCausalLMBatch"]) -> "FlashCausalLMBatch num_blocks=num_blocks, max_blocks=max_blocks, speculative_ids=speculative_ids, - adapter_meta=AdapterBatchMetadata( - adapter_indices=adapter_indices, - adapter_set=adapter_set, - adapter_segments=adapter_segments, - segment_indices=adapter_segment_indices, - ), + adapter_meta=adapter_meta, + ) + + def prepare_for_prefill(self): + # Prepare values if we need to continue prefilling + # Speculation must be ignored while we prefill even with chunking + # it simplifies everything + assert self.speculative_ids is None + + sliding_window = get_sliding_windows() + position_ids = [] + cu_seqlen_prefill = [0] + slot_indices = [] + prefill_cache_indices = [] + all_prefill_logprobs = True + no_prefill_logprobs = True + prefill_head_indices = [] + prefill_next_token_indices = [] + prefill_cu_outlens = [0] + + # Cumulative length + cumulative_length = 0 + 
cumulative_slot_tokens = 0 + prefill_out_cumulative_length = 0 + + slots = [] + adapter_indices_list = [] + adapter_set = set() + + for i, ( + r, + cache_length, + input_length, + prompt_length, + request_prefilling, + blocks, + ) in enumerate( + zip( + self.requests, + self.cache_lengths, + self.input_lengths, + self.prompt_lengths, + self.prefilling_mask, + self.block_tables, + ) + ): + next_chunk_length = input_length + # Position ids + request_position_ids = torch.arange( + cache_length, cache_length + input_length, dtype=torch.int32 + ) + position_ids.append(request_position_ids) + + # Add cumulative lengths of all previous inputs + cu_seqlen_prefill.append(cumulative_length + input_length) + + if not r.slots: + request_slots = [ + s + for b in blocks + for s in range(b * BLOCK_SIZE, (b + 1) * BLOCK_SIZE) + ] + else: + request_slots = r.slots + + request_slots = request_slots[cache_length:] + request_slot_indices = torch.arange( + cumulative_slot_tokens, + cumulative_slot_tokens + input_length, + dtype=torch.int64, + ) + + # Create tensor to slice into the kv tensor in prefill + if sliding_window is not None: + request_prefill_cache_indices = torch.arange( + cumulative_length + max(0, input_length - sliding_window), + cumulative_length + input_length, + dtype=torch.int64, + ) + + # Prefill logprobs is ignored if the request is done prefilling + prefill_logprobs = r.prefill_logprobs and request_prefilling + + all_prefill_logprobs = all_prefill_logprobs and prefill_logprobs + no_prefill_logprobs = no_prefill_logprobs and not prefill_logprobs + + if prefill_logprobs: + prefill_head_indices.append( + torch.arange( + cumulative_length, + cumulative_length + input_length, + dtype=torch.int64, + ) + ) + prefill_next_token_indices.append( + prefill_out_cumulative_length + input_length - 1 + ) + prefill_cu_outlens.append(prefill_out_cumulative_length + input_length) + prefill_out_cumulative_length += input_length + else: + prefill_head_indices.append( + torch.tensor( + [cumulative_length + input_length - 1], + dtype=torch.int64, + ) + ) + prefill_next_token_indices.append(prefill_out_cumulative_length) + prefill_cu_outlens.append(prefill_out_cumulative_length + 1) + prefill_out_cumulative_length += 1 + + slots.extend(request_slots) + slot_indices.append(request_slot_indices) + + if sliding_window is not None: + prefill_cache_indices.append(request_prefill_cache_indices) + + ADAPTER_TO_INDEX = get_adapter_to_index() + adapter_index = ADAPTER_TO_INDEX.get(r.adapter_id, 0) + adapter_indices_list.append(torch.full((next_chunk_length,), adapter_index)) + adapter_set.add(adapter_index) + + # Update + cumulative_length += next_chunk_length + cumulative_slot_tokens += len(request_slots) + + device = self.block_tables_tensor.device + + if isinstance(self.input_ids, list): + if len(self) > 1: + input_ids = np.concatenate(self.input_ids, dtype=np.int64) + else: + input_ids = self.input_ids[0] + self.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) + + if len(self) > 1: + position_ids = torch.cat(position_ids) + slot_indices = torch.cat(slot_indices) + if sliding_window is not None: + prefill_cache_indices = torch.cat(prefill_cache_indices) + else: + position_ids = position_ids[0] + slot_indices = slot_indices[0] + if sliding_window is not None: + prefill_cache_indices = prefill_cache_indices[0] + + self.prefill_cu_outlens = prefill_cu_outlens + cu_seqlen_prefill = torch.tensor( + cu_seqlen_prefill, device=device, dtype=torch.int32 + ) + self.cu_seqlen_prefill = cu_seqlen_prefill + 
self.position_ids = position_ids.to(device) + self.slot_indices = slot_indices.to(device) + self.prefill_cache_indices = ( + prefill_cache_indices.to(device) if sliding_window is not None else None + ) + self.input_lengths_tensor = torch.tensor( + self.input_lengths, dtype=torch.int32, device=device + ) + + if all_prefill_logprobs: + prefill_head_indices = None + prefill_next_token_indices = cu_seqlen_prefill[1:] - 1 + elif no_prefill_logprobs: + prefill_head_indices = cu_seqlen_prefill[1:] - 1 + prefill_next_token_indices = None + else: + prefill_head_indices = torch.cat(prefill_head_indices).to(device) + prefill_next_token_indices = torch.tensor( + prefill_next_token_indices, dtype=torch.int64, device=device + ) + + self.prefill_head_indices = prefill_head_indices + self.prefill_next_token_indices = prefill_next_token_indices + self.slots = torch.tensor(slots, dtype=torch.int64, device=device) + self.cache_lengths_tensor = torch.tensor( + self.cache_lengths, dtype=torch.int32, device=device + ) + adapter_indices = torch.cat(adapter_indices_list).to( + dtype=torch.int64, device=device + ) + adapter_segments, adapter_segment_indices = find_segments(adapter_indices) + adapter_segments = torch.tensor( + adapter_segments, dtype=torch.int32, device=device + ) + self.adapter_meta = AdapterBatchMetadata( + adapter_indices=adapter_indices, + adapter_set=adapter_set, + adapter_segments=adapter_segments, + segment_indices=adapter_segment_indices, ) def __len__(self): @@ -938,6 +1109,7 @@ def __init__( head_size: Optional[int] = None, skip_special_tokens: bool = True, kv_cache_dtype: Optional[torch.dtype] = None, + support_chunking: bool = True, ): self.quantize = quantize self.process_group, rank, world_size = initialize_torch_distributed() @@ -950,7 +1122,6 @@ def __init__( dtype = default_dtype if dtype is None else dtype else: device = torch.device("cpu") - # Float16 doesn't exist on target. 
dtype = torch.bfloat16 if dtype is None else dtype init_cpu_threads_env(rank_id=rank, world_size=world_size) else: @@ -1065,6 +1236,7 @@ def __init__( rank=rank, world_size=world_size, sliding_window=config.sliding_window, + support_chunking=support_chunking, ) @property @@ -1101,11 +1273,11 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): position_ids = torch.zeros(bs, dtype=torch.int32, device=self.device) slots = torch.arange(bs, dtype=torch.int64, device=self.device) input_lengths = [max_s] * bs - prefix_lengths = [0] * bs + cache_lengths = [0] * bs input_lengths_tensor = ( torch.ones(bs, dtype=torch.int32, device=self.device) * max_s ) - prefix_lengths_tensor = torch.zeros(bs, dtype=torch.int32, device=self.device) + cache_lengths_tensor = torch.zeros(bs, dtype=torch.int32, device=self.device) block_tables = torch.arange( max_bt, dtype=torch.int32, device=self.device ).repeat(bs) @@ -1115,7 +1287,7 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=input_lengths, - prefix_lens=prefix_lengths, + cache_lengths=cache_lengths, ) from text_generation_server.layers.attention.flashinfer import ( create_decode_state_cuda_graphs, @@ -1144,7 +1316,7 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): "block_tables": block_tables, "slots": slots, "input_lengths": input_lengths_tensor, - "prefix_lengths": prefix_lengths_tensor, + "cache_lengths": cache_lengths_tensor, "state": state, "graph": graph, } @@ -1156,11 +1328,11 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): cu_seqlen_prefill=None, input_lengths_tensor=input_lengths_tensor, state=state, - prefix_lens_tensor=prefix_lengths_tensor, + cache_lengths_tensor=cache_lengths_tensor, ): seqlen = Seqlen( input_lengths=input_lengths_tensor, - prefix_lengths=prefix_lengths_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=None, max_q=1, max_k=max_s, @@ -1184,7 +1356,7 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): with torch.cuda.graph(graph, pool=MEM_POOL): seqlen = Seqlen( input_lengths=input_lengths_tensor, - prefix_lengths=prefix_lengths_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=None, max_q=1, max_k=max_s, @@ -1207,6 +1379,7 @@ def cuda_graph_warmup(self, bs: int, max_s: int, max_bt: int): def warmup(self, batch: FlashCausalLMBatch): # The warmup batch is the biggest batch we could ever receive + self.kv_cache = [] empty_cache() try: @@ -1226,7 +1399,7 @@ def warmup(self, batch: FlashCausalLMBatch): _, batch, _ = self.generate_token(batch) except torch.cuda.OutOfMemoryError as e: raise RuntimeError( - f"Not enough memory to handle {len(batch.input_ids)} prefill tokens. " + f"Not enough memory to handle {batch.to_pb().current_tokens} prefill tokens. " f"You need to decrease `--max-batch-prefill-tokens`" ) from e @@ -1341,14 +1514,16 @@ def tunableop_warmup(self, seqlen: int): # Dummy value, some models (starcoder2) don't accept `None`. 
input_lengths = torch.ones(seqlen, dtype=torch.int32, device=self.device) - prefix_lens_tensor = torch.zeros(seqlen, dtype=torch.int32, device=self.device) + cache_lengths_tensor = torch.zeros( + seqlen, dtype=torch.int32, device=self.device + ) cu_seqlen_prefill = torch.tensor( [0, seqlen], device=self.device, dtype=torch.int32 ) max_s = seqlen seqlen = Seqlen( input_lengths=input_lengths, - prefix_lengths=prefix_lens_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=cu_seqlen_prefill, max_q=1, max_k=seqlen, @@ -1380,7 +1555,7 @@ def forward( block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - max_s = batch.max_seqlen + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices speculative_ids = batch.speculative_ids @@ -1399,8 +1574,8 @@ def forward( input_lengths = ( input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int ).view(-1) - prefix_lens_tensor = ( - batch.prefix_lens_tensor.unsqueeze(-1).expand(B, new_length) + cache_lengths_tensor = ( + batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length) ).reshape(-1) # Add Copy the block tables for all members @@ -1422,8 +1597,8 @@ def forward( block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - prefix_lens_tensor = batch.prefix_lens_tensor - max_s = batch.max_seqlen + cache_lengths_tensor = batch.cache_lengths_tensor + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices if cu_seqlen_prefill is None and self.max_past() is not None: @@ -1445,21 +1620,20 @@ def forward( block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + cache_lengths=batch.cache_lengths, ) with self._forward_context( block_tables=block_tables, cu_seqlen_prefill=cu_seqlen_prefill, input_lengths_tensor=input_lengths, - prefix_lens_tensor=prefix_lens_tensor, + cache_lengths_tensor=cache_lengths_tensor, ): - max_k = (input_lengths + prefix_lens_tensor).max().item() seqlen = Seqlen( input_lengths=input_lengths, - prefix_lengths=prefix_lens_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=cu_seqlen_prefill, - max_q=max_s, - max_k=max_k, + max_q=batch.max_input_length, + max_k=batch.max_current_length, ) logits, speculative_logits = self.model.forward( input_ids=input_ids, @@ -1486,7 +1660,7 @@ def forward( block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + cache_lengths=batch.cache_lengths, ) # assert block_tables.shape[0] >= slots.shape[0] cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables @@ -1501,14 +1675,16 @@ def forward( cuda_graph["slots"][: slots.shape[0]] = slots cuda_graph["input_lengths"].zero_() cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths - cuda_graph["prefix_lengths"].zero_() - cuda_graph["prefix_lengths"][: prefix_lens_tensor.shape[0]] = prefix_lens_tensor + cuda_graph["cache_lengths"].zero_() + cuda_graph["cache_lengths"][ + : cache_lengths_tensor.shape[0] + ] = cache_lengths_tensor with self._forward_context( block_tables=cuda_graph["block_tables"], cu_seqlen_prefill=None, input_lengths_tensor=cuda_graph["input_lengths"], - prefix_lens_tensor=cuda_graph["prefix_lengths"], + cache_lengths_tensor=cuda_graph["cache_lengths"], state=cuda_graph["state"], ): # Replay the graph @@ -1528,7 +1704,10 @@ def generate_token( self, batch: 
FlashCausalLMBatch ) -> Tuple[List[Generation], Optional[FlashCausalLMBatch], Tuple[int, int]]: start = time.time_ns() - prefill = batch.cu_seqlen_prefill is not None + prefill = batch.prefilling + if prefill: + batch.prepare_for_prefill() + prefill_logprobs = batch.prefill_next_token_indices is not None # Update adapter indices for speculative tokens (if present) @@ -1570,14 +1749,62 @@ def generate_token( if prefill_logprobs else speculative_logits ) - next_adapter_indices = batch.adapter_meta.adapter_indices.new_empty( - len(batch) - ) - + if len(batch) > 1 and prefill_logprobs: + # We create the prefill_tokens_indices tensor that will be used to gather prefill logprobs + # When batch == 1, we will just use the batch.input_ids values directly + prefill_tokens_indices = batch.input_ids.new_zeros(len(out)) else: + prefill_logprobs = None next_token_logits = out next_adapter_indices = batch.adapter_meta.adapter_indices + finished_prefilling = True + next_chunk_lengths = [] + current_prefilling_mask = batch.prefilling_mask + if prefill: + if get_support_chunking(): + next_prefilling_mask = [] + # Budget in tokens for the next batch + # We remove (len(batch) - 1) to always have enough space for at least a single decode + # for the remaining requests -1 because the first request does not need to be removed from the budget + # (ex: you have one request in the batch, you want it to take the full budget not budget -1) + batch_budget = get_max_prefill_tokens() - (len(batch) - 1) + # We reverse to prioritize older requests + # zip() is not reversible so reverse the underlying lists instead + for cache_length, input_length, prompt_length in zip( + reversed(batch.cache_lengths), + reversed(batch.input_lengths), + reversed(batch.prompt_lengths), + ): + remaining_prefill_tokens = max( + prompt_length - cache_length - input_length, 0 + ) + if remaining_prefill_tokens > 0: + next_chunk_length = max( + min(remaining_prefill_tokens, batch_budget), 1 + ) + batch_budget -= next_chunk_length + finished_prefilling = False + next_prefilling_mask.append(True) + else: + # FIXME: use true number of accepted tokens instead of 1 + # Since speculation will be turned off, this is always true + next_chunk_length = 1 + next_prefilling_mask.append(False) + next_chunk_lengths.append(next_chunk_length) + + # Reverse back the obtained values + next_chunk_lengths.reverse() + next_prefilling_mask.reverse() + else: + # The model does not support chunking + # We know we only do a single prefill + finished_prefilling = True + next_prefilling_mask = [False] * len(batch) + + batch.prefilling = not finished_prefilling + batch.prefilling_mask = next_prefilling_mask + speculate = get_speculate() ( next_input_ids, @@ -1586,7 +1813,7 @@ def generate_token( accepted_ids, speculative_ids, ) = batch.next_token_chooser( - batch.all_input_ids_tensor[:, : batch.max_seqlen], + batch.all_input_ids_tensor[:, : batch.max_current_length], next_token_logits, speculate, batch.speculative_ids, @@ -1597,29 +1824,28 @@ def generate_token( batch.top_n_tokens, batch.top_n_tokens_tensor, logprobs, accepted_ids ) - if prefill: - if len(batch) > 1 and prefill_logprobs: - # We create the prefill_tokens_indices tensor that will be used to gather prefill logprobs - # When batch == 1, we will just use the batch.input_ids values directly - prefill_tokens_indices = batch.input_ids.new_zeros(len(out)) - + # Since we are done prefilling, all the tensors that were concatenating values for all the requests + # instantly become of shape [BATCH_SIZE] + if prefill
and finished_prefilling: next_position_ids = batch.position_ids.new_empty(len(batch)) batch.slot_indices = batch.slot_indices[batch.cu_seqlen_prefill[1:] - 1] - # We do not need cu_seqlen_prefill anymore - batch.cu_seqlen_prefill = None - else: - prefill_logprobs = None + next_adapter_indices = batch.adapter_meta.adapter_indices.new_empty( + len(batch) + ) + elif not prefill: next_position_ids = batch.position_ids - # Cumulative length - cumulative_length = 0 - - # Results - generations: List[Generation] = [] - stopped = True - # Zipped iterator - iterator = zip(batch.input_lengths, batch.all_input_ids, accepted_ids) + iterator = zip( + batch.requests, + batch.prompt_lengths, + batch.cache_lengths, + batch.input_lengths, + batch.all_input_ids, + accepted_ids, + current_prefilling_mask, + batch.prefilling_mask, + ) # We do two for loops as the first one can run completely asynchronously from the GPU while for the second # one, we need to first do a GPU <-> CPU sync @@ -1627,16 +1853,22 @@ def generate_token( # For each member of the batch index = 0 - for i, (input_length, all_input_ids, n_accepted_ids) in enumerate(iterator): - # Indexing metadata - start_index = cumulative_length - end_index = cumulative_length + input_length - - if prefill: + # Cumulative length + cumulative_length = 0 + for i, ( + request, + prompt_length, + cache_length, + input_length, + all_input_ids, + n_accepted_ids, + request_was_prefilling, + request_is_prefilling, + ) in enumerate(iterator): + if prefill and finished_prefilling: # Indexing metadata - out_start_index = batch.prefill_cu_outlens[i] - out_end_index = batch.prefill_cu_outlens[i + 1] - out_length = out_end_index - out_start_index + _start_index = cumulative_length + end_index = cumulative_length + input_length # Initialize position_ids # In decode, we do not need this as we can just increment position ids @@ -1648,34 +1880,56 @@ def generate_token( end_index - 1 ] - # Used to gather prefill logprobs - # Copy batch.input_ids to prefill_token_indices - if prefill_logprobs: - if len(batch) > 1: - prefill_tokens_indices[out_start_index : out_end_index - 1] = ( - batch.input_ids[start_index + 1 : start_index + out_length] - ) - else: - # Set prefill_tokens_indices to the correct slice - prefill_tokens_indices = batch.input_ids[ - start_index + 1 : start_index + out_length - ] - - for j in range(n_accepted_ids): - batch.all_input_ids_tensor[i, input_length + j] = next_input_ids[index] - index += 1 + # Used to gather prefill logprobs + # Copy batch.all_input_ids_tensor to prefill_token_indices + if request.prefill_logprobs and request_was_prefilling: + # Indexing metadata + out_start_index = batch.prefill_cu_outlens[i] + out_end_index = batch.prefill_cu_outlens[i + 1] + # Logprobs generated by the model are for the next token + # So we need to translate the id tensor by 1 + ids = batch.all_input_ids_tensor[ + i, cache_length + 1 : cache_length + input_length + 1 + ] + if len(batch) > 1: + prefill_tokens_indices[out_start_index:out_end_index] = ids + else: + # Set prefill_tokens_indices to the correct slice + prefill_tokens_indices = ids + + if not request_is_prefilling: + # Only save tokens if we are done prefilling for this request + for j in range(n_accepted_ids): + batch.all_input_ids_tensor[i, cache_length + input_length + j] = ( + next_input_ids[index + j] + ) + index += n_accepted_ids cumulative_length += input_length # Update values - batch.input_ids = next_input_ids[accepted_ids.cumsum(dim=-1) - 1] - batch.speculative_ids = speculative_ids - 
batch.position_ids = next_position_ids + accepted_ids - batch.input_lengths_tensor += accepted_ids - batch.slot_indices += accepted_ids - batch.adapter_meta.adapter_indices = next_adapter_indices + # These values can be updated without a GPU -> CPU sync + if not prefill or (prefill and finished_prefilling): + batch.input_ids = next_input_ids[accepted_ids.cumsum(dim=-1) - 1] + batch.speculative_ids = speculative_ids + batch.position_ids = next_position_ids + accepted_ids + batch.cache_lengths_tensor += batch.input_lengths_tensor + batch.input_lengths_tensor = accepted_ids.to(dtype=torch.int32) + batch.slot_indices += accepted_ids + batch.adapter_meta.adapter_indices = next_adapter_indices - if prefill: + if prefill and prefill_logprobs: + # Get prefill logprobs with inplace softmax (avoid copying the `out` tensor (max_batch_prefill_tokens * vocab_size)) + torch.log_softmax(out, -1, out=out) + prefill_logprobs_tensor = out + prefill_logprobs = torch.gather( + prefill_logprobs_tensor, 1, prefill_tokens_indices.view(-1, 1) + ) + # GPU <-> CPU sync + prefill_logprobs = prefill_logprobs.view(-1).tolist() + + # Does a GPU <-> CPU sync internally + if prefill and finished_prefilling: # adjust segment lengths to account for all request lengths being 1 during decoding adapter_segments, _ = find_segments(batch.adapter_meta.adapter_indices) batch.adapter_meta.adapter_segments = torch.tensor( @@ -1684,192 +1938,282 @@ def generate_token( device=batch.adapter_meta.adapter_segments.device, ) - if prefill and prefill_logprobs: - # Get prefill logprobs - prefill_logprobs_tensor = torch.log_softmax(out, -1) - prefill_logprobs = torch.gather( - prefill_logprobs_tensor, 1, prefill_tokens_indices.view(-1, 1) - ) - # GPU <-> CPU sync - prefill_logprobs = prefill_logprobs.view(-1).tolist() - # GPU <-> CPU sync next_token_logprobs = next_token_logprobs.tolist() next_token_ids = next_input_ids.tolist() accepted_ids = accepted_ids.tolist() + + # Update values if we need to continue prefilling + # This represents the `else` case of the `Update values` if above + # but since this requires the `next_token_ids` to be on CPU, it is better to do it here + if prefill and not finished_prefilling: + # Speculation must be ignored while we prefill even with chunking + # it simplifies everything + assert batch.speculative_ids is None + + all_postfix_ids = [] + for i, ( + request_prefilling, + next_token_id, + all_input_ids, + cache_length, + input_length, + next_chunk_length, + ) in enumerate( + zip( + batch.prefilling_mask, + next_token_ids, + batch.all_input_ids, + batch.cache_lengths, + batch.input_lengths, + next_chunk_lengths, + ) + ): + if request_prefilling: + next_cache_length = cache_length + input_length + # Get new prompt IDs to prefill + postfix_ids = all_input_ids[ + next_cache_length : next_cache_length + next_chunk_length + ] + else: + # This request is done prefilling, the new id is the one selected by the sampling method + postfix_ids = [next_token_id] + + all_postfix_ids.append(postfix_ids) + + batch.input_ids = all_postfix_ids + start_decode = time.time_ns() + # Results + generations: List[Generation] = [] + stopped = True + # Zipped iterator iterator = zip( batch.requests, + batch.prompt_lengths, + batch.cache_lengths, batch.input_lengths, batch.prefix_offsets, batch.read_offsets, batch.stopping_criterias, batch.all_input_ids, - batch.prefix_ids, batch.next_token_chooser.do_sample, batch.next_token_chooser.seeds, batch.top_n_tokens, + current_prefilling_mask, + batch.prefilling_mask, accepted_ids,
batch_top_token_ids, batch_top_token_logprobs, ) + # Reset max_input_length + batch.max_input_length = 0 # For each member of the batch index = 0 for i, ( request, + prompt_length, + cache_length, input_length, prefix_offset, read_offset, stopping_criteria, all_input_ids, - prefix_ids, do_sample, seed, top_n_tokens, + request_was_prefilling, + request_is_prefilling, n_accepted_ids, top_token_ids, top_token_logprobs, ) in enumerate(iterator): - # Append next token to all tokens - next_token_texts = [] - left = 0 - - if n_accepted_ids > 1: - log_master(logger.debug, f"speculated ids {n_accepted_ids - 1}") - - current_stopped = False - for j in range(index, index + n_accepted_ids): - # Generated token - next_token_id = next_token_ids[j] - all_input_ids.append(next_token_id) - next_token_text, prefix_offset, read_offset = self.decode_token( - all_input_ids, - prefix_offset, - read_offset, - ) - next_token_texts.append(next_token_text) - - stop, reason = stopping_criteria( - next_token_id, - next_token_text, - ) - - if stop: - left = index + n_accepted_ids - j - 1 - current_stopped = True - break - else: - current_stopped = False - stopped = stopped and current_stopped - - _next_token_ids = next_token_ids[index : index + n_accepted_ids - left] - _next_token_logprobs = next_token_logprobs[ - index : index + n_accepted_ids - left - ] - index += n_accepted_ids - - # Shard generations - # All generations will be appended in the rust sharded client - if i % self.world_size == self.rank: - if stop: - # Decode generated tokens - output_text, _, _ = self.decode_token( - all_input_ids, - prefix_offset=len(all_input_ids) - - stopping_criteria.current_tokens - - 1, - read_offset=len(all_input_ids) - - stopping_criteria.current_tokens, - skip_special_tokens=True, - ) - generated_text = GeneratedText( - output_text, - stopping_criteria.current_tokens, - reason, - seed if do_sample else None, - ) - else: - generated_text = None - + # Compute logprobs first as, even though we might skip the token, + # it can still be required to compute the logprobs + # modulo on request.id as it is robust to batch.filter whereas the index in the batch is not and we need + # this state to be stable + if request.id % self.world_size == self.rank: # Prefill - if prefill and request.prefill_logprobs: + if request_was_prefilling and request.prefill_logprobs: out_start_index = batch.prefill_cu_outlens[i] out_end_index = batch.prefill_cu_outlens[i + 1] + if not request_is_prefilling: + # The request is dones prefilling, meaning that we started generating new tokens + # The last logprob is a logprob for a generated token that was not part of the prompt + # We need to remove it + out_end_index -= 1 + + request_prefill_logprobs = prefill_logprobs[ + out_start_index:out_end_index + ] + # Logprobs generated by the model are for the next token + # So we need to translate the id tensor by 1 + prefill_token_ids = all_input_ids[ + cache_length + 1 : cache_length + input_length + 1 + ] + + past_prefill_logprob_tokens = batch.prefill_logprob_tokens[i] + + if past_prefill_logprob_tokens is None: + # add nan for cached prompt tokens/first token + request_prefill_logprobs = [float("nan")] * ( + cache_length + 1 + ) + request_prefill_logprobs + prefill_token_ids = ( + all_input_ids[: cache_length + 1] + prefill_token_ids + ) - # Remove generated token to only have prefill and add nan for first prompt token - request_prefill_logprobs = ( - [float("nan")] * (len(prefix_ids) + 1) - ) + prefill_logprobs[out_start_index : out_end_index - 1] - 
prefill_token_ids = all_input_ids[:-1] prefill_texts = self.tokenizer.batch_decode( - prefix_ids + prefill_token_ids, + prefill_token_ids, clean_up_tokenization_spaces=False, skip_special_tokens=False, ) - prefill_tokens = Tokens( - prefix_ids + prefill_token_ids, + prefill_logprob_tokens = Tokens( + prefill_token_ids, request_prefill_logprobs, prefill_texts, is_special=[], ) + if past_prefill_logprob_tokens is not None: + prefill_logprob_tokens = ( + past_prefill_logprob_tokens + prefill_logprob_tokens + ) + + batch.prefill_logprob_tokens[i] = prefill_logprob_tokens else: - prefill_tokens = None - - if top_n_tokens > 0: - all_top_tokens = [] - for top_token_ids, top_token_logprobs in zip( - top_token_ids, top_token_logprobs - ): - toptoken_texts = self.tokenizer.batch_decode( - top_token_ids, - clean_up_tokenization_spaces=False, - skip_special_tokens=False, + batch.prefill_logprob_tokens[i] = None + + # If it is, the tokens we decoded should be ignored + if request_is_prefilling: + # Make sure that we do not stop as even though this request did not create a token, it is still + # processing + stopped = False + new_input_length = next_chunk_lengths[i] + else: + new_input_length = n_accepted_ids + # Append next token to all tokens + next_token_texts = [] + left = 0 + + if n_accepted_ids > 1: + log_master(logger.debug, f"speculated ids {n_accepted_ids - 1}") + + current_stopped = False + for j in range(index, index + n_accepted_ids): + # Generated token + next_token_id = next_token_ids[j] + all_input_ids.append(next_token_id) + next_token_text, prefix_offset, read_offset = self.decode_token( + all_input_ids, + prefix_offset, + read_offset, + ) + next_token_texts.append(next_token_text) + + stop, reason = stopping_criteria( + next_token_id, + next_token_text, + ) + + if stop: + left = index + n_accepted_ids - j - 1 + current_stopped = True + break + else: + current_stopped = False + stopped = stopped and current_stopped + + _next_token_ids = next_token_ids[index : index + n_accepted_ids - left] + _next_token_logprobs = next_token_logprobs[ + index : index + n_accepted_ids - left + ] + + # Shard generations + # All generations will be appended in the rust sharded client + if request.id % self.world_size == self.rank: + if stop: + # Decode generated tokens + output_text, _, _ = self.decode_token( + all_input_ids, + prefix_offset=len(all_input_ids) + - stopping_criteria.current_tokens + - 1, + read_offset=len(all_input_ids) + - stopping_criteria.current_tokens, + skip_special_tokens=True, ) - special_toptokens = [ - token_id in self.all_special_ids - for token_id in top_token_ids - ] - top_tokens = Tokens( - top_token_ids, - top_token_logprobs, - toptoken_texts, - special_toptokens, + generated_text = GeneratedText( + output_text, + stopping_criteria.current_tokens, + reason, + seed if do_sample else None, ) - all_top_tokens.append(top_tokens) - top_tokens = all_top_tokens - else: - top_tokens = None - - generation = Generation( - request.id, - prefill_tokens, - Tokens( - _next_token_ids, - _next_token_logprobs, - next_token_texts, - [nid in self.all_special_ids for nid in _next_token_ids], - ), - generated_text, - top_tokens, - ) + else: + generated_text = None + + if top_n_tokens > 0: + all_top_tokens = [] + for top_token_ids, top_token_logprobs in zip( + top_token_ids, top_token_logprobs + ): + toptoken_texts = self.tokenizer.batch_decode( + top_token_ids, + clean_up_tokenization_spaces=False, + skip_special_tokens=False, + ) + special_toptokens = [ + token_id in self.all_special_ids + for 
token_id in top_token_ids + ] + top_tokens = Tokens( + top_token_ids, + top_token_logprobs, + toptoken_texts, + special_toptokens, + ) + all_top_tokens.append(top_tokens) + top_tokens = all_top_tokens + else: + top_tokens = None + + generation = Generation( + request.id, + batch.prefill_logprob_tokens[i], + Tokens( + _next_token_ids, + _next_token_logprobs, + next_token_texts, + [nid in self.all_special_ids for nid in _next_token_ids], + ), + generated_text, + top_tokens, + ) - generations.append(generation) + generations.append(generation) - # accept each new token for this specific request since we may - # have more than one new token per request with speculative decoding - for next_token_id in _next_token_ids: - batch.next_token_chooser = ( - batch.next_token_chooser.advance_grammar_single(i, next_token_id) - ) + # accept each new token for this specific request since we may + # have more than one new token per request with speculative decoding + for next_token_id in _next_token_ids: + batch.next_token_chooser = ( + batch.next_token_chooser.advance_grammar_single( + i, next_token_id + ) + ) # Update values - batch.input_lengths[i] = input_length + n_accepted_ids - if batch.input_lengths[i] > batch.max_seqlen: - batch.max_seqlen = batch.input_lengths[i] + index += n_accepted_ids + current_cache_length = cache_length + input_length + batch.cache_lengths[i] = current_cache_length + current_input_length = new_input_length + batch.max_input_length = max(batch.max_input_length, current_input_length) + batch.input_lengths[i] = current_input_length + current_length = current_cache_length + current_input_length + batch.max_current_length = max(batch.max_current_length, current_length) + batch.prefix_offsets[i] = prefix_offset batch.read_offsets[i] = read_offset batch.all_input_ids[i] = all_input_ids @@ -1880,9 +2224,13 @@ def generate_token( decode_ns = time.time_ns() - start_decode return generations, None, (forward_ns, decode_ns) - batch.prefill_cu_outlens = None - batch.prefill_head_indices = None - batch.prefill_next_token_indices = None + if prefill and finished_prefilling: + # We do not need prefill tensors anymore + batch.cu_seqlen_prefill = None + batch.prefill_cache_indices = None + batch.prefill_cu_outlens = None + batch.prefill_head_indices = None + batch.prefill_next_token_indices = None forward_ns = start_decode - start decode_ns = time.time_ns() - start_decode @@ -1894,7 +2242,7 @@ def _forward_context( block_tables: torch.Tensor, cu_seqlen_prefill: Optional[torch.Tensor], input_lengths_tensor: torch.Tensor, - prefix_lens_tensor: torch.Tensor, + cache_lengths_tensor: torch.Tensor, state: Optional[Any] = None, ) -> ContextManager: if ATTENTION != "flashinfer": @@ -1905,8 +2253,6 @@ def _forward_context( use_prefill_with_paged_kv_state, ) - # has_prefix_lens = any(prefix_len > 0 for prefix_len in prefix_lens) - if cu_seqlen_prefill is not None: return use_prefill_with_paged_kv_state( state=( @@ -1915,11 +2261,11 @@ def _forward_context( # block_tables=block_tables_to_ragged( # block_tables=block_tables, # input_lengths=input_lengths, - # prefix_lens=prefix_lens, + # cache_lengths=cache_lengths, # ), block_tables=block_tables, cu_seqlens=cu_seqlen_prefill, - input_lengths=input_lengths_tensor + prefix_lens_tensor, + input_lengths=input_lengths_tensor + cache_lengths_tensor, num_heads=self.num_heads, num_kv_heads=self.num_kv_heads, head_size=self.head_size, @@ -1931,31 +2277,32 @@ def _forward_context( assert input_lengths_tensor is not None return use_decode_state( state=state if 
state is not None else self.decode_state,
-                input_lengths=input_lengths_tensor + prefix_lens_tensor,
+                input_lengths=input_lengths_tensor + cache_lengths_tensor,
                 block_tables=block_tables,
                 num_heads=self.num_heads,
                 num_kv_heads=self.num_kv_heads,
                 head_size=self.head_size,
                 page_size=BLOCK_SIZE,
+                kv_cache_dtype=self.kv_cache_dtype,
                 dtype=self.dtype,
                 window_left=self.sliding_window,
             )


 def block_tables_to_ragged(
-    *, block_tables: torch.Tensor, input_lengths: List[int], prefix_lens: List[int]
+    *, block_tables: torch.Tensor, input_lengths: List[int], cache_lengths: List[int]
 ) -> torch.Tensor:
     """Convert block table to ragged format compatible with FlashInfer."""
-    assert len(input_lengths) == len(prefix_lens)
+    assert len(input_lengths) == len(cache_lengths)

-    total_len = sum(input_lengths) + sum(prefix_lens)
+    total_len = sum(input_lengths) + sum(cache_lengths)
     block_tables_ragged = torch.empty(
         total_len, dtype=torch.int32, device=block_tables.device
     )

     offset = 0
-    for i, (input_length, prefix_len) in enumerate(zip(input_lengths, prefix_lens)):
-        seq_len = prefix_len + input_length
+    for i, (input_length, cache_length) in enumerate(zip(input_lengths, cache_lengths)):
+        seq_len = cache_length + input_length
         block_tables_ragged[offset : offset + seq_len] = block_tables[i][:seq_len]
         offset += seq_len

diff --git a/server/text_generation_server/models/globals.py b/server/text_generation_server/models/globals.py
index 6c518c2caa5..4ac6a6b499f 100644
--- a/server/text_generation_server/models/globals.py
+++ b/server/text_generation_server/models/globals.py
@@ -5,9 +5,14 @@
 from text_generation_server.utils.log import log_master

-PREFIX_CACHING = os.getenv("USE_PREFIX_CACHING").lower() in {"1", "true"}
+ATTENTION = os.environ["ATTENTION"]
+# default_prefix_caching = "1" if ATTENTION in {"flashinfer", "flashdecoding"} else "0"
+PREFIX_CACHING = os.environ["PREFIX_CACHING"].lower() in {
+    "1",
+    "true",
+}
+PREFILL_CHUNKING = os.getenv("PREFILL_CHUNKING", "0").lower() in {"1", "true"}
 log_master(logger.info, f"Using prefix caching = {PREFIX_CACHING}")
-ATTENTION = os.getenv("ATTENTION")
 _expected = {"paged", "flashdecoding", "flashinfer"}
 assert (
     ATTENTION in _expected
@@ -18,7 +23,7 @@
     raise RuntimeError("Prefix caching is only supported with flashinfer")

 MEM_POOL = torch.cuda.graph_pool_handle() if torch.cuda.is_available() else None
-TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.95"))
+TGI_WIGGLE_ROOM = float(os.getenv("TGI_WIGGLE_ROOM", "0.90"))
 assert TGI_WIGGLE_ROOM > 0
 assert TGI_WIGGLE_ROOM < 1

diff --git a/server/text_generation_server/models/idefics_causal_lm.py b/server/text_generation_server/models/idefics_causal_lm.py
index 9a7a6fe156a..34b74ba8856 100644
--- a/server/text_generation_server/models/idefics_causal_lm.py
+++ b/server/text_generation_server/models/idefics_causal_lm.py
@@ -83,6 +83,7 @@ def to_pb(self) -> generate_pb2.CachedBatch:
             request_ids=[r.id for r in self.requests],
             size=len(self),
             max_tokens=self.max_tokens,
+            current_tokens=len(self),
         )

     @classmethod
diff --git a/server/text_generation_server/models/mamba.py b/server/text_generation_server/models/mamba.py
index f6dcde68ab6..dfc61fb8875 100644
--- a/server/text_generation_server/models/mamba.py
+++ b/server/text_generation_server/models/mamba.py
@@ -116,6 +116,7 @@ def to_pb(self) -> generate_pb2.CachedBatch:
             request_ids=[r.id for r in self.requests],
             size=len(self),
             max_tokens=self.max_tokens,
+            current_tokens=len(self),
         )

     @classmethod
diff --git a/server/text_generation_server/models/mllama_causal_lm.py
b/server/text_generation_server/models/mllama_causal_lm.py index 9e19e171503..6399f92c14c 100644 --- a/server/text_generation_server/models/mllama_causal_lm.py +++ b/server/text_generation_server/models/mllama_causal_lm.py @@ -1,14 +1,17 @@ -from io import BytesIO -from PIL import Image import torch + +import numpy as np + from typing import Iterable, Optional, Tuple, List, Dict from text_generation_server.pb.generate_pb2 import Request - +from io import BytesIO +from PIL import Image from dataclasses import dataclass from opentelemetry import trace from transformers import ( PreTrainedTokenizerBase, ) + from text_generation_server.models.vlm_causal_lm import VlmCausalLMBatch, VlmCausalLM from text_generation_server.pb import generate_pb2 from text_generation_server.models.flash_causal_lm import ( @@ -167,6 +170,13 @@ def from_pb_processor( batch.all_input_ids_tensor = batch.all_input_ids_tensor.clamp( max=config.text_config.vocab_size - 1 ) + if isinstance(batch.input_ids, list): + if len(batch) > 1: + input_ids = np.concatenate(batch.input_ids, dtype=np.int64) + else: + input_ids = batch.input_ids[0] + batch.input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device) + batch.input_ids = batch.input_ids.clamp(max=config.text_config.vocab_size - 1) if image_inputs is not None: @@ -190,7 +200,7 @@ def from_pb_processor( class MllamaCausalLM(VlmCausalLM): def forward( self, - batch: VlmCausalLMBatch, + batch: MllamaCausalLMBatch, adapter_data: Optional[Dict[str, torch.Tensor]] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: # Model Forward @@ -202,7 +212,7 @@ def forward( block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - max_s = batch.max_seqlen + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices speculative_ids = batch.speculative_ids @@ -221,8 +231,8 @@ def forward( input_lengths = ( input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int ).view(-1) - prefix_lens_tensor = ( - batch.prefix_lens_tensor.unsqueeze(-1).expand(B, new_length) + cache_lengths_tensor = ( + batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length) ).reshape(-1) # Add Copy the block tables for all members @@ -244,8 +254,8 @@ def forward( block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - prefix_lens_tensor = batch.prefix_lens_tensor - max_s = batch.max_seqlen + cache_lengths_tensor = batch.cache_lengths_tensor + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices if cu_seqlen_prefill is None and self.max_past() is not None: @@ -254,7 +264,6 @@ def forward( # This makes sure the max_s for the decode pass is correct. max_s = min(self.max_past(), max_s) - bs = input_ids.shape[0] # Try to find an associated cuda graph bs = input_ids.shape[0] sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) @@ -269,26 +278,24 @@ def forward( # Only run cuda graphs when there's no images. 
or batch.cross_attention_states is not None ): - input_lengths = input_lengths + prefix_lens_tensor if PREFIX_CACHING: block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + cache_lengths=batch.cache_lengths, ) with self._forward_context( block_tables=block_tables, cu_seqlen_prefill=cu_seqlen_prefill, input_lengths_tensor=input_lengths, - prefix_lens_tensor=prefix_lens_tensor, + cache_lengths_tensor=cache_lengths_tensor, ): - max_k = (input_lengths + prefix_lens_tensor).max().item() seqlen = Seqlen( input_lengths=input_lengths, - prefix_lengths=prefix_lens_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=cu_seqlen_prefill, - max_q=max_s, - max_k=max_k, + max_q=batch.max_input_length, + max_k=batch.max_current_length, ) if batch.pixel_values is not None: @@ -330,22 +337,34 @@ def forward( block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + cache_lengths=batch.cache_lengths, ) cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables else: cuda_graph["block_tables"][ : block_tables.shape[0], : block_tables.shape[1] ] = block_tables + + # XXX: This is working only because block 0 is reserved for the healthcheck + # so it doesn't matter if we override it with bogus values. cuda_graph["slots"].fill_(0) cuda_graph["slots"][: slots.shape[0]] = slots cuda_graph["input_lengths"].zero_() - cuda_graph["input_lengths"][: input_lengths.shape[0]] = ( - input_lengths + prefix_lens_tensor - ) + cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths + cuda_graph["cache_lengths"].zero_() + cuda_graph["cache_lengths"][ + : cache_lengths_tensor.shape[0] + ] = cache_lengths_tensor - # Replay the graph - cuda_graph["graph"].replay() + with self._forward_context( + block_tables=cuda_graph["block_tables"], + cu_seqlen_prefill=None, + input_lengths_tensor=cuda_graph["input_lengths"], + cache_lengths_tensor=cuda_graph["cache_lengths"], + state=cuda_graph["state"], + ): + # Replay the graph + cuda_graph["graph"].replay() # Slice output to the correct shape speculative_logits = ( diff --git a/server/text_generation_server/models/model.py b/server/text_generation_server/models/model.py index 20402e07afa..b3630013568 100644 --- a/server/text_generation_server/models/model.py +++ b/server/text_generation_server/models/model.py @@ -5,8 +5,17 @@ from typing import List, Tuple, Optional, TypeVar, Type, Dict from collections import defaultdict from transformers import PreTrainedTokenizerBase - +from loguru import logger + +from text_generation_server.models.globals import ( + ATTENTION, + PREFIX_CACHING, + BLOCK_SIZE, + PREFILL_CHUNKING, +) from text_generation_server.models.types import Batch, Generation +from text_generation_server.utils.log import log_master +from text_generation_server.utils.prefill_chunking import set_support_chunking from text_generation_server.utils.speculate import get_speculate from text_generation_server.pb.generate_pb2 import InfoResponse from text_generation_server.adapters.weights import LayerAdapterWeights @@ -31,6 +40,7 @@ def __init__( sliding_window: Optional[int] = None, speculate: Optional[int] = None, adapter_id: str = BASE_MODEL_ADAPTER_ID, + support_chunking: bool = False, ): self.model_id = model_id self.model = model.eval() @@ -60,6 +70,29 @@ def __init__( speculate = get_speculate() self.speculate = speculate + support_chunking = support_chunking and PREFILL_CHUNKING + + if speculate != 0 
and support_chunking: + log_master( + logger.warning, + "Prefill chunking does not support speculation yet. " + "Prefill chunking will be turned off", + ) + support_chunking = False + if ATTENTION not in ["flashinfer", "flashdecoding"] and support_chunking: + log_master( + logger.warning, + "Prefill chunking is only supported with `flashinfer` or `flashdecoding` attention types.", + ) + support_chunking = False + + log_master( + logger.info, f"Using experimental prefill chunking = {support_chunking}" + ) + + self.support_chunking = support_chunking + set_support_chunking(support_chunking) + self.has_position_ids = ( inspect.signature(model.forward).parameters.get("position_ids", None) is not None @@ -78,6 +111,10 @@ def info(self) -> InfoResponse: device_type=self.device.type, window_size=self.sliding_window, speculate=self.speculate, + support_chunking=self.support_chunking, + use_prefix_caching=PREFIX_CACHING, + attention_impl=ATTENTION, + block_size=BLOCK_SIZE, ) @property diff --git a/server/text_generation_server/models/seq2seq_lm.py b/server/text_generation_server/models/seq2seq_lm.py index 94f87d02350..3880a438c07 100644 --- a/server/text_generation_server/models/seq2seq_lm.py +++ b/server/text_generation_server/models/seq2seq_lm.py @@ -80,6 +80,7 @@ def to_pb(self) -> generate_pb2.CachedBatch: request_ids=[r.id for r in self.requests], size=len(self), max_tokens=self.max_tokens, + current_tokens=len(self.decoder_input_ids), ) @classmethod @@ -649,11 +650,7 @@ def fallback( model_id, revision=revision, torch_dtype=dtype, - device_map=( - "auto" - if device_count > 1 - else None - ), + device_map=("auto" if device_count > 1 else None), load_in_8bit=quantize == "bitsandbytes", trust_remote_code=trust_remote_code, ) diff --git a/server/text_generation_server/models/types.py b/server/text_generation_server/models/types.py index d4e7cca7504..ed9ae98959c 100644 --- a/server/text_generation_server/models/types.py +++ b/server/text_generation_server/models/types.py @@ -74,6 +74,14 @@ def to_pb(self) -> generate_pb2.Tokens: def __len__(self): return len(self.token_ids) + def __add__(self, other: "Tokens") -> "Tokens": + return Tokens( + self.token_ids + other.token_ids, + self.logprobs + other.logprobs, + self.texts + other.texts, + self.is_special + other.is_special, + ) + @dataclass class Generation: diff --git a/server/text_generation_server/models/vlm_causal_lm.py b/server/text_generation_server/models/vlm_causal_lm.py index 7f7d2e4d9f4..150cf0d07d7 100644 --- a/server/text_generation_server/models/vlm_causal_lm.py +++ b/server/text_generation_server/models/vlm_causal_lm.py @@ -271,6 +271,8 @@ def __init__( model_id=model_id, revision=revision, trust_remote_code=trust_remote_code, + # FIXME: VLM do not work with context chunking yet + support_chunking=False, **kwargs, ) @@ -295,7 +297,7 @@ def forward( block_tables = batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - max_s = batch.max_seqlen + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices speculative_ids = batch.speculative_ids @@ -314,8 +316,8 @@ def forward( input_lengths = ( input_lengths.unsqueeze(-1).expand(B, new_length) + arange_int ).view(-1) - prefix_lens_tensor = ( - batch.prefix_lens_tensor.unsqueeze(-1).expand(B, new_length) + cache_lengths_tensor = ( + batch.cache_lengths_tensor.unsqueeze(-1).expand(B, new_length) ).reshape(-1) # Add Copy the block tables for all members @@ -337,8 +339,8 @@ def forward( block_tables = 
batch.block_tables_tensor slots = batch.slots[batch.slot_indices] input_lengths = batch.input_lengths_tensor - prefix_lens_tensor = batch.prefix_lens_tensor - max_s = batch.max_seqlen + cache_lengths_tensor = batch.cache_lengths_tensor + max_s = batch.max_current_length lm_head_indices = batch.prefill_head_indices if cu_seqlen_prefill is None and self.max_past() is not None: @@ -347,7 +349,6 @@ def forward( # This makes sure the max_s for the decode pass is correct. max_s = min(self.max_past(), max_s) - bs = input_ids.shape[0] # Try to find an associated cuda graph bs = input_ids.shape[0] sorted_padded_bs = sorted([k for k in self.cuda_graphs.keys() if k >= bs]) @@ -357,26 +358,24 @@ def forward( else: cuda_graph = None if cu_seqlen_prefill is not None or cuda_graph is None: - input_lengths = input_lengths + prefix_lens_tensor - if PREFIX_CACHING: + if ATTENTION == "flashinfer": block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + cache_lengths=batch.cache_lengths, ) with self._forward_context( block_tables=block_tables, cu_seqlen_prefill=cu_seqlen_prefill, input_lengths_tensor=input_lengths, - prefix_lens_tensor=prefix_lens_tensor, + cache_lengths_tensor=cache_lengths_tensor, ): - max_k = (input_lengths + prefix_lens_tensor).max().item() seqlen = Seqlen( input_lengths=input_lengths, - prefix_lengths=prefix_lens_tensor, + cache_lengths=cache_lengths_tensor, cu_seqlen_q=cu_seqlen_prefill, - max_q=max_s, - max_k=max_k, + max_q=batch.max_input_length, + max_k=batch.max_current_length, ) logits, speculative_logits = self.model.forward( input_ids=input_ids, @@ -411,22 +410,34 @@ def forward( block_tables = block_tables_to_ragged( block_tables=block_tables, input_lengths=batch.input_lengths, - prefix_lens=batch.prefix_lens, + cache_lengths=batch.cache_lengths, ) cuda_graph["block_tables"][: block_tables.shape[0]] = block_tables else: cuda_graph["block_tables"][ : block_tables.shape[0], : block_tables.shape[1] ] = block_tables - cuda_graph["slots"].fill_(-1) + + # XXX: This is working only because block 0 is reserved for the healthcheck + # so it doesn't matter if we override it with bogus values. 
+ cuda_graph["slots"].fill_(0) cuda_graph["slots"][: slots.shape[0]] = slots cuda_graph["input_lengths"].zero_() - cuda_graph["input_lengths"][: input_lengths.shape[0]] = ( - input_lengths + prefix_lens_tensor - ) - - # Replay the graph - cuda_graph["graph"].replay() + cuda_graph["input_lengths"][: input_lengths.shape[0]] = input_lengths + cuda_graph["cache_lengths"].zero_() + cuda_graph["cache_lengths"][ + : cache_lengths_tensor.shape[0] + ] = cache_lengths_tensor + + with self._forward_context( + block_tables=cuda_graph["block_tables"], + cu_seqlen_prefill=None, + input_lengths_tensor=cuda_graph["input_lengths"], + cache_lengths_tensor=cuda_graph["cache_lengths"], + state=cuda_graph["state"], + ): + # Replay the graph + cuda_graph["graph"].replay() # Slice output to the correct shape speculative_logits = ( diff --git a/server/text_generation_server/server.py b/server/text_generation_server/server.py index 46e342a4fbf..aef00fb5f5d 100644 --- a/server/text_generation_server/server.py +++ b/server/text_generation_server/server.py @@ -15,6 +15,7 @@ from text_generation_server.interceptor import ExceptionInterceptor from text_generation_server.models import Model, get_model_with_lora_adapters from text_generation_server.utils.adapter import AdapterInfo +from text_generation_server.utils.prefill_chunking import set_max_prefill_tokens try: from text_generation_server.models.pali_gemma import PaliGemmaBatch @@ -46,9 +47,12 @@ def __init__(self): signal.signal(signal.SIGINT, self.exit_gracefully) signal.signal(signal.SIGTERM, self.exit_gracefully) + def set_keep_processing(self, value: bool): + self.KEEP_PROCESSING = value + def exit_gracefully(self, signum, frame): print(f"Exiting gracefully: Signal {signum}") - self.KEEP_PROCESSING = False + self.set_keep_processing(False) class TextGenerationService(generate_pb2_grpc.TextGenerationServiceServicer): @@ -96,6 +100,8 @@ async def FilterBatch(self, request, context): return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb()) async def Warmup(self, request, context): + set_max_prefill_tokens(request.max_prefill_tokens) + if self.quantize in {"exl2", "gptq"}: try: # When using GPTQ, Exllama kernels need some global kernels @@ -150,6 +156,18 @@ async def Prefill(self, request, context): request.batch, self.model.tokenizer, self.model.dtype, self.model.device ) + concat_ns = None + if self.model.support_chunking: + if request.HasField("cached_batch"): + cached_batch = self.cache.pop(request.cached_batch.id) + if cached_batch is None: + raise ValueError( + f"Batch ID {request.cached_batch.id} not found in cache." 
+ ) + start_concat = time.time_ns() + batch = self.model.batch_type.concatenate([cached_batch, batch]) + concat_ns = time.time_ns() - start_concat + generations, next_batch, timings = self.model.generate_token(batch) self.cache.set(next_batch) @@ -159,6 +177,7 @@ async def Prefill(self, request, context): forward_ns=timings[0], decode_ns=timings[1], total_ns=time.time_ns() - start, + concat_ns=concat_ns, ) async def Decode(self, request, context): @@ -252,10 +271,12 @@ async def serve_inner( logger.exception("Error when initializing model") raise + signal_handler = SignalHandler() + set_adapter_to_index(adapter_to_index) server = aio.server( interceptors=[ - ExceptionInterceptor(), + ExceptionInterceptor(lambda: signal_handler.set_keep_processing(False)), UDSOpenTelemetryAioServerInterceptor(), ], options=[ @@ -276,7 +297,6 @@ async def serve_inner( await server.start() logger.info("Server started at {}".format(local_url)) - signal_handler = SignalHandler() while signal_handler.KEEP_PROCESSING: await asyncio.sleep(0.5) diff --git a/server/text_generation_server/utils/adapter.py b/server/text_generation_server/utils/adapter.py index 2b61f9bb448..09254b68a43 100644 --- a/server/text_generation_server/utils/adapter.py +++ b/server/text_generation_server/utils/adapter.py @@ -120,15 +120,18 @@ def _load_and_merge( if adapter.id == BASE_MODEL_ADAPTER_ID: raise ValueError("Base model adapter cannot be merged.") - module_map, adapter_config, adapter_weight_names, adapter_tokenizer = ( - load_module_map( - model_id, - adapter.revision, - adapter.id, - adapter.path, - weight_names, - trust_remote_code, - ) + ( + module_map, + adapter_config, + adapter_weight_names, + adapter_tokenizer, + ) = load_module_map( + model_id, + adapter.revision, + adapter.id, + adapter.path, + weight_names, + trust_remote_code, ) adapters_to_merge.append((module_map, adapter_config)) diff --git a/server/text_generation_server/utils/prefill_chunking.py b/server/text_generation_server/utils/prefill_chunking.py new file mode 100644 index 00000000000..c227d30f512 --- /dev/null +++ b/server/text_generation_server/utils/prefill_chunking.py @@ -0,0 +1,24 @@ +from typing import Optional + +SUPPORT_CHUNKING: Optional[bool] = None +MAX_PREFILL_TOKENS: Optional[int] = None + + +def set_support_chunking(support_chunking: bool): + global SUPPORT_CHUNKING + SUPPORT_CHUNKING = support_chunking + + +def get_support_chunking() -> bool: + global SUPPORT_CHUNKING + return SUPPORT_CHUNKING + + +def set_max_prefill_tokens(max_prefill_tokens: int): + global MAX_PREFILL_TOKENS + MAX_PREFILL_TOKENS = max_prefill_tokens + + +def get_max_prefill_tokens() -> int: + global MAX_PREFILL_TOKENS + return MAX_PREFILL_TOKENS diff --git a/server/text_generation_server/utils/segments.py b/server/text_generation_server/utils/segments.py index f5961102108..b3f923694e3 100644 --- a/server/text_generation_server/utils/segments.py +++ b/server/text_generation_server/utils/segments.py @@ -7,6 +7,7 @@ import torch +# FIXME: this should be optimized def find_segments( adapter_indices: Union[torch.Tensor, List[int]] ) -> Tuple[List[int], List[int]]: diff --git a/server/text_generation_server/utils/weights.py b/server/text_generation_server/utils/weights.py index 75e01f7ce05..aae64acf3da 100644 --- a/server/text_generation_server/utils/weights.py +++ b/server/text_generation_server/utils/weights.py @@ -197,7 +197,7 @@ def _get_slice(self, tensor_name: str): slice_ = f.get_slice(tensor_name) return slice_ - def _has_tensor(self, tensor_name: str): + def 
has_tensor(self, tensor_name: str): try: self.get_filename(tensor_name) except Exception: @@ -207,7 +207,9 @@ def _has_tensor(self, tensor_name: str): def get_shape(self, tensor_name: str): return self._get_slice(tensor_name).get_shape() - def get_tensor(self, tensor_name: str, to_device=True, to_dtype=True): + def get_tensor( + self, tensor_name: str, to_device: bool = True, to_dtype: bool = True + ) -> torch.Tensor: filename, tensor_name = self.get_filename(tensor_name) f = self._get_handle(filename) tensor = f.get_tensor(tensor_name) diff --git a/tgi-entrypoint.sh b/tgi-entrypoint.sh index ea94dcd9fd4..278c7d961ca 100755 --- a/tgi-entrypoint.sh +++ b/tgi-entrypoint.sh @@ -2,4 +2,4 @@ ldconfig 2>/dev/null || echo 'unable to refresh ld cache, not a big deal in most cases' -text-generation-launcher $@ +exec text-generation-launcher $@ diff --git a/update_doc.py b/update_doc.py index 203aaced025..6357cc0061b 100644 --- a/update_doc.py +++ b/update_doc.py @@ -172,6 +172,8 @@ def check_openapi(check: bool): # allow for trailing whitespace since it's not significant # and the precommit hook will remove it "lint", + "--skip-rule", + "security-defined", filename, ], capture_output=True,
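
The new prefill_chunking module above is a small registry of process-wide flags: Warmup stores the router's prefill token budget with set_max_prefill_tokens, Model.__init__ decides whether chunking stays enabled and records it with set_support_chunking, and later code reads both values back through the getters. A minimal usage sketch follows, assuming the text_generation_server package from this repository is importable; the concrete values (a 4096-token budget, flashinfer attention, speculation disabled) are made-up examples, not values taken from the diff.

# Illustrative sketch only; values below are examples, not taken from the diff.
from text_generation_server.utils.prefill_chunking import (
    get_max_prefill_tokens,
    get_support_chunking,
    set_max_prefill_tokens,
    set_support_chunking,
)

# Warmup: store the router's prefill token budget once per worker process.
set_max_prefill_tokens(4096)

# Model.__init__: chunking is kept only when PREFILL_CHUNKING is enabled,
# speculation is off and the attention backend is flashinfer/flashdecoding.
prefill_chunking_enabled = True  # mirrors the PREFILL_CHUNKING env flag
speculate = 0
attention = "flashinfer"
support_chunking = (
    prefill_chunking_enabled
    and speculate == 0
    and attention in {"flashinfer", "flashdecoding"}
)
set_support_chunking(support_chunking)

# Later callers (e.g. the Prefill handler deciding whether to concatenate the
# cached batch with the incoming one) read the flags back:
assert get_support_chunking() is True
assert get_max_prefill_tokens() == 4096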