From d20528a6fcc4abd2c13dd6bcf550ce09e6972984 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Tue, 12 Nov 2024 10:02:43 -0400
Subject: [PATCH 01/26] Fix `task` type-hint and remove extra space in `logging`

---
 src/huggingface_inference_toolkit/utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/huggingface_inference_toolkit/utils.py b/src/huggingface_inference_toolkit/utils.py
index a5ff7aee..acb368b9 100644
--- a/src/huggingface_inference_toolkit/utils.py
+++ b/src/huggingface_inference_toolkit/utils.py
@@ -208,7 +208,7 @@ def get_device():
 
 
 def get_pipeline(
-    task: str,
+    task: Union[str, None],
     model_dir: Path,
     **kwargs,
 ) -> Pipeline:
@@ -219,12 +219,13 @@ def get_pipeline(
     if is_optimum_neuron_available():
         logger.info("Using device Neuron")
     else:
-        logger.info(f"Using device { 'GPU' if device == 0 else 'CPU'}")
+        logger.info(f"Using device {'GPU' if device == 0 else 'CPU'}")
 
     if task is None:
         raise EnvironmentError(
             "The task for this model is not set: Please set one: https://huggingface.co/docs#how-is-a-models-type-of-inference-api-and-widget-determined"
         )
+
     # define tokenizer or feature extractor as kwargs to load it the pipeline
     # correctly
     if task in {
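A quick sketch of the behavior this patch pins down; the model path here is illustrative, while `get_pipeline` and the docs URL come from the diff above:

    from pathlib import Path
    from huggingface_inference_toolkit.utils import get_pipeline

    # `task` may now be typed as `Union[str, None]`, but passing None
    # still fails fast with the EnvironmentError raised in the diff above
    pipe = get_pipeline(task="text-classification", model_dir=Path("/opt/model"))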
From be146c80af76c4004a89f3bbdfb126c4735be673 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Tue, 12 Nov 2024 10:03:34 -0400
Subject: [PATCH 02/26] Align `transformers` and `diffusers` inputs with Inference API

---
 .../diffusers_utils.py                       | 28 ++++++---
 src/huggingface_inference_toolkit/handler.py | 62 ++++++++++++++-----
 2 files changed, 65 insertions(+), 25 deletions(-)

diff --git a/src/huggingface_inference_toolkit/diffusers_utils.py b/src/huggingface_inference_toolkit/diffusers_utils.py
index 70b683ab..47ddf390 100644
--- a/src/huggingface_inference_toolkit/diffusers_utils.py
+++ b/src/huggingface_inference_toolkit/diffusers_utils.py
@@ -22,9 +22,7 @@ def is_diffusers_available():
 
 
 class IEAutoPipelineForText2Image:
-    def __init__(
-        self, model_dir: str, device: Union[str, None] = None, **kwargs
-    ):  # needs "cuda" for GPU
+    def __init__(self, model_dir: str, device: Union[str, None] = None, **kwargs):  # needs "cuda" for GPU
         dtype = torch.float32
         if device == "cuda":
             dtype = torch.bfloat16 if is_torch_bf16_gpu_available() else torch.float16
@@ -36,9 +34,7 @@ def __init__(
         # try to use DPMSolverMultistepScheduler
         if isinstance(self.pipeline, StableDiffusionPipeline):
             try:
-                self.pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
-                    self.pipeline.scheduler.config
-                )
+                self.pipeline.scheduler = DPMSolverMultistepScheduler.from_config(self.pipeline.scheduler.config)
             except Exception:
                 pass
 
@@ -47,6 +43,13 @@ def __call__(
         prompt,
         **kwargs,
     ):
+        if "prompt" in kwargs:
+            logger.warning(
+                "prompt has been provided twice, both via arg and kwargs, so the `prompt` arg will be used "
+                "instead, and the `prompt` in kwargs will be discarded."
+            )
+            kwargs.pop("prompt")
+
         # diffusers doesn't support seed but rather the generator kwarg
         # see: https://github.com/huggingface/api-inference-community/blob/8e577e2d60957959ba02f474b2913d84a9086b82/docker_images/diffusers/app/pipelines/text_to_image.py#L172-L176
         if "seed" in kwargs:
@@ -58,9 +61,16 @@ def __call__(
         # TODO: add support for more images (Reason is correct output)
         if "num_images_per_prompt" in kwargs:
             kwargs.pop("num_images_per_prompt")
-            logger.warning(
-                "Sending num_images_per_prompt > 1 to pipeline is not supported. Using default value 1."
-            )
+            logger.warning("Sending num_images_per_prompt > 1 to pipeline is not supported. Using default value 1.")
+
+        if "target_size" in kwargs:
+            kwargs["height"] = kwargs["target_size"].pop("height", None)
+            kwargs["width"] = kwargs["target_size"].pop("width", None)
+            kwargs.pop("target_size")
+
+        if "output_type" in kwargs and kwargs["output_type"] != "pil":
+            kwargs.pop("output_type")
+            logger.warning("The `output_type` cannot be modified, and PIL will be used by default instead.")
 
         # Call pipeline with parameters
         out = self.pipeline(prompt, num_images_per_prompt=1, **kwargs)

diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index 636f185b..cf99b924 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -17,8 +17,8 @@ class HuggingFaceHandler:
 
     def __init__(self, model_dir: Union[str, Path], task=None, framework="pt"):
         self.pipeline = get_pipeline(
-            model_dir=model_dir,
-            task=task,
+            model_dir=model_dir,  # type: ignore
+            task=task,  # type: ignore
             framework=framework,
             trust_remote_code=HF_TRUST_REMOTE_CODE,
         )
@@ -31,15 +31,49 @@ def __call__(self, data):
         :return: prediction output
         """
         inputs = data.pop("inputs", data)
-        parameters = data.pop("parameters", None)
+        parameters = data.pop("parameters", {})
+
+        if self.pipeline.task == "question-answering" and (
+            not isinstance(inputs, dict) or not all(k in inputs for k in {"question", "context"})
+        ):
+            raise ValueError(
+                f"{self.pipeline.task} expects `inputs` to contain both `question` and `context` as the keys, "
+                "both of them being either a `str` or a `List[str]`."
+            )
 
-        # pass inputs with all kwargs in data
-        if parameters is not None:
-            prediction = self.pipeline(inputs, **parameters)
-        else:
-            prediction = self.pipeline(inputs)
-        # postprocess the prediction
-        return prediction
+        if self.pipeline.task == "table-question-answering":
+            if "question" in inputs:
+                inputs["query"] = inputs.pop("question")
+            if not all(k in inputs for k in {"table", "question"}):
+                raise ValueError(
+                    f"{self.pipeline.task} expects `inputs` to contain `table` and either `question` or `query`"
+                    " as the input parameters."
+                )
+
+        if self.pipeline.task in {"token-classification", "ner"}:
+            # stride and aggregation_strategy are defined on `pipeline` init, but in the Inference API those
+            # are provided on each request instead
+            pass
+
+        if self.pipeline.task.__contains__("translation"):
+            # truncation and generate_parameters are used on Inference API but not available on
+            # `TranslationPipeline.__call__` method
+            pass
+
+        if self.pipeline.task.__contains__("zero-shot-classification"):
+            if "candidateLabels" in inputs:
+                inputs["candidate_labels"] = inputs.pop("candidateLabels")
+            if "text" in inputs:
+                inputs["sequences"] = inputs.pop("text")
+            if not all(k in inputs for k in {"sequences", "candidate_labels"}):
+                raise ValueError(
+                    f"{self.pipeline.task} expects `inputs` to contain either `text` or `sequences` and either "
+                    "`candidate_labels` or `candidateLabels`."
+                )
+
+        if isinstance(inputs, dict):
+            return self.pipeline(**inputs, **parameters)
+        return self.pipeline(inputs, **parameters)
 
 
 class VertexAIHandler(HuggingFaceHandler):
@@ -59,9 +93,7 @@ def __call__(self, data):
         :return: prediction output
         """
         if "instances" not in data:
-            raise ValueError(
-                "The request body must contain a key 'instances' with a list of instances."
-            )
+            raise ValueError("The request body must contain a key 'instances' with a list of instances.")
         parameters = data.pop("parameters", None)
 
         predictions = []
@@ -74,9 +106,7 @@ def __call__(self, data):
     return {"predictions": predictions}
 
 
-def get_inference_handler_either_custom_or_default_handler(
-    model_dir: Path, task: Optional[str] = None
-):
+def get_inference_handler_either_custom_or_default_handler(model_dir: Path, task: Optional[str] = None):
     """
     Returns the appropriate inference handler based on the given model
     directory and task.
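A hypothetical Inference-API-style request body that exercises the new `diffusers` handling above (all values illustrative):

    data = {
        "inputs": "a photo of an astronaut riding a horse on mars",
        "parameters": {
            "seed": 42,
            "target_size": {"height": 512, "width": 512},
            "num_images_per_prompt": 2,
        },
    }
    # per the diff: `seed` becomes a torch.Generator, `target_size` is split
    # into `height`/`width`, and `num_images_per_prompt` is dropped with a warning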
From 0b61436b01017534478ef4c340a2a9890c3bb255 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Tue, 12 Nov 2024 10:07:04 -0400
Subject: [PATCH 03/26] Remove duplicated `sentencepiece` extra requirement

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 63414346..793e89a7 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 # libavcodec-extra : libavcodec-extra includes additional codecs for ffmpeg
 
 install_requires = [
-    "transformers[sklearn,sentencepiece,audio,vision,sentencepiece]==4.46.1",
+    "transformers[sklearn,sentencepiece,audio,vision]==4.46.1",
     "huggingface_hub[hf_transfer]==0.26.2",
     # vision
     "Pillow",

From 49254e97ea88ec69af4c38c8087d1e5eb06ff9bc Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Wed, 13 Nov 2024 11:50:24 -0400
Subject: [PATCH 04/26] Remove `pipeline.task` check for `sentence-transformers`

---
 src/huggingface_inference_toolkit/handler.py | 71 ++++++++++----------
 1 file changed, 37 insertions(+), 34 deletions(-)

diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index cf99b924..aa4cedaa 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -3,6 +3,7 @@
 from typing import Optional, Union
 
 from huggingface_inference_toolkit.const import HF_TRUST_REMOTE_CODE
+from huggingface_inference_toolkit.sentence_transformers_utils import SENTENCE_TRANSFORMERS_TASKS
 from huggingface_inference_toolkit.utils import (
     check_and_register_custom_pipeline_from_directory,
     get_pipeline,
@@ -33,43 +34,45 @@ def __call__(self, data):
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", {})
 
-        if self.pipeline.task == "question-answering" and (
-            not isinstance(inputs, dict) or not all(k in inputs for k in {"question", "context"})
-        ):
-            raise ValueError(
-                f"{self.pipeline.task} expects `inputs` to contain both `question` and `context` as the keys, "
-                "both of them being either a `str` or a `List[str]`."
-            )
+        # sentence transformers pipelines do not have the `task` arg
+        if not any(isinstance(self.pipeline, v) for v in SENTENCE_TRANSFORMERS_TASKS.values()):
+            if self.pipeline.task == "question-answering" and (
+                not isinstance(inputs, dict) or not all(k in inputs for k in {"question", "context"})
+            ):
+                raise ValueError(
+                    f"{self.pipeline.task} expects `inputs` to contain both `question` and `context` as the keys, "
+                    "both of them being either a `str` or a `List[str]`."
+                )
 
-        if self.pipeline.task == "table-question-answering":
-            if "question" in inputs:
-                inputs["query"] = inputs.pop("question")
-            if not all(k in inputs for k in {"table", "question"}):
-                raise ValueError(
-                    f"{self.pipeline.task} expects `inputs` to contain `table` and either `question` or `query`"
-                    " as the input parameters."
-                )
+            if self.pipeline.task == "table-question-answering":
+                if "question" in inputs:
+                    inputs["query"] = inputs.pop("question")
+                if not all(k in inputs for k in {"table", "question"}):
+                    raise ValueError(
+                        f"{self.pipeline.task} expects `inputs` to contain `table` and either `question` or `query`"
+                        " as the input parameters."
+                    )
 
-        if self.pipeline.task in {"token-classification", "ner"}:
-            # stride and aggregation_strategy are defined on `pipeline` init, but in the Inference API those
-            # are provided on each request instead
-            pass
+            if self.pipeline.task in {"token-classification", "ner"}:
+                # stride and aggregation_strategy are defined on `pipeline` init, but in the Inference API those
+                # are provided on each request instead
+                pass
 
-        if self.pipeline.task.__contains__("translation"):
-            # truncation and generate_parameters are used on Inference API but not available on
-            # `TranslationPipeline.__call__` method
-            pass
+            if self.pipeline.task.__contains__("translation"):
+                # truncation and generate_parameters are used on Inference API but not available on
+                # `TranslationPipeline.__call__` method
+                pass
 
-        if self.pipeline.task.__contains__("zero-shot-classification"):
-            if "candidateLabels" in inputs:
-                inputs["candidate_labels"] = inputs.pop("candidateLabels")
-            if "text" in inputs:
-                inputs["sequences"] = inputs.pop("text")
-            if not all(k in inputs for k in {"sequences", "candidate_labels"}):
-                raise ValueError(
-                    f"{self.pipeline.task} expects `inputs` to contain either `text` or `sequences` and either "
-                    "`candidate_labels` or `candidateLabels`."
-                )
+            if self.pipeline.task.__contains__("zero-shot-classification"):
+                if "candidateLabels" in inputs:
+                    inputs["candidate_labels"] = inputs.pop("candidateLabels")
+                if "text" in inputs:
+                    inputs["sequences"] = inputs.pop("text")
+                if not all(k in inputs for k in {"sequences", "candidate_labels"}):
+                    raise ValueError(
+                        f"{self.pipeline.task} expects `inputs` to contain either `text` or `sequences` and either "
+                        "`candidate_labels` or `candidateLabels`."
+                    )
 
         if isinstance(inputs, dict):
             return self.pipeline(**inputs, **parameters)
         return self.pipeline(inputs, **parameters)

From b45c40a47168addce8dbf7c498a8bb1edbf8eae2 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 15 Nov 2024 09:47:06 +0100
Subject: [PATCH 05/26] Add `warning` and `pop` unsupported parameters

---
 src/huggingface_inference_toolkit/handler.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index aa4cedaa..949abea0 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -56,12 +56,19 @@ def __call__(self, data):
             if self.pipeline.task in {"token-classification", "ner"}:
                 # stride and aggregation_strategy are defined on `pipeline` init, but in the Inference API those
                 # are provided on each request instead
-                pass
+                for p in {"stride", "aggregation_strategy"}:
+                    if p in parameters:
+                        parameters.pop(p)
+                        logger.warning(f"provided parameter `{p}`, but it's not supported.")
 
             if self.pipeline.task.__contains__("translation"):
                 # truncation and generate_parameters are used on Inference API but not available on
                 # `TranslationPipeline.__call__` method
-                pass
+                for p in {"truncation", "generate_parameters"}:
+                    if p in parameters:
+                        parameters.pop(p)
+                        logger.warning(f"provided parameter `{p}`, but it's not supported.")
+
 
             if self.pipeline.task.__contains__("zero-shot-classification"):
                 if "candidateLabels" in inputs:
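A sketch of a payload that satisfies the `question-answering` check added in PATCH 04 (sentence values are illustrative):

    data = {
        "inputs": {
            "question": "Where do I live?",
            "context": "My name is Wolfgang and I live in Berlin",
        }
    }
    # whereas e.g. {"inputs": "Where do I live?"} would now raise the ValueError above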
From b9dec3252c8faedc35b7f865295cd90972fa98ce Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 15 Nov 2024 09:47:20 +0100
Subject: [PATCH 06/26] Fix `sentence-transformers` pipeline type-hints

---
 .../sentence_transformers_utils.py | 34 +++++++++----------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/src/huggingface_inference_toolkit/sentence_transformers_utils.py b/src/huggingface_inference_toolkit/sentence_transformers_utils.py
index 6b55ae76..e6a69de2 100644
--- a/src/huggingface_inference_toolkit/sentence_transformers_utils.py
+++ b/src/huggingface_inference_toolkit/sentence_transformers_utils.py
@@ -1,4 +1,5 @@
 import importlib.util
+from typing import Any, Dict, List, Tuple, Union
 
 _sentence_transformers = importlib.util.find_spec("sentence_transformers") is not None
@@ -12,33 +13,34 @@ def is_sentence_transformers_available():
 
 
 class SentenceSimilarityPipeline:
-    def __init__(self, model_dir: str, device: str = None, **kwargs):  # needs "cuda" for GPU
+    def __init__(self, model_dir: str, device: Union[str, None] = None, **kwargs: Any) -> None:
+        # `device` needs to be set to "cuda" for GPU
         self.model = SentenceTransformer(model_dir, device=device, **kwargs)
 
-    def __call__(self, inputs=None):
-        embeddings1 = self.model.encode(
-            inputs["source_sentence"], convert_to_tensor=True
-        )
-        embeddings2 = self.model.encode(inputs["sentences"], convert_to_tensor=True)
+    def __call__(self, source_sentence: str, sentences: List[str]) -> Dict[str, float]:
+        embeddings1 = self.model.encode(source_sentence, convert_to_tensor=True)
+        embeddings2 = self.model.encode(sentences, convert_to_tensor=True)
         similarities = util.pytorch_cos_sim(embeddings1, embeddings2).tolist()[0]
         return {"similarities": similarities}
 
 
 class SentenceEmbeddingPipeline:
-    def __init__(self, model_dir: str, device: str = None, **kwargs):  # needs "cuda" for GPU
+    def __init__(self, model_dir: str, device: Union[str, None] = None, **kwargs: Any) -> None:
+        # `device` needs to be set to "cuda" for GPU
         self.model = SentenceTransformer(model_dir, device=device, **kwargs)
 
-    def __call__(self, inputs):
-        embeddings = self.model.encode(inputs).tolist()
+    def __call__(self, sentences: Union[str, List[str]]) -> Dict[str, List[float]]:
+        embeddings = self.model.encode(sentences).tolist()
         return {"embeddings": embeddings}
 
 
 class RankingPipeline:
-    def __init__(self, model_dir: str, device: str = None, **kwargs):  # needs "cuda" for GPU
+    def __init__(self, model_dir: str, device: Union[str, None] = None, **kwargs: Any) -> None:
+        # `device` needs to be set to "cuda" for GPU
         self.model = CrossEncoder(model_dir, device=device, **kwargs)
 
-    def __call__(self, inputs):
-        scores = self.model.predict(inputs).tolist()
+    def __call__(self, sentences: Union[List[List[str]], List[Tuple[str, str]]]) -> Dict[str, List[float]]:
+        scores = self.model.predict(sentences).tolist()
         return {"scores": scores}
@@ -56,9 +58,5 @@ def get_sentence_transformers_pipeline(task=None, model_dir=None, device=-1, **k
     kwargs.pop("framework", None)
 
     if task not in SENTENCE_TRANSFORMERS_TASKS:
-        raise ValueError(
-            f"Unknown task {task}. Available tasks are: {', '.join(SENTENCE_TRANSFORMERS_TASKS.keys())}"
-        )
-    return SENTENCE_TRANSFORMERS_TASKS[task](
-        model_dir=model_dir, device=device, **kwargs
-    )
+        raise ValueError(f"Unknown task {task}. Available tasks are: {', '.join(SENTENCE_TRANSFORMERS_TASKS.keys())}")
+    return SENTENCE_TRANSFORMERS_TASKS[task](model_dir=model_dir, device=device, **kwargs)

From 77c2bb2e2690a4a6c145c41be29f7fefa9ec8245 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 15 Nov 2024 10:10:11 +0100
Subject: [PATCH 07/26] Update `sentence-ranking` type-hints

---
 .../sentence_transformers_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/huggingface_inference_toolkit/sentence_transformers_utils.py b/src/huggingface_inference_toolkit/sentence_transformers_utils.py
index e6a69de2..91eda151 100644
--- a/src/huggingface_inference_toolkit/sentence_transformers_utils.py
+++ b/src/huggingface_inference_toolkit/sentence_transformers_utils.py
@@ -39,7 +39,9 @@ def __init__(self, model_dir: str, device: Union[str, None] = None, **kwargs: An
         # `device` needs to be set to "cuda" for GPU
         self.model = CrossEncoder(model_dir, device=device, **kwargs)
 
-    def __call__(self, sentences: Union[List[List[str]], List[Tuple[str, str]]]) -> Dict[str, List[float]]:
+    def __call__(
+        self, sentences: Union[Tuple[str, str], List[str], List[List[str]], List[Tuple[str, str]]]
+    ) -> Dict[str, List[float]]:
         scores = self.model.predict(sentences).tolist()
         return {"scores": scores}
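Usage implied by the new type-hints; the local checkpoint path is an assumption:

    # `device` needs to be set to "cuda" for GPU, as noted in the diffs
    embedder = SentenceEmbeddingPipeline(model_dir="/opt/model")
    embedder("Lets create an embedding")  # -> {"embeddings": [...]}

    similarity = SentenceSimilarityPipeline(model_dir="/opt/model")
    similarity(
        source_sentence="Lets create an embedding",
        sentences=["Lets create an embedding"],
    )  # -> {"similarities": [...]}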
From 9b4fc6784af0d1f32045cc30c2448bf228a5f899 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 15 Nov 2024 10:10:41 +0100
Subject: [PATCH 08/26] Add missing type-hints and clear code a bit

---
 src/huggingface_inference_toolkit/handler.py | 37 ++++++++------
 src/huggingface_inference_toolkit/utils.py   | 54 ++++++++------------
 2 files changed, 41 insertions(+), 50 deletions(-)

diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index 949abea0..bce48d96 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -1,8 +1,9 @@
 import os
 from pathlib import Path
-from typing import Optional, Union
+from typing import Any, Dict, Literal, Optional, Union
 
 from huggingface_inference_toolkit.const import HF_TRUST_REMOTE_CODE
+from huggingface_inference_toolkit.logging import logger
 from huggingface_inference_toolkit.sentence_transformers_utils import SENTENCE_TRANSFORMERS_TASKS
 from huggingface_inference_toolkit.utils import (
     check_and_register_custom_pipeline_from_directory,
@@ -13,10 +14,12 @@
 class HuggingFaceHandler:
     """
     A Default Hugging Face Inference Handler which works with all
-    transformers pipelines, Sentence Transformers and Optimum.
+    Transformers, Diffusers, Sentence Transformers and Optimum pipelines.
     """
 
-    def __init__(self, model_dir: Union[str, Path], task=None, framework="pt"):
+    def __init__(
+        self, model_dir: Union[str, Path], task: Union[str, None] = None, framework: Literal["pt"] = "pt"
+    ) -> None:
         self.pipeline = get_pipeline(
             model_dir=model_dir,  # type: ignore
             task=task,  # type: ignore
@@ -24,7 +27,7 @@ def __init__(self, model_dir: Union[str, Path], task=None, framework="pt"):
             trust_remote_code=HF_TRUST_REMOTE_CODE,
         )
 
-    def __call__(self, data):
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """
         Handles an inference request with input data and makes a prediction.
         Args:
@@ -69,7 +72,6 @@ def __call__(self, data):
                         parameters.pop(p)
                         logger.warning(f"provided parameter `{p}`, but it's not supported.")
 
-
             if self.pipeline.task.__contains__("zero-shot-classification"):
                 if "candidateLabels" in inputs:
                     inputs["candidate_labels"] = inputs.pop("candidateLabels")
@@ -81,9 +83,9 @@ def __call__(self, data):
                     "`candidate_labels` or `candidateLabels`."
                 )
 
-        if isinstance(inputs, dict):
-            return self.pipeline(**inputs, **parameters)
-        return self.pipeline(inputs, **parameters)
+        return (
+            self.pipeline(**inputs, **parameters) if isinstance(inputs, dict) else self.pipeline(inputs, **parameters)  # type: ignore
+        )
 
 
 class VertexAIHandler(HuggingFaceHandler):
@@ -92,10 +94,12 @@ class VertexAIHandler(HuggingFaceHandler):
     Vertex AI specific logic for inference.
     """
 
-    def __init__(self, model_dir: Union[str, Path], task=None, framework="pt"):
-        super().__init__(model_dir, task, framework)
+    def __init__(
+        self, model_dir: Union[str, Path], task: Union[str, None] = None, framework: Literal["pt"] = "pt"
+    ) -> None:
+        super().__init__(model_dir=model_dir, task=task, framework=framework)
 
-    def __call__(self, data):
+    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """
         Handles an inference request with input data and makes a prediction.
         Args:
@@ -116,7 +120,7 @@ def __call__(self, data):
         return {"predictions": predictions}
 
 
-def get_inference_handler_either_custom_or_default_handler(model_dir: Path, task: Optional[str] = None):
+def get_inference_handler_either_custom_or_default_handler(model_dir: Path, task: Optional[str] = None) -> Any:
     """
     Returns the appropriate inference handler based on the given model
     directory and task.
@@ -128,9 +132,10 @@ def get_inference_handler_either_custom_or_default_handler(model_dir: Path, task
         InferenceHandler: The appropriate inference handler based on the given model directory and task.
     """
     custom_pipeline = check_and_register_custom_pipeline_from_directory(model_dir)
-    if custom_pipeline:
+    if custom_pipeline is not None:
         return custom_pipeline
-    elif os.environ.get("AIP_MODE", None) == "PREDICTION":
+
+    if os.environ.get("AIP_MODE", None) == "PREDICTION":
         return VertexAIHandler(model_dir=model_dir, task=task)
-    else:
-        return HuggingFaceHandler(model_dir=model_dir, task=task)
+
+    return HuggingFaceHandler(model_dir=model_dir, task=task)

diff --git a/src/huggingface_inference_toolkit/utils.py b/src/huggingface_inference_toolkit/utils.py
index acb368b9..8dc644c7 100644
--- a/src/huggingface_inference_toolkit/utils.py
+++ b/src/huggingface_inference_toolkit/utils.py
@@ -134,9 +134,7 @@ def _load_repository_from_hf(
 
     # create regex to only include the framework specific weights
     ignore_regex = create_artifact_filter(framework)
-    logger.info(
-        f"Ignore regex pattern for files, which are not downloaded: { ', '.join(ignore_regex) }"
-    )
+    logger.info(f"Ignore regex pattern for files, which are not downloaded: { ', '.join(ignore_regex) }")
 
     # Download the repository to the workdir and filter out non-framework
     # specific weights
@@ -177,9 +175,7 @@ def check_and_register_custom_pipeline_from_directory(model_dir):
             Please update to the new format.
             See documentation for more information."""
         )
-        spec = importlib.util.spec_from_file_location(
-            "pipeline.PreTrainedPipeline", legacy_module
-        )
+        spec = importlib.util.spec_from_file_location("pipeline.PreTrainedPipeline", legacy_module)
         if spec:
             # add the whole directory to path for submodlues
             sys.path.insert(0, model_dir)
@@ -215,17 +211,21 @@ def get_pipeline(
     """
     create pipeline class for a specific task based on local saved model
    """
-    device = get_device()
-    if is_optimum_neuron_available():
-        logger.info("Using device Neuron")
-    else:
-        logger.info(f"Using device {'GPU' if device == 0 else 'CPU'}")
-
     if task is None:
         raise EnvironmentError(
             "The task for this model is not set: Please set one: https://huggingface.co/docs#how-is-a-models-type-of-inference-api-and-widget-determined"
         )
 
+    if task == "conversational":
+        task = "text-generation"
+
+    if is_optimum_neuron_available():
+        logger.info("Using device Neuron")
+        return get_optimum_neuron_pipeline(task=task, model_dir=model_dir)
+
+    device = get_device()
+    logger.info(f"Using device {'GPU' if device == 0 else 'CPU'}")
+
     # define tokenizer or feature extractor as kwargs to load it the pipeline
     # correctly
     if task in {
@@ -237,41 +237,27 @@ def get_pipeline(
         "zero-shot-image-classification",
     }:
         kwargs["feature_extractor"] = model_dir
-    elif task in {"image-to-text", "text-to-image"}:
-        pass
-    elif task == "conversational":
-        task = "text-generation"
-    else:
+    elif task not in {"image-to-text", "text-to-image"}:
         kwargs["tokenizer"] = model_dir
 
-    if is_optimum_neuron_available():
-        hf_pipeline = get_optimum_neuron_pipeline(task=task, model_dir=model_dir)
-    elif is_sentence_transformers_available() and task in [
+    if is_sentence_transformers_available() and task in [
         "sentence-similarity",
         "sentence-embeddings",
         "sentence-ranking",
     ]:
-        hf_pipeline = get_sentence_transformers_pipeline(
-            task=task, model_dir=model_dir, device=device, **kwargs
-        )
+        hf_pipeline = get_sentence_transformers_pipeline(task=task, model_dir=model_dir, device=device, **kwargs)
     elif is_diffusers_available() and task == "text-to-image":
-        hf_pipeline = get_diffusers_pipeline(
-            task=task, model_dir=model_dir, device=device, **kwargs
-        )
+        hf_pipeline = get_diffusers_pipeline(task=task, model_dir=model_dir, device=device, **kwargs)
     else:
         hf_pipeline = pipeline(task=task, model=model_dir, device=device, **kwargs)
 
-    if task == "automatic-speech-recognition" and isinstance(
-        hf_pipeline.model, WhisperForConditionalGeneration
-    ):
+    if task == "automatic-speech-recognition" and isinstance(hf_pipeline.model, WhisperForConditionalGeneration):
         # set chunk length to 30s for whisper to enable long audio files
         hf_pipeline._preprocess_params["chunk_length_s"] = 30
-        hf_pipeline.model.config.forced_decoder_ids = (
-            hf_pipeline.tokenizer.get_decoder_prompt_ids(
-                language="english", task="transcribe"
-            )
+        hf_pipeline.model.config.forced_decoder_ids = hf_pipeline.tokenizer.get_decoder_prompt_ids(
+            language="english", task="transcribe"
         )
-    return hf_pipeline
+
+    return hf_pipeline  # type: ignore
 
 
 def convert_params_to_int_or_bool(params):

From c1d519a1abe3ca2c0b7876807570fb6399a06823 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 15 Nov 2024 10:10:53 +0100
Subject: [PATCH 09/26] Fix failing `sentence-transformers` tests due to input parsing

---
 tests/unit/test_sentence_transformers.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/tests/unit/test_sentence_transformers.py b/tests/unit/test_sentence_transformers.py
index 635f39de..23c08e58 100644
--- a/tests/unit/test_sentence_transformers.py
+++ b/tests/unit/test_sentence_transformers.py
@@ -15,9 +15,7 @@
 @require_torch
 def test_get_sentence_transformers_pipeline():
     with tempfile.TemporaryDirectory() as tmpdirname:
-        storage_dir = _load_repository_from_hf(
-            "sentence-transformers/all-MiniLM-L6-v2", tmpdirname
-        )
+        storage_dir = _load_repository_from_hf("sentence-transformers/all-MiniLM-L6-v2", tmpdirname)
         pipe = get_pipeline("sentence-embeddings", storage_dir.as_posix())
         assert isinstance(pipe, SentenceEmbeddingPipeline)
 
@@ -25,9 +23,7 @@ def test_get_sentence_transformers_pipeline():
 @require_torch
 def test_sentence_embedding_task():
     with tempfile.TemporaryDirectory() as tmpdirname:
-        storage_dir = _load_repository_from_hf(
-            "sentence-transformers/all-MiniLM-L6-v2", tmpdirname
-        )
+        storage_dir = _load_repository_from_hf("sentence-transformers/all-MiniLM-L6-v2", tmpdirname)
         pipe = get_sentence_transformers_pipeline("sentence-embeddings", storage_dir.as_posix())
         res = pipe("Lets create an embedding")
         assert isinstance(res["embeddings"], list)
@@ -36,11 +32,9 @@ def test_sentence_embedding_task():
 @require_torch
 def test_sentence_similarity():
     with tempfile.TemporaryDirectory() as tmpdirname:
-        storage_dir = _load_repository_from_hf(
-            "sentence-transformers/all-MiniLM-L6-v2", tmpdirname
-        )
+        storage_dir = _load_repository_from_hf("sentence-transformers/all-MiniLM-L6-v2", tmpdirname)
         pipe = get_sentence_transformers_pipeline("sentence-similarity", storage_dir.as_posix())
-        res = pipe({"source_sentence": "Lets create an embedding", "sentences": ["Lets create an embedding"]})
+        res = pipe(**{"source_sentence": "Lets create an embedding", "sentences": ["Lets create an embedding"]})
         assert isinstance(res["similarities"], list)
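The dispatch that PATCH 08 settles on, sketched for clarity (path and task are illustrative):

    handler = HuggingFaceHandler(model_dir="/opt/model", task="text-classification")
    # dict inputs are unpacked as keyword arguments:
    #   self.pipeline(**inputs, **parameters)
    # anything else is passed positionally:
    #   self.pipeline(inputs, **parameters)
    prediction = handler({"inputs": "My name is Wolfgang and I live in Berlin"})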
From 7f0d84d16df9c41867dd9c3f7e0637f188014af1 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 15 Nov 2024 10:32:59 +0100
Subject: [PATCH 10/26] Fix "table-question-answering" payload check

---
 src/huggingface_inference_toolkit/handler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index bce48d96..0ef71840 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -50,7 +50,7 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
             if self.pipeline.task == "table-question-answering":
                 if "question" in inputs:
                     inputs["query"] = inputs.pop("question")
-                if not all(k in inputs for k in {"table", "question"}):
+                if not all(k in inputs for k in {"table", "query"}):
                     raise ValueError(
                         f"{self.pipeline.task} expects `inputs` to contain `table` and either `question` or `query`"
                         " as the input parameters."

From 307b27f9135c2d42169acf71e47c7aa5dda49d93 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 15 Nov 2024 10:59:41 +0100
Subject: [PATCH 11/26] Fix "zero-shot-classification" payload check

---
 src/huggingface_inference_toolkit/handler.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index 0ef71840..002cd313 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -73,14 +73,16 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
                     logger.warning(f"provided parameter `{p}`, but it's not supported.")
 
             if self.pipeline.task.__contains__("zero-shot-classification"):
-                if "candidateLabels" in inputs:
-                    inputs["candidate_labels"] = inputs.pop("candidateLabels")
+                if "candidateLabels" in parameters:
+                    parameters["candidate_labels"] = parameters.pop("candidateLabels")
                 if "text" in inputs:
                     inputs["sequences"] = inputs.pop("text")
-                if not all(k in inputs for k in {"sequences", "candidate_labels"}):
+                if not all(k in inputs for k in {"sequences", "parameters"}) or not all(
+                    k in parameters for k in {"candidate_labels"}
+                ):
                     raise ValueError(
-                        f"{self.pipeline.task} expects `inputs` to contain either `text` or `sequences` and either "
-                        "`candidate_labels` or `candidateLabels`."
+                        f"{self.pipeline.task} expects `inputs` to contain either `text` or `sequences` and "
+                        "`parameters` to contain either `candidate_labels` or `candidateLabels`."
                     )
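A `table-question-answering` payload that passes the corrected `{"table", "query"}` check (table contents are illustrative):

    data = {
        "inputs": {
            "question": "How many stars does the transformers repository have?",
            "table": {
                "Repository": ["Transformers", "Datasets", "Tokenizers"],
                "Stars": ["36542", "4512", "3934"],
            },
        }
    }
    # "question" is renamed to "query" before the check, so both spellings work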
From d3d2b5ede3c6dd0dc9462e152e2e6eb389d8173e Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 15 Nov 2024 11:30:05 +0100
Subject: [PATCH 12/26] Check that payload is `dict` in advance

---
 src/huggingface_inference_toolkit/handler.py | 27 ++++++++++++--------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index 002cd313..a0b4a395 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -43,17 +43,17 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
                 not isinstance(inputs, dict) or not all(k in inputs for k in {"question", "context"})
             ):
                 raise ValueError(
-                    f"{self.pipeline.task} expects `inputs` to contain both `question` and `context` as the keys, "
-                    "both of them being either a `str` or a `List[str]`."
+                    f"{self.pipeline.task} expects `inputs` to be a dict containing both `question` and "
+                    "`context` as the keys, both of them being either a `str` or a `List[str]`."
                 )
 
             if self.pipeline.task == "table-question-answering":
-                if "question" in inputs:
+                if isinstance(inputs, dict) and "question" in inputs:
                     inputs["query"] = inputs.pop("question")
-                if not all(k in inputs for k in {"table", "query"}):
+                if not isinstance(inputs, dict) or not all(k in inputs for k in {"table", "query"}):
                     raise ValueError(
-                        f"{self.pipeline.task} expects `inputs` to contain `table` and either `question` or `query`"
-                        " as the input parameters."
+                        f"{self.pipeline.task} expects `inputs` to be a dict containing the keys `table` and "
+                        "either `question` or `query`."
                     )
@@ -75,14 +75,19 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
             if self.pipeline.task.__contains__("zero-shot-classification"):
                 if "candidateLabels" in parameters:
                     parameters["candidate_labels"] = parameters.pop("candidateLabels")
-                if "text" in inputs:
+                if not isinstance(inputs, dict) and (isinstance(inputs, str) or isinstance(inputs, list)):
+                    inputs = {"sequences": inputs}
+                if isinstance(inputs, dict) and "text" in inputs:
                     inputs["sequences"] = inputs.pop("text")
-                if not all(k in inputs for k in {"sequences", "parameters"}) or not all(
-                    k in parameters for k in {"candidate_labels"}
+                if (
+                    not isinstance(inputs, dict)
+                    or not all(k in inputs for k in {"sequences", "parameters"})
+                    or not all(k in parameters for k in {"candidate_labels"})
                 ):
                     raise ValueError(
-                        f"{self.pipeline.task} expects `inputs` to contain either `text` or `sequences` and "
-                        "`parameters` to contain either `candidate_labels` or `candidateLabels`."
+                        f"{self.pipeline.task} expects `inputs` to be a dict containing the key `text` or "
+                        "`sequences`, and `parameters` to be another dict containing either `candidate_labels` "
+                        "or `candidateLabels`."
                     )

From 64cbeb10219036a314a9da6a8630ba710dff5727 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 15 Nov 2024 15:49:39 +0100
Subject: [PATCH 13/26] Fix `HuggingFaceHandler` errors and checks

---
 src/huggingface_inference_toolkit/handler.py | 35 ++++++++++----------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index a0b4a395..72e68382 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -39,18 +39,21 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
 
         # sentence transformers pipelines do not have the `task` arg
         if not any(isinstance(self.pipeline, v) for v in SENTENCE_TRANSFORMERS_TASKS.values()):
-            if self.pipeline.task == "question-answering" and (
-                not isinstance(inputs, dict) or not all(k in inputs for k in {"question", "context"})
-            ):
-                raise ValueError(
-                    f"{self.pipeline.task} expects `inputs` to be a dict containing both `question` and "
-                    "`context` as the keys, both of them being either a `str` or a `List[str]`."
-                )
+            if self.pipeline.task == "question-answering":
+                if not isinstance(inputs, dict):
+                    raise ValueError(f"inputs must be a dict, but a `{type(inputs)}` was provided instead.")
+                if not all(k in inputs for k in {"question", "context"}):
+                    raise ValueError(
+                        f"{self.pipeline.task} expects `inputs` to be a dict containing both `question` and "
+                        "`context` as the keys, both of them being either a `str` or a `List[str]`."
+                    )
 
             if self.pipeline.task == "table-question-answering":
-                if isinstance(inputs, dict) and "question" in inputs:
+                if not isinstance(inputs, dict):
+                    raise ValueError(f"inputs must be a dict, but a `{type(inputs)}` was provided instead.")
+                if "question" in inputs:
                     inputs["query"] = inputs.pop("question")
-                if not isinstance(inputs, dict) or not all(k in inputs for k in {"table", "query"}):
+                if not all(k in inputs for k in {"table", "query"}):
                     raise ValueError(
                         f"{self.pipeline.task} expects `inputs` to be a dict containing the keys `table` and "
                         "either `question` or `query`."
@@ -75,18 +78,16 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
             if self.pipeline.task.__contains__("zero-shot-classification"):
                 if "candidateLabels" in parameters:
                     parameters["candidate_labels"] = parameters.pop("candidateLabels")
-                if not isinstance(inputs, dict) and (isinstance(inputs, str) or isinstance(inputs, list)):
+                if not isinstance(inputs, dict):
                     inputs = {"sequences": inputs}
-                if isinstance(inputs, dict) and "text" in inputs:
+                if "text" in inputs:
                     inputs["sequences"] = inputs.pop("text")
-                if (
-                    not isinstance(inputs, dict)
-                    or not all(k in inputs for k in {"sequences", "parameters"})
-                    or not all(k in parameters for k in {"candidate_labels"})
+                if not all(k in inputs for k in {"sequences"}) or not all(
+                    k in parameters for k in {"candidate_labels"}
                 ):
                     raise ValueError(
-                        f"{self.pipeline.task} expects `inputs` to be a dict containing the key `text` or "
-                        "`sequences`, and `parameters` to be another dict containing either `candidate_labels` "
-                        "or `candidateLabels`."
+                        f"{self.pipeline.task} expects `inputs` to be either a string or a dict containing the "
+                        "key `text` or `sequences`, and `parameters` to be a dict containing either `candidate_labels` "
+                        "or `candidateLabels`."
                     )

From 8cbd4bec8fb3f4777e7a05e31ff7ff20bd8cc2ab Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Fri, 15 Nov 2024 16:20:35 +0100
Subject: [PATCH 14/26] Fix `sentence-transformers` pipelines as those don't have parameters

---
 src/huggingface_inference_toolkit/handler.py | 104 +++++++++----------
 1 file changed, 52 insertions(+), 52 deletions(-)

diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index 72e68382..617ca789 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -38,58 +38,58 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         parameters = data.pop("parameters", {})
 
         # sentence transformers pipelines do not have the `task` arg
-        if not any(isinstance(self.pipeline, v) for v in SENTENCE_TRANSFORMERS_TASKS.values()):
-            if self.pipeline.task == "question-answering":
-                if not isinstance(inputs, dict):
-                    raise ValueError(f"inputs must be a dict, but a `{type(inputs)}` was provided instead.")
-                if not all(k in inputs for k in {"question", "context"}):
-                    raise ValueError(
-                        f"{self.pipeline.task} expects `inputs` to be a dict containing both `question` and "
-                        "`context` as the keys, both of them being either a `str` or a `List[str]`."
-                    )
-
-            if self.pipeline.task == "table-question-answering":
-                if not isinstance(inputs, dict):
-                    raise ValueError(f"inputs must be a dict, but a `{type(inputs)}` was provided instead.")
-                if "question" in inputs:
-                    inputs["query"] = inputs.pop("question")
-                if not all(k in inputs for k in {"table", "query"}):
-                    raise ValueError(
-                        f"{self.pipeline.task} expects `inputs` to be a dict containing the keys `table` and "
-                        "either `question` or `query`."
-                    )
-
-            if self.pipeline.task in {"token-classification", "ner"}:
-                # stride and aggregation_strategy are defined on `pipeline` init, but in the Inference API those
-                # are provided on each request instead
-                for p in {"stride", "aggregation_strategy"}:
-                    if p in parameters:
-                        parameters.pop(p)
-                        logger.warning(f"provided parameter `{p}`, but it's not supported.")
-
-            if self.pipeline.task.__contains__("translation"):
-                # truncation and generate_parameters are used on Inference API but not available on
-                # `TranslationPipeline.__call__` method
-                for p in {"truncation", "generate_parameters"}:
-                    if p in parameters:
-                        parameters.pop(p)
-                        logger.warning(f"provided parameter `{p}`, but it's not supported.")
-
-            if self.pipeline.task.__contains__("zero-shot-classification"):
-                if "candidateLabels" in parameters:
-                    parameters["candidate_labels"] = parameters.pop("candidateLabels")
-                if not isinstance(inputs, dict):
-                    inputs = {"sequences": inputs}
-                if "text" in inputs:
-                    inputs["sequences"] = inputs.pop("text")
-                if not all(k in inputs for k in {"sequences"}) or not all(k in parameters for k in {"candidate_labels"}):
-                    raise ValueError(
-                        f"{self.pipeline.task} expects `inputs` to be either a string or a dict containing the "
-                        "key `text` or `sequences`, and `parameters` to be a dict containing either `candidate_labels` "
-                        "or `candidateLabels`."
-                    )
+        if any(isinstance(self.pipeline, v) for v in SENTENCE_TRANSFORMERS_TASKS.values()):
+            return self.pipeline(**inputs) if isinstance(inputs, dict) else self.pipeline(inputs)  # type: ignore
+
+        if self.pipeline.task == "question-answering":
+            if not isinstance(inputs, dict):
+                raise ValueError(f"inputs must be a dict, but a `{type(inputs)}` was provided instead.")
+            if not all(k in inputs for k in {"question", "context"}):
+                raise ValueError(
+                    f"{self.pipeline.task} expects `inputs` to be a dict containing both `question` and "
+                    "`context` as the keys, both of them being either a `str` or a `List[str]`."
+                )
+
+        if self.pipeline.task == "table-question-answering":
+            if not isinstance(inputs, dict):
+                raise ValueError(f"inputs must be a dict, but a `{type(inputs)}` was provided instead.")
+            if "question" in inputs:
+                inputs["query"] = inputs.pop("question")
+            if not all(k in inputs for k in {"table", "query"}):
+                raise ValueError(
+                    f"{self.pipeline.task} expects `inputs` to be a dict containing the keys `table` and "
+                    "either `question` or `query`."
+                )
+
+        if self.pipeline.task in {"token-classification", "ner"}:
+            # stride and aggregation_strategy are defined on `pipeline` init, but in the Inference API those
+            # are provided on each request instead
+            for p in {"stride", "aggregation_strategy"}:
+                if p in parameters:
+                    parameters.pop(p)
+                    logger.warning(f"provided parameter `{p}`, but it's not supported.")
+
+        if self.pipeline.task.__contains__("translation"):
+            # truncation and generate_parameters are used on Inference API but not available on
+            # `TranslationPipeline.__call__` method
+            for p in {"truncation", "generate_parameters"}:
+                if p in parameters:
+                    parameters.pop(p)
+                    logger.warning(f"provided parameter `{p}`, but it's not supported.")
+
+        if self.pipeline.task.__contains__("zero-shot-classification"):
+            if "candidateLabels" in parameters:
+                parameters["candidate_labels"] = parameters.pop("candidateLabels")
+            if not isinstance(inputs, dict):
+                inputs = {"sequences": inputs}
+            if "text" in inputs:
+                inputs["sequences"] = inputs.pop("text")
+            if not all(k in inputs for k in {"sequences"}) or not all(k in parameters for k in {"candidate_labels"}):
+                raise ValueError(
+                    f"{self.pipeline.task} expects `inputs` to be either a string or a dict containing the "
+                    "key `text` or `sequences`, and `parameters` to be a dict containing either `candidate_labels` "
+                    "or `candidateLabels`."
+                )
 
         return (
             self.pipeline(**inputs, **parameters) if isinstance(inputs, dict) else self.pipeline(inputs, **parameters)  # type: ignore
         )
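After PATCH 12-14, the `zero-shot-classification` path accepts either shape; an illustrative request:

    data = {
        "inputs": "Hi, I recently bought a device from your company",
        "parameters": {"candidateLabels": ["refund", "legal", "faq"]},
    }
    # a bare string is wrapped into {"sequences": ...}, and `candidateLabels`
    # is normalized to `candidate_labels` inside `parameters`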
"philschmid/custom-pipeline-text-classification", tmpdirname, framework="pytorch", ) - h = get_inference_handler_either_custom_or_default_handler( - str(storage_dir), task="custom" - ) - assert h(INPUT) == INPUT + h = get_inference_handler_either_custom_or_default_handler(str(storage_dir), task="custom") + assert h(input_data) == input_data @require_torch -def test_pt_sentence_transformers_pipeline(): +def test_pt_sentence_transformers_pipeline(input_data: Dict[str, str]) -> None: with tempfile.TemporaryDirectory() as tmpdirname: storage_dir = _load_repository_from_hf( "sentence-transformers/all-MiniLM-L6-v2", tmpdirname, framework="pytorch" ) - h = get_inference_handler_either_custom_or_default_handler( - str(storage_dir), task="sentence-embeddings" - ) - pred = h(INPUT) + h = get_inference_handler_either_custom_or_default_handler(str(storage_dir), task="sentence-embeddings") + pred = h(input_data) assert isinstance(pred["embeddings"], list) @require_tf def test_tf_get_device(): - with tempfile.TemporaryDirectory() as tmpdirname: # https://github.com/huggingface/infinity/blob/test-ovh/test/integ/utils.py - storage_dir = _load_repository_from_hf( - MODEL, tmpdirname, framework="tensorflow" - ) + storage_dir = _load_repository_from_hf(MODEL, tmpdirname, framework="tensorflow") h = HuggingFaceHandler(model_dir=str(storage_dir), task=TASK) if _is_gpu_available(): assert h.pipeline.device == 0 @@ -86,33 +86,27 @@ def test_tf_get_device(): @require_tf -def test_tf_predict_call(): +def test_tf_predict_call(input_data: Dict[str, str]) -> None: with tempfile.TemporaryDirectory() as tmpdirname: # https://github.com/huggingface/infinity/blob/test-ovh/test/integ/utils.py - storage_dir = _load_repository_from_hf( - MODEL, tmpdirname, framework="tensorflow" - ) - handler = HuggingFaceHandler( - model_dir=str(storage_dir), task=TASK, framework="tf" - ) + storage_dir = _load_repository_from_hf(MODEL, tmpdirname, framework="tensorflow") + handler = HuggingFaceHandler(model_dir=str(storage_dir), task=TASK, framework="tf") - prediction = handler(INPUT) + prediction = handler(input_data) assert "label" in prediction[0] assert "score" in prediction[0] @require_tf -def test_tf_custom_pipeline(): +def test_tf_custom_pipeline(input_data: Dict[str, str]) -> None: with tempfile.TemporaryDirectory() as tmpdirname: storage_dir = _load_repository_from_hf( "philschmid/custom-pipeline-text-classification", tmpdirname, framework="tensorflow", ) - h = get_inference_handler_either_custom_or_default_handler( - str(storage_dir), task="custom" - ) - assert h(INPUT) == INPUT + h = get_inference_handler_either_custom_or_default_handler(str(storage_dir), task="custom") + assert h(input_data) == input_data @require_tf @@ -123,6 +117,4 @@ def test_tf_sentence_transformers_pipeline(): "sentence-transformers/all-MiniLM-L6-v2", tmpdirname, framework="tensorflow" ) with pytest.raises(Exception) as _exc_info: - get_inference_handler_either_custom_or_default_handler( - str(storage_dir), task="sentence-embeddings" - ) + get_inference_handler_either_custom_or_default_handler(str(storage_dir), task="sentence-embeddings") From b9dbf58f41505be3919c60633e1e5938070ae64d Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Fri, 15 Nov 2024 16:53:42 +0100 Subject: [PATCH 16/26] Fix quality in `tests/unit/test_handler.py` --- tests/unit/test_handler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/unit/test_handler.py b/tests/unit/test_handler.py index 
531ac5fc..2935d6e7 100644 --- a/tests/unit/test_handler.py +++ b/tests/unit/test_handler.py @@ -1,7 +1,7 @@ import tempfile +from typing import Dict import pytest -from typing import Dict from transformers.testing_utils import require_tf, require_torch from huggingface_inference_toolkit.handler import ( @@ -12,7 +12,6 @@ _is_gpu_available, _load_repository_from_hf, ) -from huggingface_inference_toolkit.logging import logger TASK = "text-classification" MODEL = "hf-internal-testing/tiny-random-distilbert" From d764e4489f589a3ec2e45f396d23f7865ec49a08 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Mon, 18 Nov 2024 11:30:25 +0100 Subject: [PATCH 17/26] Make `parameters` default to empty dict instead of None --- src/huggingface_inference_toolkit/handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py index 617ca789..85d6124a 100644 --- a/src/huggingface_inference_toolkit/handler.py +++ b/src/huggingface_inference_toolkit/handler.py @@ -116,7 +116,7 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: """ if "instances" not in data: raise ValueError("The request body must contain a key 'instances' with a list of instances.") - parameters = data.pop("parameters", None) + parameters = data.pop("parameters", {}) predictions = [] # iterate over all instances and make predictions From 5fbe5afe1f499c3f6a73513981f7a662ba47f5fb Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Mon, 2 Dec 2024 15:25:54 +0100 Subject: [PATCH 18/26] Add note on `token-classification` / `ner` task Apparently the parameters are indeed supported via the `__call__` method of the `TokenClassificationPipeline` even if the docs say otherwise, since those are internally provided to the `_sanitize_parameters` function and then used within the `__call__` method instead of via the `__init__` --- src/huggingface_inference_toolkit/handler.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py index 85d6124a..64fac4bd 100644 --- a/src/huggingface_inference_toolkit/handler.py +++ b/src/huggingface_inference_toolkit/handler.py @@ -62,12 +62,10 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: ) if self.pipeline.task in {"token-classification", "ner"}: - # stride and aggregation_strategy are defined on `pipeline` init, but in the Inference API those - # are provided on each request instead - for p in {"stride", "aggregation_strategy"}: - if p in parameters: - parameters.pop(p) - logger.warning(f"provided parameter `{p}`, but it's not supported.") + # even though the parameters `stride`, `aggregation_strategy` and `ignore_labels` are not explicitly + # defined within the `transformers.TokenClassificationPipeline.__call__` method, those are indeed + # supported and can be used on both the Inferencen API and Inference Endpoints + pass if self.pipeline.task.__contains__("translation"): # truncation and generate_parameters are used on Inference API but not available on From 21ab873913ffd31f2a82db0296f19e43e0351fa9 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Mon, 2 Dec 2024 15:27:52 +0100 Subject: [PATCH 19/26] Update `version` in `setup.py` --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py 
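A request relying on the behavior described in PATCH 18's note (values illustrative):

    data = {
        "inputs": "My name is Wolfgang and I live in Berlin",
        "parameters": {"aggregation_strategy": "simple", "stride": 8},
    }
    # both keys reach TokenClassificationPipeline._sanitize_parameters via
    # `__call__`, so they no longer need to be popped with a warning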
From 21ab873913ffd31f2a82db0296f19e43e0351fa9 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Mon, 2 Dec 2024 15:27:52 +0100
Subject: [PATCH 19/26] Update `version` in `setup.py`

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 793e89a7..6b386750 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 # We don't declare our dependency on transformers here because we build with
 # different packages for different variants
 
-VERSION = "0.5.2"
+VERSION = "0.5.3"
 
 # Ubuntu packages
 # libsndfile1-dev: torchaudio requires the development version of the libsndfile package which can be installed via a system package manager. On Ubuntu it can be installed as follows: apt install libsndfile1-dev

From 7a225e2567950aa37dc88e0dc1fc7a448e25bcb1 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Tue, 3 Dec 2024 18:13:07 +0100
Subject: [PATCH 20/26] Fix `generate_kwargs` payload handling for text2text-based tasks

---
 src/huggingface_inference_toolkit/handler.py | 22 +++++++-------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index 64fac4bd..39cf9fcf 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -3,7 +3,6 @@
 from typing import Any, Dict, Literal, Optional, Union
 
 from huggingface_inference_toolkit.const import HF_TRUST_REMOTE_CODE
-from huggingface_inference_toolkit.logging import logger
 from huggingface_inference_toolkit.sentence_transformers_utils import SENTENCE_TRANSFORMERS_TASKS
 from huggingface_inference_toolkit.utils import (
     check_and_register_custom_pipeline_from_directory,
@@ -61,19 +60,14 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
 
-        if self.pipeline.task in {"token-classification", "ner"}:
-            # even though the parameters `stride`, `aggregation_strategy` and `ignore_labels` are not explicitly
-            # defined within the `transformers.TokenClassificationPipeline.__call__` method, those are indeed
-            # supported and can be used on both the Inference API and Inference Endpoints
-            pass
-
-        if self.pipeline.task.__contains__("translation"):
-            # truncation and generate_parameters are used on Inference API but not available on
-            # `TranslationPipeline.__call__` method
-            for p in {"truncation", "generate_parameters"}:
-                if p in parameters:
-                    parameters.pop(p)
-                    logger.warning(f"provided parameter `{p}`, but it's not supported.")
+        if self.pipeline.task.__contains__("translation") or self.pipeline.task in {"text-generation"}:
+            # eventually `transformers` will update it to be named `generation_parameters`, in the meantime, in
+            # the current version pinned, it's not supported yet; and is still named `generate_kwargs`
+            # and the Inference API is using `generate_parameters`
+            if "generation_parameters" in parameters:
+                parameters["generate_kwargs"] = parameters.pop("generation_parameters")
+            if "generate_parameters" in parameters:
+                parameters["generate_kwargs"] = parameters.pop("generate_parameters")

From cd9ebe786be9d81cc8a93c925be92c01cbb6edaf Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Tue, 3 Dec 2024 21:10:03 +0100
Subject: [PATCH 21/26] Fix `generate_kwargs` handling to move to flatten first-level dict
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Célina
---
 src/huggingface_inference_toolkit/handler.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index 39cf9fcf..6496f5d4 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -68,6 +68,10 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
                 parameters["generate_kwargs"] = parameters.pop("generation_parameters")
             if "generate_parameters" in parameters:
                 parameters["generate_kwargs"] = parameters.pop("generate_parameters")
+            generate_kwargs = parameters.pop("generate_kwargs", {})
+            # flatten the values of `generate_kwargs` as it's not supported as is, but via top-level parameters
+            for key, value in generate_kwargs.items():
+                parameters[key] = value

From 280101d22537f387047b149da1393ac70b21c9d7 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Tue, 3 Dec 2024 21:41:47 +0100
Subject: [PATCH 22/26] Update `generate_kwargs` handling as sometimes required

---
 src/huggingface_inference_toolkit/handler.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/huggingface_inference_toolkit/handler.py b/src/huggingface_inference_toolkit/handler.py
index 6496f5d4..c421df13 100644
--- a/src/huggingface_inference_toolkit/handler.py
+++ b/src/huggingface_inference_toolkit/handler.py
@@ -60,16 +60,21 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
                     "either `question` or `query`."
                 )
 
-        if self.pipeline.task.__contains__("translation") or self.pipeline.task in {"text-generation"}:
-            # eventually `transformers` will update it to be named `generation_parameters`, in the meantime, in
-            # the current version pinned, it's not supported yet; and is still named `generate_kwargs`
-            # and the Inference API is using `generate_parameters`
+        if self.pipeline.task.__contains__("translation") or self.pipeline.task in {
+            "text-generation",
+            "image-to-text",
+            "automatic-speech-recognition",
+            "text-to-audio",
+        }:
+            # `generate_kwargs` needs to be a dict, `generation_parameters` or `generate` are not valid names
             if "generation_parameters" in parameters:
                 parameters["generate_kwargs"] = parameters.pop("generation_parameters")
-            if "generate_parameters" in parameters:
-                parameters["generate_kwargs"] = parameters.pop("generate_parameters")
-            generate_kwargs = parameters.pop("generate_kwargs", {})
+            if "generate" in parameters:
+                parameters["generate_kwargs"] = parameters.pop("generate")
+
+        if self.pipeline.task.__contains__("translation") or self.pipeline.task in {"text-generation"}:
             # flatten the values of `generate_kwargs` as it's not supported as is, but via top-level parameters
+            generate_kwargs = parameters.pop("generate_kwargs", {})
             for key, value in generate_kwargs.items():
                 parameters[key] = value
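A sketch of the flattening that PATCH 21-22 implement for translation and `text-generation` payloads (parameter values illustrative):

    data = {
        "inputs": "My name is Wolfgang and I live in Berlin",
        "parameters": {"generate_kwargs": {"max_new_tokens": 32, "do_sample": False}},
    }
    # `generate_kwargs` is popped and its items promoted to top-level kwargs:
    #   pipeline(inputs, max_new_tokens=32, do_sample=False)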
"automatic-speech-recognition", "text-to-audio", + "text-to-speech", }: - # `generate_kwargs` needs to be a dict, `generation_parameters` or `generate` are not valid names + # `generate_kwargs` needs to be a dict, `generation_parameters` is here for forward compatibility if "generation_parameters" in parameters: parameters["generate_kwargs"] = parameters.pop("generation_parameters") - if "generate" in parameters: - parameters["generate_kwargs"] = parameters.pop("generate") if self.pipeline.task.__contains__("translation") or self.pipeline.task in {"text-generation"}: # flatten the values of `generate_kwargs` as it's not supported as is, but via top-level parameters From 01cd7a8cce7c00434d436a87df3a256a86c024e9 Mon Sep 17 00:00:00 2001 From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com> Date: Wed, 4 Dec 2024 18:15:09 +0100 Subject: [PATCH 24/26] Update `SentenceRankingPipeline` to handle `query`-`texts` pipelines Also adds some extra validation steps --- .../sentence_transformers_utils.py | 40 ++++++++++++++++--- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/src/huggingface_inference_toolkit/sentence_transformers_utils.py b/src/huggingface_inference_toolkit/sentence_transformers_utils.py index 91eda151..0b345a2c 100644 --- a/src/huggingface_inference_toolkit/sentence_transformers_utils.py +++ b/src/huggingface_inference_toolkit/sentence_transformers_utils.py @@ -34,22 +34,52 @@ def __call__(self, sentences: Union[str, List[str]]) -> Dict[str, List[float]]: return {"embeddings": embeddings} -class RankingPipeline: +class SentenceRankingPipeline: def __init__(self, model_dir: str, device: Union[str, None] = None, **kwargs: Any) -> None: # `device` needs to be set to "cuda" for GPU self.model = CrossEncoder(model_dir, device=device, **kwargs) def __call__( - self, sentences: Union[Tuple[str, str], List[str], List[List[str]], List[Tuple[str, str]]] - ) -> Dict[str, List[float]]: - scores = self.model.predict(sentences).tolist() + self, + sentences: Union[Tuple[str, str], List[str], List[List[str]], List[Tuple[str, str]], None] = None, + query: Union[str, None] = None, + texts: Union[List[str], None] = None, + return_documents: bool = False, + ) -> Dict[str, List[Any]]: + if all(x is not None for x in [sentences, query, texts]): + raise ValueError( + f"The provided payload contains {sentences=} (i.e. 'inputs'), {query=}, and {texts=}" + " but all of those cannot be provided, you should provide either only 'sentences' i.e. 'inputs'" + " of both 'query' and 'texts' to run the ranking task." + ) + + if all(x is None for x in [sentences, query, texts]): + raise ValueError( + "No inputs have been provided within the input payload, make sure that the input payload" + " contains either 'sentences' i.e. 'inputs', or both 'query' and 'texts' to run the ranking task." 
+            )
+
+        if sentences is not None:
+            scores = self.model.predict(sentences).tolist()
+            return {"scores": scores}
+
+        if query is None or not isinstance(query, str):
+            raise ValueError(f"Provided {query=}, but a non-empty string should be provided instead.")
+
+        if texts is None or not isinstance(texts, list) or not all(isinstance(text, str) for text in texts):
+            raise ValueError(f"Provided {texts=}, but a list of non-empty strings should be provided instead.")
+
+        scores = self.model.rank(query, texts, return_documents=return_documents)
+        # rename "corpus_id" key to "index" for all scores to match TEI
+        for score in scores:
+            score["index"] = score.pop("corpus_id")  # type: ignore
         return {"scores": scores}

 SENTENCE_TRANSFORMERS_TASKS = {
     "sentence-similarity": SentenceSimilarityPipeline,
     "sentence-embeddings": SentenceEmbeddingPipeline,
-    "sentence-ranking": RankingPipeline,
+    "sentence-ranking": SentenceRankingPipeline,
 }

From 4ffcdfd9efa7c8823ca03ff4e86b6afbfa449ca7 Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Wed, 4 Dec 2024 18:35:24 +0100
Subject: [PATCH 25/26] Update typing and fix `sentence-transformers` tests

---
 .../sentence_transformers_utils.py       |  9 ++-
 tests/unit/test_sentence_transformers.py | 72 +++++++++++++++++--
 2 files changed, 72 insertions(+), 9 deletions(-)

diff --git a/src/huggingface_inference_toolkit/sentence_transformers_utils.py b/src/huggingface_inference_toolkit/sentence_transformers_utils.py
index 0b345a2c..0d648420 100644
--- a/src/huggingface_inference_toolkit/sentence_transformers_utils.py
+++ b/src/huggingface_inference_toolkit/sentence_transformers_utils.py
@@ -1,6 +1,11 @@
 import importlib.util
 from typing import Any, Dict, List, Tuple, Union

+try:
+    from typing import Literal
+except ImportError:
+    from typing_extensions import Literal
+
 _sentence_transformers = importlib.util.find_spec("sentence_transformers") is not None

@@ -45,7 +50,7 @@ def __call__(
         query: Union[str, None] = None,
         texts: Union[List[str], None] = None,
         return_documents: bool = False,
-    ) -> Dict[str, List[Any]]:
+    ) -> Union[Dict[str, List[float]], List[Dict[Literal["index", "score", "text"], Any]]]:
         if all(x is not None for x in [sentences, query, texts]):
             raise ValueError(
                 f"The provided payload contains {sentences=} (i.e. 'inputs'), {query=}, and {texts=}"
@@ -73,7 +78,7 @@ def __call__(
         # rename "corpus_id" key to "index" for all scores to match TEI
         for score in scores:
             score["index"] = score.pop("corpus_id")  # type: ignore
-        return {"scores": scores}
+        return scores  # type: ignore

 SENTENCE_TRANSFORMERS_TASKS = {
diff --git a/tests/unit/test_sentence_transformers.py b/tests/unit/test_sentence_transformers.py
index 23c08e58..e48533bc 100644
--- a/tests/unit/test_sentence_transformers.py
+++ b/tests/unit/test_sentence_transformers.py
@@ -1,5 +1,6 @@
 import tempfile

+import pytest
 from transformers.testing_utils import require_torch

 from huggingface_inference_toolkit.sentence_transformers_utils import (
@@ -25,8 +26,11 @@ def test_sentence_embedding_task():
     with tempfile.TemporaryDirectory() as tmpdirname:
         storage_dir = _load_repository_from_hf("sentence-transformers/all-MiniLM-L6-v2", tmpdirname)
         pipe = get_sentence_transformers_pipeline("sentence-embeddings", storage_dir.as_posix())
-        res = pipe("Lets create an embedding")
+        res = pipe(sentences="Lets create an embedding")
         assert isinstance(res["embeddings"], list)
+        res = pipe(sentences=["Lets create an embedding", "Lets create another embedding"])
+        assert isinstance(res["embeddings"], list)
+        assert len(res["embeddings"]) == 2

 @require_torch
 def test_sentence_similarity():
     with tempfile.TemporaryDirectory() as tmpdirname:
         storage_dir = _load_repository_from_hf("sentence-transformers/all-MiniLM-L6-v2", tmpdirname)
         pipe = get_sentence_transformers_pipeline("sentence-similarity", storage_dir.as_posix())
-        res = pipe(**{"source_sentence": "Lets create an embedding", "sentences": ["Lets create an embedding"]})
+        res = pipe(source_sentence="Lets create an embedding", sentences=["Lets create an embedding"])
         assert isinstance(res["similarities"], list)

 @require_torch
 def test_sentence_ranking():
     with tempfile.TemporaryDirectory() as tmpdirname:
         storage_dir = _load_repository_from_hf("cross-encoder/ms-marco-MiniLM-L-6-v2", tmpdirname)
         pipe = get_sentence_transformers_pipeline("sentence-ranking", storage_dir.as_posix())
         res = pipe(
-            [
-                ["Lets create an embedding", "Lets create an embedding"],
-                ["Lets create an embedding", "Lets create an embedding"],
+            sentences=[
+                ["Lets create an embedding", "Lets create another embedding"],
+                ["Lets create an embedding", "Lets create another embedding"],
             ]
         )
         assert isinstance(res["scores"], list)
+        res = pipe(sentences=["Lets create an embedding", "Lets create an embedding"])
+        assert isinstance(res["scores"], float)
+
+
+@require_torch
+def test_sentence_ranking_tei():
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        storage_dir = _load_repository_from_hf("cross-encoder/ms-marco-MiniLM-L-6-v2", tmpdirname, framework="pytorch")
+        pipe = get_sentence_transformers_pipeline("sentence-ranking", storage_dir.as_posix())
         res = pipe(
-            ["Lets create an embedding", "Lets create an embedding"],
+            query="Lets create an embedding",
+            texts=["Lets create an embedding", "I like noodles"],
         )
-        assert isinstance(res["scores"], float)
+        assert isinstance(res, list)
+        assert all(r.keys() == {"index", "score"} for r in res)
+
+        res = pipe(
+            query="Lets create an embedding",
+            texts=["Lets create an embedding", "I like noodles"],
+            return_documents=True,
+        )
+        assert isinstance(res, list)
+        assert all(r.keys() == {"index", "score", "text"} for r in res)
+
+
+@require_torch
+def test_sentence_ranking_validation_errors():
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        storage_dir = _load_repository_from_hf("cross-encoder/ms-marco-MiniLM-L-6-v2", tmpdirname, framework="pytorch")
+        pipe = get_sentence_transformers_pipeline("sentence-ranking", storage_dir.as_posix())
+
+        with pytest.raises(
+            ValueError,
+            match=(
+                "you should provide either only 'sentences' i.e. 'inputs' "
+                "or both 'query' and 'texts' to run the ranking task."
+            ),
+        ):
+            pipe(
+                sentences="Lets create an embedding",
+                query="Lets create an embedding",
+                texts=["Lets create an embedding", "I like noodles"],
+            )
+
+        with pytest.raises(
+            ValueError,
+            match=(
+                "No inputs have been provided within the input payload, make sure that the input payload "
+                "contains either 'sentences' i.e. 'inputs', or both 'query' and 'texts'"
+            ),
+        ):
+            pipe(sentences=None, query=None, texts=None)
+
+        with pytest.raises(
+            ValueError,
+            match=("Provided texts=None, but a list of non-empty strings should be provided instead."),
+        ):
+            pipe(sentences=None, query="Lets create an embedding", texts=None)

From 9d87331b2b79e6c2e8ae6f7c7e1239f720efeb4f Mon Sep 17 00:00:00 2001
From: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
Date: Thu, 12 Dec 2024 13:49:53 +0100
Subject: [PATCH 26/26] Upgrade `transformers`, `sentence-transformers` and `peft` dependencies

---
 setup.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 6b386750..7d8c4bb6 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
 # libavcodec-extra : libavcodec-extra includes additional codecs for ffmpeg

 install_requires = [
-    "transformers[sklearn,sentencepiece,audio,vision]==4.46.1",
+    "transformers[sklearn,sentencepiece,audio,vision]==4.47.0",
     "huggingface_hub[hf_transfer]==0.26.2",
     # vision
     "Pillow",
@@ -31,11 +31,11 @@

 extras = {}

-extras["st"] = ["sentence_transformers==3.2.1"]
-extras["diffusers"] = ["diffusers==0.31.0", "accelerate==1.0.1"]
+extras["st"] = ["sentence_transformers==3.3.1"]
+extras["diffusers"] = ["diffusers==0.31.0", "accelerate==1.1.0"]
 # Includes `peft` as PEFT requires `torch` so having `peft` as a core dependency
 # means that `torch` will be installed even if the `torch` extra is not specified.
-extras["torch"] = ["torch==2.3.1", "torchvision", "torchaudio", "peft==0.13.2"]
+extras["torch"] = ["torch==2.3.1", "torchvision", "torchaudio", "peft==0.14.0"]
 extras["test"] = [
     "pytest==7.2.1",
     "pytest-xdist",
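
With patches 24 and 25 applied, the `sentence-ranking` pipeline accepts either the original pair-based
payload or the TEI-style `query` plus `texts` payload. A minimal sketch to exercise both end to end,
assuming a local install with the `st` extra and that `_load_repository_from_hf` is importable from
`huggingface_inference_toolkit.utils`, as in the tests above:

    import tempfile

    from huggingface_inference_toolkit.sentence_transformers_utils import get_sentence_transformers_pipeline
    from huggingface_inference_toolkit.utils import _load_repository_from_hf

    with tempfile.TemporaryDirectory() as tmpdirname:
        storage_dir = _load_repository_from_hf("cross-encoder/ms-marco-MiniLM-L-6-v2", tmpdirname, framework="pytorch")
        pipe = get_sentence_transformers_pipeline("sentence-ranking", storage_dir.as_posix())

        # original payload: sentence pairs, returns {"scores": [...]}
        print(pipe(sentences=[["A query", "A relevant passage"], ["A query", "An unrelated passage"]]))

        # TEI-style payload: returns [{"index": ..., "score": ...}, ...]; each dict
        # also carries "text" when return_documents=True
        print(pipe(query="A query", texts=["A relevant passage", "An unrelated passage"], return_documents=True))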