Skip to content

Commit

Permalink
Merge branch 'main' of github.com:huggingface/huggingface_hub into hf…
Browse files Browse the repository at this point in the history
…fs-url
  • Loading branch information
mariosasko committed Feb 14, 2024
2 parents 582ebff + 0c272d5 commit ab935a4
Show file tree
Hide file tree
Showing 7 changed files with 1,833 additions and 3 deletions.
1 change: 1 addition & 0 deletions docs/source/en/guides/inference.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ has a simple API that supports the most common tasks. Here is a list of the curr
| Domain | Task | Supported | Documentation |
|--------|--------------------------------|--------------|------------------------------------|
| Audio | [Audio Classification](https://huggingface.co/tasks/audio-classification) | ✅ | [`~InferenceClient.audio_classification`] |
| Audio | [Audio-to-Audio](https://huggingface.co/tasks/audio-to-audio) | ✅ | [`~InferenceClient.audio_to_audio`] |
| | [Automatic Speech Recognition](https://huggingface.co/tasks/automatic-speech-recognition) | ✅ | [`~InferenceClient.automatic_speech_recognition`] |
| | [Text-to-Speech](https://huggingface.co/tasks/text-to-speech) | ✅ | [`~InferenceClient.text_to_speech`] |
| Computer Vision | [Image Classification](https://huggingface.co/tasks/image-classification) | ✅ | [`~InferenceClient.image_classification`] |
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ exclude = [
"dist",
"venv",
]
line-length = 119
# Ignored rules:
# "E501" -> line length violation
ignore = ["E501"]
line-length = 119
select = ["E", "F", "I", "W"]
lint.ignore = ["E501"]
lint.select = ["E", "F", "I", "W"]

[tool.ruff.lint.isort]
known-first-party = ["huggingface_hub"]
Expand Down
45 changes: 45 additions & 0 deletions src/huggingface_hub/inference/_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
# - Images are parsed as PIL.Image for easier manipulation.
# - Provides a "recommended model" for each task => suboptimal but user-wise quicker to get a first script running.
# - Only the main parameters are publicly exposed. Power users can always read the docs for more options.
import base64
import logging
import time
import warnings
Expand Down Expand Up @@ -78,6 +79,7 @@
raise_text_generation_error,
)
from huggingface_hub.inference._types import (
AudioToAudioOutput,
ClassificationOutput,
ConversationalOutput,
FillMaskOutput,
Expand Down Expand Up @@ -299,6 +301,49 @@ def audio_classification(
response = self.post(data=audio, model=model, task="audio-classification")
return _bytes_to_list(response)

def audio_to_audio(
    self,
    audio: ContentT,
    *,
    model: Optional[str] = None,
) -> List[AudioToAudioOutput]:
    """
    Perform an audio-to-audio task (e.g. speech enhancement, source separation); the exact task depends on the model.

    Args:
        audio (Union[str, Path, bytes, BinaryIO]):
            The audio content for the model. It can be raw audio bytes, a local audio file, or a URL pointing to an
            audio file.
        model (`str`, *optional*):
            The model can be any model which takes an audio file and returns another audio file. Can be a model ID
            hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. If not provided, the default
            recommended model for audio_to_audio will be used.

    Returns:
        `List[Dict]`: A list of dictionaries, each holding the audio's label, its content-type, and the decoded
        audio bytes under the "blob" key.

    Raises:
        `InferenceTimeoutError`:
            If the model is unavailable or the request times out.
        `HTTPError`:
            If the request fails with an HTTP error status code other than HTTP 503.

    Example:
    ```py
    >>> from huggingface_hub import InferenceClient
    >>> client = InferenceClient()
    >>> audio_output = client.audio_to_audio("audio.flac")
    >>> for i, item in enumerate(audio_output):
    ...     with open(f"output_{i}.flac", "wb") as f:
    ...         f.write(item["blob"])
    ```
    """
    raw_response = self.post(data=audio, model=model, task="audio-to-audio")
    outputs = _bytes_to_list(raw_response)
    # The server sends the audio content base64-encoded inside "blob"; decode it
    # in place so callers receive raw audio bytes they can write straight to disk.
    for output in outputs:
        output["blob"] = base64.b64decode(output["blob"])
    return outputs

def automatic_speech_recognition(
self,
audio: ContentT,
Expand Down
46 changes: 46 additions & 0 deletions src/huggingface_hub/inference/_generated/_async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
# To re-generate the code, run `make style` or `python ./utils/generate_async_inference_client.py --update`.
# WARNING
import asyncio
import base64
import logging
import time
import warnings
Expand Down Expand Up @@ -63,6 +64,7 @@
raise_text_generation_error,
)
from huggingface_hub.inference._types import (
AudioToAudioOutput,
ClassificationOutput,
ConversationalOutput,
FillMaskOutput,
Expand Down Expand Up @@ -295,6 +297,50 @@ async def audio_classification(
response = await self.post(data=audio, model=model, task="audio-classification")
return _bytes_to_list(response)

async def audio_to_audio(
    self,
    audio: ContentT,
    *,
    model: Optional[str] = None,
) -> List[AudioToAudioOutput]:
    """
    Performs multiple tasks related to audio-to-audio depending on the model (eg: speech enhancement, source separation).

    Args:
        audio (Union[str, Path, bytes, BinaryIO]):
            The audio content for the model. It can be raw audio bytes, a local audio file, or a URL pointing to an
            audio file.
        model (`str`, *optional*):
            The model can be any model which takes an audio file and returns another audio file. Can be a model ID
            hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. If not provided, the default
            recommended model for audio_to_audio will be used.

    Returns:
        `List[Dict]`: A list of dictionaries, each holding the audio's label, its content-type, and the decoded
        audio bytes under the "blob" key.

    Raises:
        `InferenceTimeoutError`:
            If the model is unavailable or the request times out.
        `aiohttp.ClientResponseError`:
            If the request fails with an HTTP error status code other than HTTP 503.

    Example:
    ```py
    # Must be run in an async context
    >>> from huggingface_hub import AsyncInferenceClient
    >>> client = AsyncInferenceClient()
    >>> audio_output = await client.audio_to_audio("audio.flac")
    >>> for i, item in enumerate(audio_output):
    ...     with open(f"output_{i}.flac", "wb") as f:
    ...         f.write(item["blob"])
    ```
    """
    # Fix vs. previous docstring: the example used `async for i, item in enumerate(...)`,
    # but `audio_output` is a plain list and `enumerate` yields a *synchronous* iterator,
    # so `async for` would raise TypeError. A regular `for` loop is correct here.
    response = await self.post(data=audio, model=model, task="audio-to-audio")
    audio_output = _bytes_to_list(response)
    # The server sends the audio content base64-encoded inside "blob"; decode it
    # in place so callers receive raw audio bytes.
    for item in audio_output:
        item["blob"] = base64.b64decode(item["blob"])
    return audio_output

async def automatic_speech_recognition(
self,
audio: ContentT,
Expand Down
17 changes: 17 additions & 0 deletions src/huggingface_hub/inference/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,23 @@
from PIL import Image


class AudioToAudioOutput(TypedDict):
    """Dictionary containing the output of a [`~InferenceClient.audio_to_audio`] task.

    Args:
        label (`str`):
            The label of the audio file.
        content_type (`str`):
            The content type of the audio file.
        blob (`bytes`):
            The audio file in byte format.
    """

    # Fix vs. previous docstring: it documented the key as `content-type` (hyphen),
    # which does not match the declared field `content_type` below.
    # NOTE(review): the raw API response may actually use the hyphenated key
    # "content-type" — verify against the server payload before relying on this field.
    label: str
    content_type: str
    blob: bytes


class ClassificationOutput(TypedDict):
"""Dictionary containing the output of a [`~InferenceClient.audio_classification`] and [`~InferenceClient.image_classification`] task.
Expand Down
Loading

0 comments on commit ab935a4

Please sign in to comment.