Skip to content

Commit

Permalink
Merge branch 'main' of github.com:huggingface/huggingface_hub into hf…
Browse files Browse the repository at this point in the history
…fs-url
  • Loading branch information
mariosasko committed Feb 14, 2024
2 parents 582ebff + 0c272d5 commit ab935a4
Show file tree
Hide file tree
Showing 7 changed files with 1,833 additions and 3 deletions.
1 change: 1 addition & 0 deletions docs/source/en/guides/inference.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ has a simple API that supports the most common tasks. Here is a list of the curr
| Domain | Task | Supported | Documentation |
|--------|--------------------------------|--------------|------------------------------------|
| Audio | [Audio Classification](https://huggingface.co/tasks/audio-classification) | ✅ | [`~InferenceClient.audio_classification`] |
| Audio | [Audio-to-Audio](https://huggingface.co/tasks/audio-to-audio) | ✅ | [`~InferenceClient.audio_to_audio`] |
| | [Automatic Speech Recognition](https://huggingface.co/tasks/automatic-speech-recognition) | ✅ | [`~InferenceClient.automatic_speech_recognition`] |
| | [Text-to-Speech](https://huggingface.co/tasks/text-to-speech) | ✅ | [`~InferenceClient.text_to_speech`] |
| Computer Vision | [Image Classification](https://huggingface.co/tasks/image-classification) | ✅ | [`~InferenceClient.image_classification`] |
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,11 @@ exclude = [
"dist",
"venv",
]
line-length = 119
# Ignored rules:
# "E501" -> line length violation
ignore = ["E501"]
line-length = 119
select = ["E", "F", "I", "W"]
lint.ignore = ["E501"]
lint.select = ["E", "F", "I", "W"]

[tool.ruff.lint.isort]
known-first-party = ["huggingface_hub"]
Expand Down
45 changes: 45 additions & 0 deletions src/huggingface_hub/inference/_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
# - Images are parsed as PIL.Image for easier manipulation.
# - Provides a "recommended model" for each task => suboptimal but user-wise quicker to get a first script running.
# - Only the main parameters are publicly exposed. Power users can always read the docs for more options.
import base64
import logging
import time
import warnings
Expand Down Expand Up @@ -78,6 +79,7 @@
raise_text_generation_error,
)
from huggingface_hub.inference._types import (
AudioToAudioOutput,
ClassificationOutput,
ConversationalOutput,
FillMaskOutput,
Expand Down Expand Up @@ -299,6 +301,49 @@ def audio_classification(
response = self.post(data=audio, model=model, task="audio-classification")
return _bytes_to_list(response)

def audio_to_audio(
    self,
    audio: ContentT,
    *,
    model: Optional[str] = None,
) -> List[AudioToAudioOutput]:
    """
    Perform an audio-to-audio task (e.g. speech enhancement, source separation); the exact task depends on the model.

    Args:
        audio (Union[str, Path, bytes, BinaryIO]):
            The audio content for the model. It can be raw audio bytes, a local audio file, or a URL pointing to an
            audio file.
        model (`str`, *optional*):
            The model can be any model which takes an audio file and returns another audio file. Can be a model ID
            hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. If not provided, the default
            recommended model for audio_to_audio will be used.

    Returns:
        `List[Dict]`: A list of dictionaries, each holding the audio's label, its content-type, and the decoded
        audio bytes under the "blob" key.

    Raises:
        `InferenceTimeoutError`:
            If the model is unavailable or the request times out.
        `HTTPError`:
            If the request fails with an HTTP error status code other than HTTP 503.

    Example:
    ```py
    >>> from huggingface_hub import InferenceClient
    >>> client = InferenceClient()
    >>> audio_output = client.audio_to_audio("audio.flac")
    >>> for i, item in enumerate(audio_output):
    ...     with open(f"output_{i}.flac", "wb") as f:
    ...         f.write(item["blob"])
    ```
    """
    raw_response = self.post(data=audio, model=model, task="audio-to-audio")
    outputs = _bytes_to_list(raw_response)
    # The server sends the audio content base64-encoded inside "blob"; decode it
    # in place so callers receive raw audio bytes they can write straight to disk.
    for output in outputs:
        output["blob"] = base64.b64decode(output["blob"])
    return outputs

def automatic_speech_recognition(
self,
audio: ContentT,
Expand Down
46 changes: 46 additions & 0 deletions src/huggingface_hub/inference/_generated/_async_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
# To re-generate the code, run `make style` or `python ./utils/generate_async_inference_client.py --update`.
# WARNING
import asyncio
import base64
import logging
import time
import warnings
Expand Down Expand Up @@ -63,6 +64,7 @@
raise_text_generation_error,
)
from huggingface_hub.inference._types import (
AudioToAudioOutput,
ClassificationOutput,
ConversationalOutput,
FillMaskOutput,
Expand Down Expand Up @@ -295,6 +297,50 @@ async def audio_classification(
response = await self.post(data=audio, model=model, task="audio-classification")
return _bytes_to_list(response)

async def audio_to_audio(
    self,
    audio: ContentT,
    *,
    model: Optional[str] = None,
) -> List[AudioToAudioOutput]:
    """
    Performs multiple tasks related to audio-to-audio depending on the model (eg: speech enhancement, source separation).

    Args:
        audio (Union[str, Path, bytes, BinaryIO]):
            The audio content for the model. It can be raw audio bytes, a local audio file, or a URL pointing to an
            audio file.
        model (`str`, *optional*):
            The model can be any model which takes an audio file and returns another audio file. Can be a model ID
            hosted on the Hugging Face Hub or a URL to a deployed Inference Endpoint. If not provided, the default
            recommended model for audio_to_audio will be used.

    Returns:
        `List[Dict]`: A list of dictionaries, each holding the audio's label, its content-type, and the decoded
        audio bytes under the "blob" key.

    Raises:
        `InferenceTimeoutError`:
            If the model is unavailable or the request times out.
        `aiohttp.ClientResponseError`:
            If the request fails with an HTTP error status code other than HTTP 503.

    Example:
    ```py
    # Must be run in an async context
    >>> from huggingface_hub import AsyncInferenceClient
    >>> client = AsyncInferenceClient()
    >>> audio_output = await client.audio_to_audio("audio.flac")
    >>> for i, item in enumerate(audio_output):
    ...     with open(f"output_{i}.flac", "wb") as f:
    ...         f.write(item["blob"])
    ```
    """
    # Fix vs. previous docstring: the example used `async for i, item in enumerate(...)`,
    # but `audio_output` is a plain list and `enumerate` yields a *synchronous* iterator,
    # so `async for` would raise TypeError. A regular `for` loop is correct here.
    response = await self.post(data=audio, model=model, task="audio-to-audio")
    audio_output = _bytes_to_list(response)
    # The server sends the audio content base64-encoded inside "blob"; decode it
    # in place so callers receive raw audio bytes.
    for item in audio_output:
        item["blob"] = base64.b64decode(item["blob"])
    return audio_output

async def automatic_speech_recognition(
self,
audio: ContentT,
Expand Down
17 changes: 17 additions & 0 deletions src/huggingface_hub/inference/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,23 @@
from PIL import Image


class AudioToAudioOutput(TypedDict):
    """Dictionary containing the output of a [`~InferenceClient.audio_to_audio`] task.

    Args:
        label (`str`):
            The label of the audio file.
        content_type (`str`):
            The content type of the audio file.
        blob (`bytes`):
            The audio file in byte format.
    """

    # Fix vs. previous docstring: it documented the key as `content-type` (hyphen),
    # which does not match the declared field `content_type` below.
    # NOTE(review): the raw API response may actually use the hyphenated key
    # "content-type" — verify against the server payload before relying on this field.
    label: str
    content_type: str
    blob: bytes


class ClassificationOutput(TypedDict):
"""Dictionary containing the output of a [`~InferenceClient.audio_classification`] and [`~InferenceClient.image_classification`] task.
Expand Down
Loading

0 comments on commit ab935a4

Please sign in to comment.