-
Notifications
You must be signed in to change notification settings - Fork 28.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Text-To-Speech pipeline #24952
Add Text-To-Speech pipeline #24952
Changes from 6 commits
dc69e3a
45bb44c
7fb8260
b028383
59aa1f5
7360555
c2eae25
57ea0b3
17d618c
6c80743
b43a1f2
01abc20
7f96a84
d6f0013
7e14127
71f9948
72142ea
c8d8a7b
59e9249
29f7dfb
c642d5f
df8dd60
a93f19c
9c5ffc0
179e5c2
a0de465
f6f3702
71b7832
62a53a9
a67632a
9948d5b
2d7154a
9ec61fa
c906e42
72d606f
ed7ad07
0c5d4de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -30,7 +30,9 @@ | |||
from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor | ||||
from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor | ||||
from ..models.auto.modeling_auto import AutoModelForDepthEstimation | ||||
from ..models.auto.processing_auto import PROCESSOR_MAPPING, AutoProcessor | ||||
from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer | ||||
from ..processing_utils import ProcessorMixin | ||||
from ..tokenization_utils import PreTrainedTokenizer | ||||
from ..utils import ( | ||||
HUGGINGFACE_CO_RESOLVE_ENDPOINT, | ||||
|
@@ -70,6 +72,7 @@ | |||
from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline | ||||
from .text_classification import TextClassificationPipeline | ||||
from .text_generation import TextGenerationPipeline | ||||
from .text_to_speech import TextToSpeechPipeline | ||||
from .token_classification import ( | ||||
AggregationStrategy, | ||||
NerPipeline, | ||||
|
@@ -133,6 +136,7 @@ | |||
AutoModelForSequenceClassification, | ||||
AutoModelForSpeechSeq2Seq, | ||||
AutoModelForTableQuestionAnswering, | ||||
AutoModelForTextToSpeech, | ||||
AutoModelForTokenClassification, | ||||
AutoModelForVideoClassification, | ||||
AutoModelForVision2Seq, | ||||
|
@@ -156,6 +160,7 @@ | |||
"sentiment-analysis": "text-classification", | ||||
"ner": "token-classification", | ||||
"vqa": "visual-question-answering", | ||||
"text-to-audio": "text-to-speech", | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The pipeline can also be used as a text-to-audio pipeline! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would actually put it the other way, That way when the audio procuced becomes music it still works. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @osanseviero FYI do we already have a task name for this ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will put it the other way around! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So far we've said |
||||
} | ||||
SUPPORTED_TASKS = { | ||||
"audio-classification": { | ||||
|
@@ -172,6 +177,13 @@ | |||
"default": {"model": {"pt": ("facebook/wav2vec2-base-960h", "55bb623")}}, | ||||
"type": "multimodal", | ||||
}, | ||||
"text-to-speech": { | ||||
"impl": TextToSpeechPipeline, | ||||
"tf": (), | ||||
"pt": (AutoModelForTextToSpeech,) if is_torch_available() else (), | ||||
"default": {"model": {"pt": ("suno/bark-small", "645cfba")}}, | ||||
"type": "processor", | ||||
}, | ||||
"feature-extraction": { | ||||
"impl": FeatureExtractionPipeline, | ||||
"tf": (TFAutoModel,) if is_tf_available() else (), | ||||
|
@@ -398,6 +410,7 @@ | |||
NO_FEATURE_EXTRACTOR_TASKS = set() | ||||
NO_IMAGE_PROCESSOR_TASKS = set() | ||||
NO_TOKENIZER_TASKS = set() | ||||
NO_PROCESSOR_TASKS = set() | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Pipelines are never using
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The Neither prepare the official speaker embeddings, which is only done with the So the design hurdle here is figuring out how we can prepare the speaker embeddings within the pipeline without the |
||||
# Those model configs are special, they are generic over their task, meaning | ||||
# any tokenizer/feature_extractor might be use for a given model so we cannot | ||||
# use the statically defined TOKENIZER_MAPPING and FEATURE_EXTRACTOR_MAPPING to | ||||
|
@@ -407,11 +420,18 @@ | |||
if values["type"] == "text": | ||||
NO_FEATURE_EXTRACTOR_TASKS.add(task) | ||||
NO_IMAGE_PROCESSOR_TASKS.add(task) | ||||
NO_PROCESSOR_TASKS.add(task) | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||
elif values["type"] in {"image", "video"}: | ||||
NO_TOKENIZER_TASKS.add(task) | ||||
NO_PROCESSOR_TASKS.add(task) | ||||
elif values["type"] in {"audio"}: | ||||
NO_TOKENIZER_TASKS.add(task) | ||||
NO_IMAGE_PROCESSOR_TASKS.add(task) | ||||
NO_PROCESSOR_TASKS.add(task) | ||||
elif values["type"] in {"processor"}: | ||||
NO_FEATURE_EXTRACTOR_TASKS.add(task) | ||||
NO_IMAGE_PROCESSOR_TASKS.add(task) | ||||
NO_TOKENIZER_TASKS.add(task) | ||||
elif values["type"] != "multimodal": | ||||
raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}") | ||||
|
||||
|
@@ -468,6 +488,7 @@ def check_task(task: str) -> Tuple[str, Dict, Any]: | |||
- `"text2text-generation"` | ||||
- `"text-classification"` (alias `"sentiment-analysis"` available) | ||||
- `"text-generation"` | ||||
- `"text-to-speech"` (alias `"text-to-audio"` available) | ||||
- `"token-classification"` (alias `"ner"` available) | ||||
- `"translation"` | ||||
- `"translation_xx_to_yy"` | ||||
|
@@ -510,6 +531,7 @@ def pipeline( | |||
tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, | ||||
feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, | ||||
image_processor: Optional[Union[str, BaseImageProcessor]] = None, | ||||
processor: Optional[Union[str, ProcessorMixin]] = None, | ||||
framework: Optional[str] = None, | ||||
revision: Optional[str] = None, | ||||
use_fast: bool = True, | ||||
|
@@ -554,6 +576,7 @@ def pipeline( | |||
- `"text-classification"` (alias `"sentiment-analysis"` available): will return a | ||||
[`TextClassificationPipeline`]. | ||||
- `"text-generation"`: will return a [`TextGenerationPipeline`]:. | ||||
- `"text-to-speech"` (alias `"text-to-audio"` available): will return a [`TextToSpeechPipeline`]:. | ||||
- `"token-classification"` (alias `"ner"` available): will return a [`TokenClassificationPipeline`]. | ||||
- `"translation"`: will return a [`TranslationPipeline`]. | ||||
- `"translation_xx_to_yy"`: will return a [`TranslationPipeline`]. | ||||
|
@@ -800,6 +823,7 @@ def pipeline( | |||
load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None | ||||
load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None | ||||
load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None | ||||
load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||
|
||||
# If `model` (instance of `PretrainedModel` instead of `str`) is passed (and/or same for config), while | ||||
# `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some | ||||
|
@@ -855,6 +879,8 @@ def pipeline( | |||
load_feature_extractor = False | ||||
if task in NO_IMAGE_PROCESSOR_TASKS: | ||||
load_image_processor = False | ||||
if task in NO_PROCESSOR_TASKS: | ||||
load_processor = False | ||||
|
||||
if load_tokenizer: | ||||
# Try to infer tokenizer from model or config name (if provided as str) | ||||
|
@@ -960,6 +986,35 @@ def pipeline( | |||
if not is_pyctcdecode_available(): | ||||
logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode") | ||||
|
||||
if load_processor: | ||||
# Try to infer processor from model or config name (if provided as str) | ||||
if processor is None: | ||||
if isinstance(model_name, str): | ||||
processor = model_name | ||||
elif isinstance(config, str): | ||||
processor = config | ||||
else: | ||||
# Impossible to guess what is the right processor here | ||||
raise Exception( | ||||
"Impossible to guess which processor to use. " | ||||
"Please provide a ProcessorMixin class or a path/identifier to a pretrained processor." | ||||
) | ||||
|
||||
# Instantiate processor if needed | ||||
if isinstance(processor, (str, tuple)): | ||||
if isinstance(processor, tuple): | ||||
# For tuple we have (processor name, {kwargs}) | ||||
processor_identifier = processor[0] | ||||
processor_kwargs = processor[1] | ||||
else: | ||||
processor_identifier = processor | ||||
processor_kwargs = model_kwargs.copy() | ||||
processor_kwargs.pop("torch_dtype", None) | ||||
|
||||
processor = AutoProcessor.from_pretrained( | ||||
processor_identifier, _from_pipeline=task, **hub_kwargs, **processor_kwargs | ||||
) | ||||
|
||||
if task == "translation" and model.config.task_specific_params: | ||||
for key in model.config.task_specific_params: | ||||
if key.startswith("translation"): | ||||
|
@@ -982,6 +1037,9 @@ def pipeline( | |||
if image_processor is not None: | ||||
kwargs["image_processor"] = image_processor | ||||
|
||||
if processor is not None: | ||||
kwargs["processor"] = processor | ||||
|
||||
if device is not None: | ||||
kwargs["device"] = device | ||||
|
||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could add
MusicGen
if its processor is added toPROCESSOR_MAPPING
.