Add Text-To-Speech pipeline #24952

Merged: 37 commits from add_tts_pipeline into huggingface:main on Aug 17, 2023.
The diff below shows the changes from 3 of the 37 commits.

Commits:
- dc69e3a: add AutoModelForTextToSpeech class (ylacombe, Jul 19, 2023)
- 45bb44c: add TTS pipeline and testing (ylacombe, Jul 20, 2023)
- 7fb8260: add docstrings to text_to_speech pipeline (ylacombe, Jul 20, 2023)
- b028383: fix torch dependency (ylacombe, Jul 20, 2023)
- 59aa1f5: correct 'processor is None' case in Pipeline (ylacombe, Jul 20, 2023)
- 7360555: correct repo id (ylacombe, Jul 20, 2023)
- c2eae25: Merge branch 'huggingface:main' into add_tts_pipeline (ylacombe, Jul 21, 2023)
- 57ea0b3: modify text-to-speech -> text-to-audio (ylacombe, Jul 21, 2023)
- 17d618c: remove processor (ylacombe, Jul 21, 2023)
- 6c80743: rename text_to_speech pipelines files to text_audio (ylacombe, Jul 21, 2023)
- b43a1f2: add textToWaveform and textToSpectrogram instead of textToAudio classes (ylacombe, Jul 25, 2023)
- 01abc20: update TTS pipeline to the bare minimum (ylacombe, Aug 1, 2023)
- 7f96a84: update tests TTS pipeline (ylacombe, Aug 1, 2023)
- d6f0013: make style and erase useless import torch in TTS pipeline tests (ylacombe, Aug 1, 2023)
- 7e14127: modify how to check if generate or forward in TTS pipeline (ylacombe, Aug 3, 2023)
- 71f9948: remove unnecessary extra new lines (ylacombe, Aug 10, 2023)
- 72142ea: Apply suggestions from code review (ylacombe, Aug 10, 2023)
- c8d8a7b: refactor input_texts -> text_inputs (ylacombe, Aug 10, 2023)
- 59e9249: correct docstrings of TTS.__call__ (ylacombe, Aug 10, 2023)
- 29f7dfb: correct the shape of generated waveform (ylacombe, Aug 10, 2023)
- c642d5f: take care of Bark tokenizer special case (ylacombe, Aug 10, 2023)
- df8dd60: correct run_pipeline_test TTS (ylacombe, Aug 10, 2023)
- a93f19c: make style (ylacombe, Aug 10, 2023)
- 9c5ffc0: update TTS docstrings (ylacombe, Aug 10, 2023)
- 179e5c2: address Sylvain nit refactors (ylacombe, Aug 10, 2023)
- a0de465: make style (ylacombe, Aug 10, 2023)
- f6f3702: refactor into one liners (ylacombe, Aug 10, 2023)
- 71b7832: Merge branch 'huggingface:main' into add_tts_pipeline (ylacombe, Aug 11, 2023)
- 62a53a9: Merge branch 'huggingface:main' into add_tts_pipeline (ylacombe, Aug 11, 2023)
- a67632a: correct squeeze (ylacombe, Aug 11, 2023)
- 9948d5b: correct way to test if forward or generate (ylacombe, Aug 11, 2023)
- 2d7154a: Update output audio waveform shape (ylacombe, Aug 11, 2023)
- 9ec61fa: make style (ylacombe, Aug 11, 2023)
- c906e42: Merge branch 'huggingface:main' into add_tts_pipeline (ylacombe, Aug 14, 2023)
- 72d606f: correct import (ylacombe, Aug 14, 2023)
- ed7ad07: modify how the TTS pipeline tests if a model can generate (ylacombe, Aug 14, 2023)
- 0c5d4de: align shape output of TTS pipeline with consistent shape (ylacombe, Aug 15, 2023)
7 changes: 7 additions & 0 deletions docs/source/en/main_classes/pipelines.md
@@ -318,6 +318,13 @@ Pipelines available for audio tasks include the following.
- __call__
- all

### TextToSpeechPipeline

[[autodoc]] TextToSpeechPipeline
- __call__
- all


### ZeroShotAudioClassificationPipeline

[[autodoc]] ZeroShotAudioClassificationPipeline
4 changes: 4 additions & 0 deletions src/transformers/__init__.py

@@ -641,6 +641,7 @@
"Text2TextGenerationPipeline",
"TextClassificationPipeline",
"TextGenerationPipeline",
"TextToSpeechPipeline",
"TokenClassificationPipeline",
"TranslationPipeline",
"VideoClassificationPipeline",
@@ -1092,6 +1093,7 @@
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_TEXT_ENCODING_MAPPING",
"MODEL_FOR_TEXT_TO_SPEECH_MAPPING",
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
@@ -4566,6 +4568,7 @@
Text2TextGenerationPipeline,
TextClassificationPipeline,
TextGenerationPipeline,
TextToSpeechPipeline,
TokenClassificationPipeline,
TranslationPipeline,
VideoClassificationPipeline,
@@ -4965,6 +4968,7 @@
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_TEXT_ENCODING_MAPPING,
MODEL_FOR_TEXT_TO_SPEECH_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
2 changes: 2 additions & 0 deletions src/transformers/models/auto/__init__.py

@@ -65,6 +65,7 @@
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_TEXT_ENCODING_MAPPING",
"MODEL_FOR_TEXT_TO_SPEECH_MAPPING",
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
@@ -241,6 +242,7 @@
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_TEXT_ENCODING_MAPPING,
MODEL_FOR_TEXT_TO_SPEECH_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
14 changes: 14 additions & 0 deletions src/transformers/models/auto/modeling_auto.py

@@ -1002,6 +1002,14 @@
]
)

MODEL_FOR_TEXT_TO_SPEECH_MAPPING_NAMES = OrderedDict(
[
# Model for Text-To-Speech mapping
("bark", "BarkModel"),
("speecht5", "SpeechT5ForTextToSpeech"),
]
)

[Review comment, ylacombe (author)]
We could add MusicGen if its processor is added to PROCESSOR_MAPPING.

MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Zero Shot Image Classification mapping
@@ -1140,6 +1148,8 @@
)
MODEL_FOR_AUDIO_XVECTOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES)

MODEL_FOR_TEXT_TO_SPEECH_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_TEXT_TO_SPEECH_MAPPING_NAMES)

MODEL_FOR_BACKBONE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_BACKBONE_MAPPING_NAMES)

MODEL_FOR_MASK_GENERATION_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASK_GENERATION_MAPPING_NAMES)
@@ -1395,6 +1405,10 @@ class AutoModelForAudioXVector(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_AUDIO_XVECTOR_MAPPING


class AutoModelForTextToSpeech(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_TEXT_TO_SPEECH_MAPPING


class AutoBackbone(_BaseAutoBackboneClass):
_model_mapping = MODEL_FOR_BACKBONE_MAPPING

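As a quick illustration of the new auto class (a sketch against this revision of the PR, not a documented API): suno/bark-small is the default checkpoint registered later in this diff, and microsoft/speecht5_tts is assumed here as a public SpeechT5 TTS checkpoint.

```python
from transformers import AutoModelForTextToSpeech

# Resolves through the ("bark", "BarkModel") entry in the mapping above.
bark = AutoModelForTextToSpeech.from_pretrained("suno/bark-small")
print(type(bark).__name__)  # BarkModel

# Resolves through the ("speecht5", "SpeechT5ForTextToSpeech") entry.
speecht5 = AutoModelForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
print(type(speecht5).__name__)  # SpeechT5ForTextToSpeech
```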
58 changes: 58 additions & 0 deletions src/transformers/pipelines/__init__.py

@@ -30,7 +30,9 @@
from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor
from ..models.auto.modeling_auto import AutoModelForDepthEstimation
from ..models.auto.processing_auto import PROCESSOR_MAPPING, AutoProcessor
from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
from ..processing_utils import ProcessorMixin
from ..tokenization_utils import PreTrainedTokenizer
from ..utils import (
HUGGINGFACE_CO_RESOLVE_ENDPOINT,
@@ -70,6 +72,7 @@
from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
from .text_classification import TextClassificationPipeline
from .text_generation import TextGenerationPipeline
from .text_to_speech import TextToSpeechPipeline
from .token_classification import (
AggregationStrategy,
NerPipeline,
@@ -133,6 +136,7 @@
AutoModelForSequenceClassification,
AutoModelForSpeechSeq2Seq,
AutoModelForTableQuestionAnswering,
AutoModelForTextToSpeech,
AutoModelForTokenClassification,
AutoModelForVideoClassification,
AutoModelForVision2Seq,
@@ -156,6 +160,7 @@
"sentiment-analysis": "text-classification",
"ner": "token-classification",
"vqa": "visual-question-answering",
"text-to-audio": "text-to-speech",
[Review comment, ylacombe (author)]
The pipeline can also be used as a text-to-audio pipeline!

[Review comment, Contributor]
I would actually put it the other way: text-to-audio is more general, which we tend to prefer. That way, when the audio produced becomes music, it still works.

[Review comment, Contributor]
@osanseviero FYI, do we already have a task name for this?

[Review comment, ylacombe (author)]
I will put it the other way around!

[Review comment, Contributor]
So far we've said text-to-speech (https://huggingface.co/models?pipeline_tag=text-to-speech&sort=trending); we don't have a more general task name.

}
SUPPORTED_TASKS = {
"audio-classification": {
@@ -172,6 +177,13 @@
"default": {"model": {"pt": ("facebook/wav2vec2-base-960h", "55bb623")}},
"type": "multimodal",
},
"text-to-speech": {
"impl": TextToSpeechPipeline,
"tf": (),
"pt": (AutoModelForTextToSpeech,) if is_torch_available() else (),
"default": {"model": {"pt": ("suno/bark-small", "645cfba")}},
"type": "processor",
},
"feature-extraction": {
"impl": FeatureExtractionPipeline,
"tf": (TFAutoModel,) if is_tf_available() else (),
@@ -398,6 +410,7 @@
NO_FEATURE_EXTRACTOR_TASKS = set()
NO_IMAGE_PROCESSOR_TASKS = set()
NO_TOKENIZER_TASKS = set()
NO_PROCESSOR_TASKS = set()
[Review comment, Contributor]
Suggested change (delete this line):
    NO_PROCESSOR_TASKS = set()

[Review comment, Contributor]
Pipelines never use a processor, and they cannot. Processors are handy tools, but they just do too much and have no stable API: you can send text, images, audio and whatnot, with no way for the caller to know the correct contract (and inspecting the signature is kind of a sin).

FeatureExtractor and Tokenizer should be enough, I think.

[Review comment, sanchit-gandhi (Contributor), Jul 24, 2023]
> FeatureExtractor and Tokenizer should be enough I think

The Tokenizer is required for all models, to pre-process the text inputs into input ids. The FeatureExtractor converts an audio prompt to a log-mel spectrogram, so it is only used for particular models (not Bark, for instance).

Neither prepares the official speaker embeddings; that is only done by the processor (as discussed in the Bark PR).

So the design hurdle here is figuring out how we can prepare the speaker embeddings within the pipeline without the processor class. I feel quite strongly that the pipeline should be able to handle any speaker embeddings internally: if the user has to prepare the speaker embeddings outside of the pipeline, the complexity of using the API is more or less the same as using the model + processor, so there's no real point in switching to the pipeline.
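One way to make that hurdle concrete: speaker embeddings could be prepared pipeline-side without any Processor class, along these lines. This is a sketch, not this PR's API; the function, the kwarg name, and the zero tensor standing in for a real x-vector are all illustrative (SpeechT5 expects 512-dimensional speaker embeddings).

```python
from typing import Optional

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/speecht5_tts")

def preprocess(text: str, speaker_embeddings: Optional[torch.Tensor] = None) -> dict:
    """Tokenize the text and attach optional speaker embeddings, pipeline-side."""
    inputs = dict(tokenizer(text, return_tensors="pt"))
    if speaker_embeddings is not None:
        inputs["speaker_embeddings"] = speaker_embeddings
    return inputs

# Stand-in for a real x-vector (e.g. a row of the cmu-arctic-xvectors dataset).
batch = preprocess("Hello, my dog is cute", torch.zeros(1, 512))
```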

# Those model configs are special, they are generic over their task, meaning
# any tokenizer/feature_extractor might be use for a given model so we cannot
# use the statically defined TOKENIZER_MAPPING and FEATURE_EXTRACTOR_MAPPING to
@@ -407,11 +420,18 @@
if values["type"] == "text":
NO_FEATURE_EXTRACTOR_TASKS.add(task)
NO_IMAGE_PROCESSOR_TASKS.add(task)
NO_PROCESSOR_TASKS.add(task)
[Review comment, Contributor]
Suggested change (delete this line):
            NO_PROCESSOR_TASKS.add(task)
elif values["type"] in {"image", "video"}:
NO_TOKENIZER_TASKS.add(task)
NO_PROCESSOR_TASKS.add(task)
elif values["type"] in {"audio"}:
NO_TOKENIZER_TASKS.add(task)
NO_IMAGE_PROCESSOR_TASKS.add(task)
NO_PROCESSOR_TASKS.add(task)
elif values["type"] in {"processor"}:
NO_FEATURE_EXTRACTOR_TASKS.add(task)
NO_IMAGE_PROCESSOR_TASKS.add(task)
NO_TOKENIZER_TASKS.add(task)
elif values["type"] != "multimodal":
raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}")

@@ -468,6 +488,7 @@ def check_task(task: str) -> Tuple[str, Dict, Any]:
- `"text2text-generation"`
- `"text-classification"` (alias `"sentiment-analysis"` available)
- `"text-generation"`
- `"text-to-speech"` (alias `"text-to-audio"` available)
- `"token-classification"` (alias `"ner"` available)
- `"translation"`
- `"translation_xx_to_yy"`
@@ -510,6 +531,7 @@ def pipeline(
tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None,
feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None,
image_processor: Optional[Union[str, BaseImageProcessor]] = None,
processor: Optional[Union[str, ProcessorMixin]] = None,
framework: Optional[str] = None,
revision: Optional[str] = None,
use_fast: bool = True,
@@ -554,6 +576,7 @@
- `"text-classification"` (alias `"sentiment-analysis"` available): will return a
[`TextClassificationPipeline`].
- `"text-generation"`: will return a [`TextGenerationPipeline`]:.
- `"text-to-speech"` (alias `"text-to-audio"` available): will return a [`TextToSpeechPipeline`]:.
- `"token-classification"` (alias `"ner"` available): will return a [`TokenClassificationPipeline`].
- `"translation"`: will return a [`TranslationPipeline`].
- `"translation_xx_to_yy"`: will return a [`TranslationPipeline`].
@@ -800,6 +823,7 @@ def pipeline(
load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None
load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None
[Review comment, Contributor]
Suggested change (delete this line):
    load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None

# If `model` (instance of `PretrainedModel` instead of `str`) is passed (and/or same for config), while
# `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some
@@ -855,6 +879,8 @@ def pipeline(
load_feature_extractor = False
if task in NO_IMAGE_PROCESSOR_TASKS:
load_image_processor = False
if task in NO_PROCESSOR_TASKS:
load_processor = False

if load_tokenizer:
# Try to infer tokenizer from model or config name (if provided as str)
@@ -960,6 +986,35 @@ def pipeline(
if not is_pyctcdecode_available():
logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode")

if load_processor:
# Try to infer processor from model or config name (if provided as str)
if processor is None:
if isinstance(model_name, str):
processor = model_name
elif isinstance(config, str):
processor = config
else:
# Impossible to guess which processor to use here
raise Exception(
"Impossible to guess which processor to use. "
"Please provide a ProcessorMixin class or a path/identifier to a pretrained processor."
)

# Instantiate processor if needed
if isinstance(processor, (str, tuple)):
if isinstance(processor, tuple):
# For tuple we have (processor name, {kwargs})
processor_identifier = processor[0]
processor_kwargs = processor[1]
else:
processor_identifier = processor
processor_kwargs = model_kwargs.copy()
processor_kwargs.pop("torch_dtype", None)

processor = AutoProcessor.from_pretrained(
processor_identifier, _from_pipeline=task, **hub_kwargs, **processor_kwargs
)

if task == "translation" and model.config.task_specific_params:
for key in model.config.task_specific_params:
if key.startswith("translation"):
@@ -982,6 +1037,9 @@
if image_processor is not None:
kwargs["image_processor"] = image_processor

if processor is not None:
kwargs["processor"] = processor

if device is not None:
kwargs["device"] = device

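At this revision, the new processor plumbing above gives two call paths; a short sketch of both, mirroring how tokenizer and feature_extractor are handled:

```python
from transformers import AutoProcessor, pipeline

# 1) Let the pipeline infer the processor from the model name, as coded above.
tts = pipeline("text-to-speech", model="suno/bark-small")

# 2) Pass a processor explicitly (preloaded here; a string identifier also works).
processor = AutoProcessor.from_pretrained("suno/bark-small")
tts = pipeline("text-to-speech", model="suno/bark-small", processor=processor)
```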
15 changes: 14 additions & 1 deletion src/transformers/pipelines/base.py

@@ -866,6 +866,9 @@ def save_pretrained(self, save_directory: str, safe_serialization: bool = False)
if self.feature_extractor is not None:
self.feature_extractor.save_pretrained(save_directory)

if self.processor is not None:
self.processor.save_pretrained(save_directory)

if self.modelcard is not None:
self.modelcard.save_pretrained(save_directory)

@@ -1048,9 +1051,19 @@ def get_iterator(
if "TOKENIZERS_PARALLELISM" not in os.environ:
logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

tokenizer = self.tokenizer

# TODO hack by collating feature_extractor and image_processor
feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)

# TODO hack if processor exists by using the processor tokenizer and/or feature extractor
if feature_extractor is None and self.processor.feature_extractor_class is not None:
feature_extractor = self.processor.feature_extractor
if tokenizer is None and self.processor.tokenizer_class is not None:
tokenizer = self.processor.tokenizer

collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(tokenizer, feature_extractor)
dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
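The processor fallback above can be exercised on its own; a sketch assuming a Bark-style processor, which wraps a tokenizer but no feature extractor:

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("suno/bark-small")

# Mirrors the hack above: pull collation helpers out of the processor when
# the pipeline holds neither a tokenizer nor a feature extractor itself.
tokenizer = processor.tokenizer if processor.tokenizer_class is not None else None
feature_extractor = (
    processor.feature_extractor if processor.feature_extractor_class is not None else None
)
print(type(tokenizer).__name__, feature_extractor)  # e.g. BertTokenizerFast None
```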