Add Text-To-Speech pipeline #24952

Merged: 37 commits from add_tts_pipeline into huggingface:main on Aug 17, 2023.
The diff below shows the changes from 3 of the 37 commits.

Commits:
- dc69e3a: add AutoModelForTextToSpeech class (ylacombe, Jul 19, 2023)
- 45bb44c: add TTS pipeline and testing (ylacombe, Jul 20, 2023)
- 7fb8260: add docstrings to text_to_speech pipeline (ylacombe, Jul 20, 2023)
- b028383: fix torch dependency (ylacombe, Jul 20, 2023)
- 59aa1f5: correct 'processor is None' case in Pipeline (ylacombe, Jul 20, 2023)
- 7360555: correct repo id (ylacombe, Jul 20, 2023)
- c2eae25: Merge branch 'huggingface:main' into add_tts_pipeline (ylacombe, Jul 21, 2023)
- 57ea0b3: modify text-to-speech -> text-to-audio (ylacombe, Jul 21, 2023)
- 17d618c: remove processor (ylacombe, Jul 21, 2023)
- 6c80743: rename text_to_speech pipelines files to text_audio (ylacombe, Jul 21, 2023)
- b43a1f2: add textToWaveform and textToSpectrogram instead of textToAudio classes (ylacombe, Jul 25, 2023)
- 01abc20: update TTS pipeline to the bare minimum (ylacombe, Aug 1, 2023)
- 7f96a84: update tests TTS pipeline (ylacombe, Aug 1, 2023)
- d6f0013: make style and erase useless import torch in TTS pipeline tests (ylacombe, Aug 1, 2023)
- 7e14127: modify how to check if generate or forward in TTS pipeline (ylacombe, Aug 3, 2023)
- 71f9948: remove unnecessary extra new lines (ylacombe, Aug 10, 2023)
- 72142ea: Apply suggestions from code review (ylacombe, Aug 10, 2023)
- c8d8a7b: refactor input_texts -> text_inputs (ylacombe, Aug 10, 2023)
- 59e9249: correct docstrings of TTS.__call__ (ylacombe, Aug 10, 2023)
- 29f7dfb: correct the shape of generated waveform (ylacombe, Aug 10, 2023)
- c642d5f: take care of Bark tokenizer special case (ylacombe, Aug 10, 2023)
- df8dd60: correct run_pipeline_test TTS (ylacombe, Aug 10, 2023)
- a93f19c: make style (ylacombe, Aug 10, 2023)
- 9c5ffc0: update TTS docstrings (ylacombe, Aug 10, 2023)
- 179e5c2: address Sylvain nit refactors (ylacombe, Aug 10, 2023)
- a0de465: make style (ylacombe, Aug 10, 2023)
- f6f3702: refactor into one liners (ylacombe, Aug 10, 2023)
- 71b7832: Merge branch 'huggingface:main' into add_tts_pipeline (ylacombe, Aug 11, 2023)
- 62a53a9: Merge branch 'huggingface:main' into add_tts_pipeline (ylacombe, Aug 11, 2023)
- a67632a: correct squeeze (ylacombe, Aug 11, 2023)
- 9948d5b: correct way to test if forward or generate (ylacombe, Aug 11, 2023)
- 2d7154a: Update output audio waveform shape (ylacombe, Aug 11, 2023)
- 9ec61fa: make style (ylacombe, Aug 11, 2023)
- c906e42: Merge branch 'huggingface:main' into add_tts_pipeline (ylacombe, Aug 14, 2023)
- 72d606f: correct import (ylacombe, Aug 14, 2023)
- ed7ad07: modify how the TTS pipeline tests if a model can generate (ylacombe, Aug 14, 2023)
- 0c5d4de: align shape output of TTS pipeline with consistent shape (ylacombe, Aug 15, 2023)
7 changes: 7 additions & 0 deletions docs/source/en/main_classes/pipelines.md
@@ -318,6 +318,13 @@ Pipelines available for audio tasks include the following.
- __call__
- all

### TextToSpeechPipeline

[[autodoc]] TextToSpeechPipeline
- __call__
- all


### ZeroShotAudioClassificationPipeline

[[autodoc]] ZeroShotAudioClassificationPipeline
4 changes: 4 additions & 0 deletions src/transformers/__init__.py

@@ -641,6 +641,7 @@
"Text2TextGenerationPipeline",
"TextClassificationPipeline",
"TextGenerationPipeline",
"TextToSpeechPipeline",
"TokenClassificationPipeline",
"TranslationPipeline",
"VideoClassificationPipeline",
@@ -1092,6 +1093,7 @@
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_TEXT_ENCODING_MAPPING",
"MODEL_FOR_TEXT_TO_SPEECH_MAPPING",
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
@@ -4566,6 +4568,7 @@
Text2TextGenerationPipeline,
TextClassificationPipeline,
TextGenerationPipeline,
TextToSpeechPipeline,
TokenClassificationPipeline,
TranslationPipeline,
VideoClassificationPipeline,
@@ -4965,6 +4968,7 @@
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_TEXT_ENCODING_MAPPING,
MODEL_FOR_TEXT_TO_SPEECH_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
2 changes: 2 additions & 0 deletions src/transformers/models/auto/__init__.py

@@ -65,6 +65,7 @@
"MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING",
"MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
"MODEL_FOR_TEXT_ENCODING_MAPPING",
"MODEL_FOR_TEXT_TO_SPEECH_MAPPING",
"MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
@@ -241,6 +242,7 @@
MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING,
MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
MODEL_FOR_TEXT_ENCODING_MAPPING,
MODEL_FOR_TEXT_TO_SPEECH_MAPPING,
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
14 changes: 14 additions & 0 deletions src/transformers/models/auto/modeling_auto.py

@@ -1002,6 +1002,14 @@
]
)

MODEL_FOR_TEXT_TO_SPEECH_MAPPING_NAMES = OrderedDict(
[
# Model for Text-To-Speech mapping
("bark", "BarkModel"),
("speecht5", "SpeechT5ForTextToSpeech"),
]
)

[Review comment, ylacombe (author)]
We could add MusicGen if its processor is added to PROCESSOR_MAPPING.

MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Zero Shot Image Classification mapping
@@ -1140,6 +1148,8 @@
)
MODEL_FOR_AUDIO_XVECTOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES)

MODEL_FOR_TEXT_TO_SPEECH_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_TEXT_TO_SPEECH_MAPPING_NAMES)

MODEL_FOR_BACKBONE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_BACKBONE_MAPPING_NAMES)

MODEL_FOR_MASK_GENERATION_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASK_GENERATION_MAPPING_NAMES)
@@ -1395,6 +1405,10 @@ class AutoModelForAudioXVector(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_AUDIO_XVECTOR_MAPPING


class AutoModelForTextToSpeech(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_TEXT_TO_SPEECH_MAPPING


class AutoBackbone(_BaseAutoBackboneClass):
_model_mapping = MODEL_FOR_BACKBONE_MAPPING

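As a quick illustration of the new auto class (a sketch against this revision of the PR, not a documented API): suno/bark-small is the default checkpoint registered later in this diff, and microsoft/speecht5_tts is assumed here as a public SpeechT5 TTS checkpoint.

```python
from transformers import AutoModelForTextToSpeech

# Resolves through the ("bark", "BarkModel") entry in the mapping above.
bark = AutoModelForTextToSpeech.from_pretrained("suno/bark-small")
print(type(bark).__name__)  # BarkModel

# Resolves through the ("speecht5", "SpeechT5ForTextToSpeech") entry.
speecht5 = AutoModelForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
print(type(speecht5).__name__)  # SpeechT5ForTextToSpeech
```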
58 changes: 58 additions & 0 deletions src/transformers/pipelines/__init__.py

@@ -30,7 +30,9 @@
from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor
from ..models.auto.modeling_auto import AutoModelForDepthEstimation
from ..models.auto.processing_auto import PROCESSOR_MAPPING, AutoProcessor
from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
from ..processing_utils import ProcessorMixin
from ..tokenization_utils import PreTrainedTokenizer
from ..utils import (
HUGGINGFACE_CO_RESOLVE_ENDPOINT,
@@ -70,6 +72,7 @@
from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
from .text_classification import TextClassificationPipeline
from .text_generation import TextGenerationPipeline
from .text_to_speech import TextToSpeechPipeline
from .token_classification import (
AggregationStrategy,
NerPipeline,
@@ -133,6 +136,7 @@
AutoModelForSequenceClassification,
AutoModelForSpeechSeq2Seq,
AutoModelForTableQuestionAnswering,
AutoModelForTextToSpeech,
AutoModelForTokenClassification,
AutoModelForVideoClassification,
AutoModelForVision2Seq,
@@ -156,6 +160,7 @@
"sentiment-analysis": "text-classification",
"ner": "token-classification",
"vqa": "visual-question-answering",
"text-to-audio": "text-to-speech",
[Review comment, ylacombe (author)]
The pipeline can also be used as a text-to-audio pipeline!

[Review comment, Contributor]
I would actually put it the other way: text-to-audio is more general, which we tend to prefer. That way, when the audio produced becomes music, it still works.

[Review comment, Contributor]
@osanseviero FYI, do we already have a task name for this?

[Review comment, ylacombe (author)]
I will put it the other way around!

[Review comment, Contributor]
So far we've said text-to-speech (https://huggingface.co/models?pipeline_tag=text-to-speech&sort=trending); we don't have a more general task name.

}
SUPPORTED_TASKS = {
"audio-classification": {
@@ -172,6 +177,13 @@
"default": {"model": {"pt": ("facebook/wav2vec2-base-960h", "55bb623")}},
"type": "multimodal",
},
"text-to-speech": {
"impl": TextToSpeechPipeline,
"tf": (),
"pt": (AutoModelForTextToSpeech,) if is_torch_available() else (),
"default": {"model": {"pt": ("suno/bark-small", "645cfba")}},
"type": "processor",
},
"feature-extraction": {
"impl": FeatureExtractionPipeline,
"tf": (TFAutoModel,) if is_tf_available() else (),
@@ -398,6 +410,7 @@
NO_FEATURE_EXTRACTOR_TASKS = set()
NO_IMAGE_PROCESSOR_TASKS = set()
NO_TOKENIZER_TASKS = set()
NO_PROCESSOR_TASKS = set()
[Review comment, Contributor]
Suggested change (delete this line):
    NO_PROCESSOR_TASKS = set()

[Review comment, Contributor]
Pipelines never use a processor, and they cannot. Processors are handy tools, but they just do too much and have no stable API: you can send text, images, audio and whatnot, with no way for the caller to know the correct contract (and inspecting the signature is kind of a sin).

FeatureExtractor and Tokenizer should be enough, I think.

[Review comment, sanchit-gandhi (Contributor), Jul 24, 2023]
> FeatureExtractor and Tokenizer should be enough I think

The Tokenizer is required for all models, to pre-process the text inputs into input ids. The FeatureExtractor converts an audio prompt to a log-mel spectrogram, so it is only used for particular models (not Bark, for instance).

Neither prepares the official speaker embeddings; that is only done by the processor (as discussed in the Bark PR).

So the design hurdle here is figuring out how we can prepare the speaker embeddings within the pipeline without the processor class. I feel quite strongly that the pipeline should be able to handle any speaker embeddings internally: if the user has to prepare the speaker embeddings outside of the pipeline, the complexity of using the API is more or less the same as using the model + processor, so there's no real point in switching to the pipeline.
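One way to make that hurdle concrete: speaker embeddings could be prepared pipeline-side without any Processor class, along these lines. This is a sketch, not this PR's API; the function, the kwarg name, and the zero tensor standing in for a real x-vector are all illustrative (SpeechT5 expects 512-dimensional speaker embeddings).

```python
from typing import Optional

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/speecht5_tts")

def preprocess(text: str, speaker_embeddings: Optional[torch.Tensor] = None) -> dict:
    """Tokenize the text and attach optional speaker embeddings, pipeline-side."""
    inputs = dict(tokenizer(text, return_tensors="pt"))
    if speaker_embeddings is not None:
        inputs["speaker_embeddings"] = speaker_embeddings
    return inputs

# Stand-in for a real x-vector (e.g. a row of the cmu-arctic-xvectors dataset).
batch = preprocess("Hello, my dog is cute", torch.zeros(1, 512))
```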

# Those model configs are special, they are generic over their task, meaning
# any tokenizer/feature_extractor might be use for a given model so we cannot
# use the statically defined TOKENIZER_MAPPING and FEATURE_EXTRACTOR_MAPPING to
@@ -407,11 +420,18 @@
if values["type"] == "text":
NO_FEATURE_EXTRACTOR_TASKS.add(task)
NO_IMAGE_PROCESSOR_TASKS.add(task)
NO_PROCESSOR_TASKS.add(task)
[Review comment, Contributor]
Suggested change (delete this line):
            NO_PROCESSOR_TASKS.add(task)
elif values["type"] in {"image", "video"}:
NO_TOKENIZER_TASKS.add(task)
NO_PROCESSOR_TASKS.add(task)
elif values["type"] in {"audio"}:
NO_TOKENIZER_TASKS.add(task)
NO_IMAGE_PROCESSOR_TASKS.add(task)
NO_PROCESSOR_TASKS.add(task)
elif values["type"] in {"processor"}:
NO_FEATURE_EXTRACTOR_TASKS.add(task)
NO_IMAGE_PROCESSOR_TASKS.add(task)
NO_TOKENIZER_TASKS.add(task)
elif values["type"] != "multimodal":
raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}")

@@ -468,6 +488,7 @@ def check_task(task: str) -> Tuple[str, Dict, Any]:
- `"text2text-generation"`
- `"text-classification"` (alias `"sentiment-analysis"` available)
- `"text-generation"`
- `"text-to-speech"` (alias `"text-to-audio"` available)
- `"token-classification"` (alias `"ner"` available)
- `"translation"`
- `"translation_xx_to_yy"`
@@ -510,6 +531,7 @@ def pipeline(
tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None,
feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None,
image_processor: Optional[Union[str, BaseImageProcessor]] = None,
processor: Optional[Union[str, ProcessorMixin]] = None,
framework: Optional[str] = None,
revision: Optional[str] = None,
use_fast: bool = True,
@@ -554,6 +576,7 @@
- `"text-classification"` (alias `"sentiment-analysis"` available): will return a
[`TextClassificationPipeline`].
- `"text-generation"`: will return a [`TextGenerationPipeline`]:.
- `"text-to-speech"` (alias `"text-to-audio"` available): will return a [`TextToSpeechPipeline`]:.
- `"token-classification"` (alias `"ner"` available): will return a [`TokenClassificationPipeline`].
- `"translation"`: will return a [`TranslationPipeline`].
- `"translation_xx_to_yy"`: will return a [`TranslationPipeline`].
@@ -800,6 +823,7 @@ def pipeline(
load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None
load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None
[Review comment, Contributor]
Suggested change (delete this line):
    load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None

# If `model` (instance of `PretrainedModel` instead of `str`) is passed (and/or same for config), while
# `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some
@@ -855,6 +879,8 @@ def pipeline(
load_feature_extractor = False
if task in NO_IMAGE_PROCESSOR_TASKS:
load_image_processor = False
if task in NO_PROCESSOR_TASKS:
load_processor = False

if load_tokenizer:
# Try to infer tokenizer from model or config name (if provided as str)
@@ -960,6 +986,35 @@ def pipeline(
if not is_pyctcdecode_available():
logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode")

if load_processor:
# Try to infer processor from model or config name (if provided as str)
if processor is None:
if isinstance(model_name, str):
processor = model_name
elif isinstance(config, str):
processor = config
else:
# Impossible to guess which processor to use here
raise Exception(
"Impossible to guess which processor to use. "
"Please provide a ProcessorMixin class or a path/identifier to a pretrained processor."
)

# Instantiate processor if needed
if isinstance(processor, (str, tuple)):
if isinstance(processor, tuple):
# For tuple we have (processor name, {kwargs})
processor_identifier = processor[0]
processor_kwargs = processor[1]
else:
processor_identifier = processor
processor_kwargs = model_kwargs.copy()
processor_kwargs.pop("torch_dtype", None)

processor = AutoProcessor.from_pretrained(
processor_identifier, _from_pipeline=task, **hub_kwargs, **processor_kwargs
)

if task == "translation" and model.config.task_specific_params:
for key in model.config.task_specific_params:
if key.startswith("translation"):
@@ -982,6 +1037,9 @@
if image_processor is not None:
kwargs["image_processor"] = image_processor

if processor is not None:
kwargs["processor"] = processor

if device is not None:
kwargs["device"] = device

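At this revision, the new processor plumbing above gives two call paths; a short sketch of both, mirroring how tokenizer and feature_extractor are handled:

```python
from transformers import AutoProcessor, pipeline

# 1) Let the pipeline infer the processor from the model name, as coded above.
tts = pipeline("text-to-speech", model="suno/bark-small")

# 2) Pass a processor explicitly (preloaded here; a string identifier also works).
processor = AutoProcessor.from_pretrained("suno/bark-small")
tts = pipeline("text-to-speech", model="suno/bark-small", processor=processor)
```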
15 changes: 14 additions & 1 deletion src/transformers/pipelines/base.py

@@ -866,6 +866,9 @@ def save_pretrained(self, save_directory: str, safe_serialization: bool = False)
if self.feature_extractor is not None:
self.feature_extractor.save_pretrained(save_directory)

if self.processor is not None:
self.processor.save_pretrained(save_directory)

if self.modelcard is not None:
self.modelcard.save_pretrained(save_directory)

@@ -1048,9 +1051,19 @@ def get_iterator(
if "TOKENIZERS_PARALLELISM" not in os.environ:
logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

tokenizer = self.tokenizer

# TODO hack by collating feature_extractor and image_processor
feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)

# TODO hack if processor exists by using the processor tokenizer and/or feature extractor
if feature_extractor is None and self.processor.feature_extractor_class is not None:
feature_extractor = self.processor.feature_extractor
if tokenizer is None and self.processor.tokenizer_class is not None:
tokenizer = self.processor.tokenizer

collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(tokenizer, feature_extractor)
dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
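The processor fallback above can be exercised on its own; a sketch assuming a Bark-style processor, which wraps a tokenizer but no feature extractor:

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("suno/bark-small")

# Mirrors the hack above: pull collation helpers out of the processor when
# the pipeline holds neither a tokenizer nor a feature extractor itself.
tokenizer = processor.tokenizer if processor.tokenizer_class is not None else None
feature_extractor = (
    processor.feature_extractor if processor.feature_extractor_class is not None else None
)
print(type(tokenizer).__name__, feature_extractor)  # e.g. BertTokenizerFast None
```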