diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 9fecd6c27a89eb..dbace650489bb4 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -305,6 +305,11 @@
 
 NO_FEATURE_EXTRACTOR_TASKS = set()
 NO_TOKENIZER_TASKS = set()
+# These model configs are special: they are generic over their task, meaning
+# any tokenizer/feature_extractor might be used for a given model, so we cannot
+# use the statically defined TOKENIZER_MAPPING and FEATURE_EXTRACTOR_MAPPING to
+# see if the model defines such objects or not.
+MULTI_MODEL_CONFIGS = {"VisionTextDualEncoderConfig", "SpeechEncoderDecoderConfig"}
 for task, values in SUPPORTED_TASKS.items():
     if values["type"] == "text":
         NO_FEATURE_EXTRACTOR_TASKS.add(task)
@@ -374,8 +379,9 @@ def check_task(task: str) -> Tuple[Dict, Any]:
         - `"zero-shot-image-classification"`
 
     Returns:
-        (task_defaults`dict`, task_options: (`tuple`, None)) The actual dictionary required to initialize the pipeline
-        and some extra task options for parametrized tasks like "translation_XX_to_YY"
+        (normalized_task: `str`, task_defaults: `dict`, task_options: (`tuple`, None)) The normalized task name
+        (alias and options removed), the actual dictionary required to initialize the pipeline, and some extra task
+        options for parametrized tasks like "translation_XX_to_YY"
 
     """
@@ -608,7 +614,7 @@ def pipeline(
             model, module_file + ".py", class_name, revision=revision, use_auth_token=use_auth_token
         )
     else:
-        targeted_task, task_options = check_task(task)
+        normalized_task, targeted_task, task_options = check_task(task)
         if pipeline_class is None:
             pipeline_class = targeted_task["impl"]
@@ -661,12 +667,36 @@ def pipeline(
     load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
     load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
 
+    if (
+        tokenizer is None
+        and not load_tokenizer
+        and normalized_task not in NO_TOKENIZER_TASKS
+        # Using class name to avoid importing the real class.
+        and model_config.__class__.__name__ in MULTI_MODEL_CONFIGS
+    ):
+        # This is a special category of models: they are fusions of multiple
+        # models, so the model_config might not define a tokenizer, but one
+        # seems to be necessary for the task, so we force-try to load it.
+        load_tokenizer = True
+    if (
+        feature_extractor is None
+        and not load_feature_extractor
+        and normalized_task not in NO_FEATURE_EXTRACTOR_TASKS
+        # Using class name to avoid importing the real class.
+        and model_config.__class__.__name__ in MULTI_MODEL_CONFIGS
+    ):
+        # This is a special category of models: they are fusions of multiple
+        # models, so the model_config might not define a feature_extractor, but
+        # one seems to be necessary for the task, so we force-try to load it.
+        load_feature_extractor = True
+
     if task in NO_TOKENIZER_TASKS:
         # These will never require a tokenizer.
         # the model on the other hand might have a tokenizer, but
         # the files could be missing from the hub, instead of failing
         # on such repos, we just force to not load it.
         load_tokenizer = False
+
     if task in NO_FEATURE_EXTRACTOR_TASKS:
         load_feature_extractor = False
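
A minimal sketch of what the force-loading above enables, using the tiny checkpoint
exercised by the updated test at the end of this patch (not an official usage recipe):

    from transformers import pipeline

    # "SpeechEncoderDecoderConfig" is listed in MULTI_MODEL_CONFIGS, so although the
    # generic config class appears in neither TOKENIZER_MAPPING nor
    # FEATURE_EXTRACTOR_MAPPING, the pipeline now force-tries to load both the
    # tokenizer and the feature extractor from the model repo; neither needs to be
    # passed explicitly anymore.
    speech_recognizer = pipeline(
        task="automatic-speech-recognition",
        model="hf-internal-testing/tiny-random-speech-encoder-decoder",
        framework="pt",
    )
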
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 29a12e7df22a30..7006ff2c00a2dd 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -630,7 +630,6 @@ def __iter__(self):
         for line in sys.stdin:
             # Split for multi-columns
             if "\t" in line:
-
                 line = line.split("\t")
                 if self.column:
                     # Dictionary to map arguments
@@ -752,7 +751,6 @@ def __init__(
         binary_output: bool = False,
         **kwargs,
     ):
-
         if framework is None:
             framework, model = infer_framework_load_model(model, config=model.config)
 
@@ -1121,18 +1119,19 @@ def get_supported_tasks(self) -> List[str]:
         supported_task.sort()
         return supported_task
 
-    def check_task(self, task: str) -> Tuple[Dict, Any]:
+    def check_task(self, task: str) -> Tuple[str, Dict, Any]:
         if task in self.task_aliases:
             task = self.task_aliases[task]
         if task in self.supported_tasks:
             targeted_task = self.supported_tasks[task]
-            return targeted_task, None
+            return task, targeted_task, None
 
         if task.startswith("translation"):
             tokens = task.split("_")
             if len(tokens) == 4 and tokens[0] == "translation" and tokens[2] == "to":
                 targeted_task = self.supported_tasks["translation"]
-                return targeted_task, (tokens[1], tokens[3])
+                task = "translation"
+                return task, targeted_task, (tokens[1], tokens[3])
             raise KeyError(f"Invalid translation task {task}, use 'translation_XX_to_YY' format")
 
         raise KeyError(
diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index 25bf520eafb4d7..0523639cc4fe85 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -141,15 +141,8 @@ def test_small_model_pt(self):
 
     @require_torch
     def test_small_model_pt_seq2seq(self):
-        model_id = "hf-internal-testing/tiny-random-speech-encoder-decoder"
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
-        speech_recognizer = pipeline(
-            task="automatic-speech-recognition",
-            model=model_id,
-            tokenizer=tokenizer,
-            feature_extractor=feature_extractor,
+        speech_recognizer = pipeline(
+            task="automatic-speech-recognition",
+            model="hf-internal-testing/tiny-random-speech-encoder-decoder",
             framework="pt",
         )
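
For reference, a short sketch of the new check_task contract described by the diff
above, assuming check_task is imported from transformers.pipelines, where this patch
edits its docstring (the translation pair is an arbitrary example):

    from transformers.pipelines import check_task

    # Parametrized tasks are normalized: the returned task name has the options
    # stripped, and the options come back as a separate tuple.
    normalized_task, targeted_task, task_options = check_task("translation_en_to_fr")
    assert normalized_task == "translation"
    assert task_options == ("en", "fr")

    # Plain tasks come back unchanged, with None for the options.
    normalized_task, targeted_task, task_options = check_task("automatic-speech-recognition")
    assert normalized_task == "automatic-speech-recognition"
    assert task_options is None
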