diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py
index 68ea928bce5672..057910098da20a 100644
--- a/src/transformers/pipelines/video_classification.py
+++ b/src/transformers/pipelines/video_classification.py
@@ -1,3 +1,17 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
 from io import BytesIO
 from typing import List, Union
 
@@ -42,7 +56,7 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, "av")
         self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES)
 
-    def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None):
+    def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None, function_to_apply=None):
         preprocess_params = {}
         if frame_sampling_rate is not None:
             preprocess_params["frame_sampling_rate"] = frame_sampling_rate
@@ -52,14 +66,23 @@ def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=
         postprocess_params = {}
         if top_k is not None:
             postprocess_params["top_k"] = top_k
+        if function_to_apply is not None:
+            if function_to_apply not in ["softmax", "sigmoid", "none"]:
+                raise ValueError(
+                    f"Invalid value for `function_to_apply`: {function_to_apply}. "
+                    "Valid options are ['softmax', 'sigmoid', 'none']"
+                )
+            postprocess_params["function_to_apply"] = function_to_apply
+        else:
+            postprocess_params["function_to_apply"] = "softmax"
         return preprocess_params, {}, postprocess_params
 
-    def __call__(self, videos: Union[str, List[str]], **kwargs):
+    def __call__(self, inputs: Union[str, List[str]] = None, **kwargs):
         """
         Assign labels to the video(s) passed as inputs.
 
         Args:
-            videos (`str`, `List[str]`):
+            inputs (`str`, `List[str]`):
                 The pipeline handles three types of videos:
 
                 - A string containing a http link pointing to a video
@@ -76,6 +99,11 @@ def __call__(self, videos: Union[str, List[str]], **kwargs):
             frame_sampling_rate (`int`, *optional*, defaults to 1):
                 The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every
                 frame will be used.
+            function_to_apply(`str`, *optional*, defaults to "softmax"):
+                The function to apply to the model output. By default, the pipeline will apply the softmax function to
+                the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's
+                built-in `None` will default to "softmax", so you need to pass the string "none" to disable any
+                post-processing.
 
         Return:
             A dictionary or a list of dictionaries containing result. If the input is a single video, will return a
@@ -87,7 +115,16 @@ def __call__(self, videos: Union[str, List[str]], **kwargs):
             - **label** (`str`) -- The label identified by the model.
             - **score** (`int`) -- The score attributed by the model for that label.
         """
-        return super().__call__(videos, **kwargs)
+        # After deprecation of this is completed, remove the default `None` value for `images`
+        if "videos" in kwargs:
+            warnings.warn(
+                "The `videos` argument has been renamed to `inputs`. In version 5 of Transformers, `videos` will no longer be accepted",
+                FutureWarning,
+            )
+            inputs = kwargs.pop("videos")
+        if inputs is None:
+            raise ValueError("Cannot call the video-classification pipeline without an inputs argument!")
+        return super().__call__(inputs, **kwargs)
 
     def preprocess(self, video, num_frames=None, frame_sampling_rate=1):
         if num_frames is None:
@@ -114,12 +151,17 @@ def _forward(self, model_inputs):
         model_outputs = self.model(**model_inputs)
         return model_outputs
 
-    def postprocess(self, model_outputs, top_k=5):
+    def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"):
         if top_k > self.model.config.num_labels:
             top_k = self.model.config.num_labels
 
         if self.framework == "pt":
-            probs = model_outputs.logits.softmax(-1)[0]
+            if function_to_apply == "softmax":
+                probs = model_outputs.logits[0].softmax(-1)
+            elif function_to_apply == "sigmoid":
+                probs = model_outputs.logits[0].sigmoid()
+            else:
+                probs = model_outputs.logits[0]
             scores, ids = probs.topk(top_k)
         else:
             raise ValueError(f"Unsupported framework: {self.framework}")
diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py
index 8b910e94af3b47..f1ed97ac13df1a 100644
--- a/tests/pipelines/test_pipelines_video_classification.py
+++ b/tests/pipelines/test_pipelines_video_classification.py
@@ -14,11 +14,12 @@
 
 import unittest
 
-from huggingface_hub import hf_hub_download
+from huggingface_hub import VideoClassificationOutputElement, hf_hub_download
 
 from transformers import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, VideoMAEFeatureExtractor
 from transformers.pipelines import VideoClassificationPipeline, pipeline
 from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
     require_av,
@@ -76,6 +77,8 @@ def run_pipeline_test(self, video_classifier, examples):
                 {"score": ANY(float), "label": ANY(str)},
             ],
         )
+        for element in outputs:
+            compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
 
     @require_torch
     def test_small_model_pt(self):
@@ -93,6 +96,9 @@ def test_small_model_pt(self):
             nested_simplify(outputs, decimals=4),
             [{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}],
         )
+        for output in outputs:
+            for element in output:
+                compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
 
         outputs = video_classifier(
             [
@@ -108,6 +114,9 @@ def test_small_model_pt(self):
                 [{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}],
             ],
         )
+        for output in outputs:
+            for element in output:
+                compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
 
     @require_tf
     @unittest.skip
diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py
index 74bc1b8669a702..fe8a197237291a 100644
--- a/tests/test_pipeline_mixin.py
+++ b/tests/test_pipeline_mixin.py
@@ -34,6 +34,7 @@
     ImageToTextInput,
     ObjectDetectionInput,
     QuestionAnsweringInput,
+    VideoClassificationInput,
     ZeroShotImageClassificationInput,
 )
 
@@ -47,6 +48,7 @@
     ImageToTextPipeline,
     ObjectDetectionPipeline,
     QuestionAnsweringPipeline,
+    VideoClassificationPipeline,
     ZeroShotImageClassificationPipeline,
 )
 from transformers.testing_utils import (
@@ -132,6 +134,7 @@
     "image-to-text": (ImageToTextPipeline, ImageToTextInput),
     "object-detection": (ObjectDetectionPipeline, ObjectDetectionInput),
     "question-answering": (QuestionAnsweringPipeline, QuestionAnsweringInput),
+    "video-classification": (VideoClassificationPipeline, VideoClassificationInput),
     "zero-shot-image-classification": (ZeroShotImageClassificationPipeline, ZeroShotImageClassificationInput),
 }
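
A minimal usage sketch of what this patch enables (not part of the diff itself): it exercises the new `function_to_apply` argument and the `videos` -> `inputs` rename. The checkpoint name and video path below are placeholders rather than values taken from the patch; any video-classification model on the Hub should work.

from transformers import pipeline

# Placeholder checkpoint and file path, for illustration only.
classifier = pipeline(
    "video-classification",
    model="MCG-NJU/videomae-base-finetuned-kinetics",
)

# New keyword added by this patch: "softmax" (default), "sigmoid", or "none".
# Passing Python's None still falls back to softmax; the string "none" skips
# post-processing and reports raw logits as scores.
predictions = classifier(
    "path/to/video.mp4",
    top_k=2,
    function_to_apply="sigmoid",
)
print(predictions)

# The old keyword still works for now but emits a FutureWarning, since `videos`
# is scheduled for removal in Transformers v5.
predictions = classifier(videos="path/to/video.mp4", top_k=2)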