diff --git a/src/transformers/pipelines/video_classification.py b/src/transformers/pipelines/video_classification.py
index 68ea928bce5672..057910098da20a 100644
--- a/src/transformers/pipelines/video_classification.py
+++ b/src/transformers/pipelines/video_classification.py
@@ -1,3 +1,17 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
 from io import BytesIO
 from typing import List, Union
 
@@ -42,7 +56,7 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, "av")
         self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES)
 
-    def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None):
+    def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None, function_to_apply=None):
         preprocess_params = {}
         if frame_sampling_rate is not None:
             preprocess_params["frame_sampling_rate"] = frame_sampling_rate
@@ -52,14 +66,23 @@ def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=
         postprocess_params = {}
         if top_k is not None:
             postprocess_params["top_k"] = top_k
+        if function_to_apply is not None:
+            if function_to_apply not in ["softmax", "sigmoid", "none"]:
+                raise ValueError(
+                    f"Invalid value for `function_to_apply`: {function_to_apply}. "
+                    "Valid options are ['softmax', 'sigmoid', 'none']"
+                )
+            postprocess_params["function_to_apply"] = function_to_apply
+        else:
+            postprocess_params["function_to_apply"] = "softmax"
         return preprocess_params, {}, postprocess_params
 
-    def __call__(self, videos: Union[str, List[str]], **kwargs):
+    def __call__(self, inputs: Union[str, List[str]] = None, **kwargs):
         """
         Assign labels to the video(s) passed as inputs.
 
         Args:
-            videos (`str`, `List[str]`):
+            inputs (`str`, `List[str]`):
                 The pipeline handles three types of videos:
 
                 - A string containing a http link pointing to a video
@@ -76,6 +99,11 @@ def __call__(self, videos: Union[str, List[str]], **kwargs):
             frame_sampling_rate (`int`, *optional*, defaults to 1):
                 The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every
                 frame will be used.
+            function_to_apply(`str`, *optional*, defaults to "softmax"):
+                The function to apply to the model output. By default, the pipeline will apply the softmax function to
+                the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's
+                built-in `None` will default to "softmax", so you need to pass the string "none" to disable any
+                post-processing.
 
         Return:
             A dictionary or a list of dictionaries containing result. If the input is a single video, will return a
@@ -87,7 +115,16 @@ def __call__(self, videos: Union[str, List[str]], **kwargs):
             - **label** (`str`) -- The label identified by the model.
             - **score** (`int`) -- The score attributed by the model for that label.
         """
-        return super().__call__(videos, **kwargs)
+        # After deprecation of this is completed, remove the default `None` value for `images`
+        if "videos" in kwargs:
+            warnings.warn(
+                "The `videos` argument has been renamed to `inputs`. In version 5 of Transformers, `videos` will no longer be accepted",
+                FutureWarning,
+            )
+            inputs = kwargs.pop("videos")
+        if inputs is None:
+            raise ValueError("Cannot call the video-classification pipeline without an inputs argument!")
+        return super().__call__(inputs, **kwargs)
 
     def preprocess(self, video, num_frames=None, frame_sampling_rate=1):
         if num_frames is None:
@@ -114,12 +151,17 @@ def _forward(self, model_inputs):
         model_outputs = self.model(**model_inputs)
         return model_outputs
 
-    def postprocess(self, model_outputs, top_k=5):
+    def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"):
         if top_k > self.model.config.num_labels:
             top_k = self.model.config.num_labels
 
         if self.framework == "pt":
-            probs = model_outputs.logits.softmax(-1)[0]
+            if function_to_apply == "softmax":
+                probs = model_outputs.logits[0].softmax(-1)
+            elif function_to_apply == "sigmoid":
+                probs = model_outputs.logits[0].sigmoid()
+            else:
+                probs = model_outputs.logits[0]
             scores, ids = probs.topk(top_k)
         else:
             raise ValueError(f"Unsupported framework: {self.framework}")
diff --git a/tests/pipelines/test_pipelines_video_classification.py b/tests/pipelines/test_pipelines_video_classification.py
index 8b910e94af3b47..f1ed97ac13df1a 100644
--- a/tests/pipelines/test_pipelines_video_classification.py
+++ b/tests/pipelines/test_pipelines_video_classification.py
@@ -14,11 +14,12 @@
 
 import unittest
 
-from huggingface_hub import hf_hub_download
+from huggingface_hub import VideoClassificationOutputElement, hf_hub_download
 
 from transformers import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, VideoMAEFeatureExtractor
 from transformers.pipelines import VideoClassificationPipeline, pipeline
 from transformers.testing_utils import (
+    compare_pipeline_output_to_hub_spec,
     is_pipeline_test,
     nested_simplify,
     require_av,
@@ -76,6 +77,8 @@ def run_pipeline_test(self, video_classifier, examples):
                 {"score": ANY(float), "label": ANY(str)},
             ],
         )
+        for element in outputs:
+            compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
 
     @require_torch
     def test_small_model_pt(self):
@@ -93,6 +96,9 @@ def test_small_model_pt(self):
             nested_simplify(outputs, decimals=4),
             [{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}],
         )
+        for output in outputs:
+            for element in output:
+                compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
 
         outputs = video_classifier(
             [
@@ -108,6 +114,9 @@ def test_small_model_pt(self):
                 [{"score": 0.5199, "label": "LABEL_0"}, {"score": 0.4801, "label": "LABEL_1"}],
             ],
         )
+        for output in outputs:
+            for element in output:
+                compare_pipeline_output_to_hub_spec(element, VideoClassificationOutputElement)
 
     @require_tf
     @unittest.skip
diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py
index 74bc1b8669a702..fe8a197237291a 100644
--- a/tests/test_pipeline_mixin.py
+++ b/tests/test_pipeline_mixin.py
@@ -34,6 +34,7 @@
     ImageToTextInput,
     ObjectDetectionInput,
     QuestionAnsweringInput,
+    VideoClassificationInput,
     ZeroShotImageClassificationInput,
 )
 
@@ -47,6 +48,7 @@
     ImageToTextPipeline,
     ObjectDetectionPipeline,
     QuestionAnsweringPipeline,
+    VideoClassificationPipeline,
     ZeroShotImageClassificationPipeline,
 )
 from transformers.testing_utils import (
@@ -132,6 +134,7 @@
     "image-to-text": (ImageToTextPipeline, ImageToTextInput),
     "object-detection": (ObjectDetectionPipeline, ObjectDetectionInput),
     "question-answering": (QuestionAnsweringPipeline, QuestionAnsweringInput),
+    "video-classification": (VideoClassificationPipeline, VideoClassificationInput),
     "zero-shot-image-classification": (ZeroShotImageClassificationPipeline, ZeroShotImageClassificationInput),
 }
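
A minimal usage sketch of what this patch enables (not part of the diff itself): it exercises the new `function_to_apply` argument and the `videos` -> `inputs` rename. The checkpoint name and video path below are placeholders rather than values taken from the patch; any video-classification model on the Hub should work.

from transformers import pipeline

# Placeholder checkpoint and file path, for illustration only.
classifier = pipeline(
    "video-classification",
    model="MCG-NJU/videomae-base-finetuned-kinetics",
)

# New keyword added by this patch: "softmax" (default), "sigmoid", or "none".
# Passing Python's None still falls back to softmax; the string "none" skips
# post-processing and reports raw logits as scores.
predictions = classifier(
    "path/to/video.mp4",
    top_k=2,
    function_to_apply="sigmoid",
)
print(predictions)

# The old keyword still works for now but emits a FutureWarning, since `videos`
# is scheduled for removal in Transformers v5.
predictions = classifier(videos="path/to/video.mp4", top_k=2)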