"""Management command to transcript a video."""

from django.core.exceptions import ValidationError
from django.core.management import BaseCommand

from marsha.core import defaults
from marsha.core.models import TimedTextTrack, Video
from marsha.core.utils.transcript_utils import transcript


class Command(BaseCommand):
    """Transcript a video.

    With ``--video-id`` the given video is transcripted, provided it is ready
    and has no transcript track yet. Without it, the most recently created
    ready video that has no transcript track is selected.
    """

    help = "Transcript a video"

    def add_arguments(self, parser):
        """Register the optional ``--video-id`` command line argument."""
        parser.add_argument(
            "--video-id",
            type=str,
            help=(
                "Id of the video to transcript. When omitted, the most recently "
                "created ready video without transcript is used."
            ),
        )

    def handle(self, *args, **options):
        """Selects a video to transcript and starts the transcription job."""
        video_id = options["video_id"]
        video = (
            self._get_requested_video(video_id)
            if video_id
            else self._get_next_video()
        )
        if video is None:
            # The selection helper already reported why nothing can be done.
            return

        try:
            self.stdout.write(f"Try to transcript video {video.id}")
            transcript(video)
            self.stdout.write(f"Transcription job started for video {video.id}")
        # Any failure is reported on stderr instead of being propagated.
        except Exception as e:  # pylint: disable=broad-except
            self.stderr.write(f"Error: {e}")

    def _get_requested_video(self, video_id):
        """Return the video matching ``video_id`` if it can be transcripted.

        Writes the reason on stdout and returns ``None`` when the id matches no
        video, when the video is not ready, or when it already has a
        transcript track.
        """
        try:
            video = Video.objects.get(id=video_id)
        except (Video.DoesNotExist, ValidationError, ValueError):
            # The id is free text coming from the command line. A value that
            # cannot be converted to the primary key type is rejected by the
            # ORM with ValidationError or ValueError (depending on the field
            # type) before any lookup happens; such an id cannot match a video
            # either, so it is reported the same way instead of crashing.
            self.stdout.write(f"No video matches the provided id: {video_id}")
            return None

        if video.upload_state != defaults.READY:
            self.stdout.write(f"Video {video_id} is not ready")
            return None

        if video.timedtexttracks.filter(mode=TimedTextTrack.TRANSCRIPT).exists():
            self.stdout.write(f"Transcript already exists for video {video_id}")
            return None

        return video

    def _get_next_video(self):
        """Return the most recently created ready video without transcript.

        Writes a message on stdout and returns ``None`` when no video
        qualifies.
        """
        excluded_timed_text_tracks = TimedTextTrack.objects.filter(
            mode=TimedTextTrack.TRANSCRIPT
        )
        video = (
            Video.objects.exclude(timedtexttracks__in=excluded_timed_text_tracks)
            .filter(upload_state=defaults.READY)
            .order_by("-created_on")
            .first()
        )
        if not video:
            self.stdout.write("No video to transcript")
        return video
"""Test transcript_video command."""

from io import StringIO
from unittest.mock import patch

from django.core.management import call_command
from django.test import TestCase

from marsha.core import defaults
from marsha.core.factories import TimedTextTrackFactory, VideoFactory
from marsha.core.management.commands import transcript_video
from marsha.core.models import TimedTextTrack


@patch.object(transcript_video, "transcript")
class TranscriptVideoTestCase(TestCase):
    """
    Exercise the transcript_video command.

    The `transcript` function used by the command is patched for every test,
    each test receiving the mock as `mock_transcript`.
    """

    def setUp(self):
        """
        Create the buffer capturing what the command writes on stdout.
        """
        self.stdout = StringIO()

    def _run_command(self, **options):
        """
        Run the command with the given options and return its stdout lines.
        """
        call_command("transcript_video", stdout=self.stdout, **options)
        return self.stdout.getvalue().splitlines()

    @staticmethod
    def _job_started_lines(video):
        """
        Return the lines expected on stdout when a job is started for `video`.
        """
        return [
            f"Try to transcript video {video.id}",
            f"Transcription job started for video {video.id}",
        ]

    def test_transcript_video_no_videos(self, mock_transcript):
        """
        Without any video, nothing is transcripted and a message says so.
        """
        output = self._run_command()

        self.assertListEqual(output, ["No video to transcript"])
        mock_transcript.assert_not_called()

    def test_transcript_video_first_video(self, mock_transcript):
        """
        Without id, the ready video created last is the one transcripted.
        """
        VideoFactory(upload_state=defaults.READY)
        latest_video = VideoFactory(upload_state=defaults.READY)

        output = self._run_command()

        self.assertListEqual(output, self._job_started_lines(latest_video))
        mock_transcript.assert_called_once_with(latest_video)

    def test_transcript_video_not_ready(self, mock_transcript):
        """
        Without id, a video which is not ready is never selected.
        """
        VideoFactory(upload_state=defaults.PENDING)

        output = self._run_command()

        self.assertListEqual(output, ["No video to transcript"])
        mock_transcript.assert_not_called()

    def test_transcript_video_already_transcript(self, mock_transcript):
        """
        Without id, a video already having a transcript is never selected.
        """
        ready_video = VideoFactory(upload_state=defaults.READY)
        TimedTextTrackFactory(video=ready_video, mode=TimedTextTrack.TRANSCRIPT)

        output = self._run_command()

        self.assertListEqual(output, ["No video to transcript"])
        mock_transcript.assert_not_called()

    def test_transcript_video_deleted_transcript(self, mock_transcript):
        """
        Without id, a video whose transcript was deleted is selected again.
        """
        track = TimedTextTrackFactory(
            video=VideoFactory(upload_state=defaults.READY),
            mode=TimedTextTrack.TRANSCRIPT,
        )
        track.delete()
        # The deleted track is still stored, it is only hidden by default.
        self.assertEqual(TimedTextTrack.objects.all(force_visibility=True).count(), 1)

        output = self._run_command()

        self.assertListEqual(output, self._job_started_lines(track.video))
        mock_transcript.assert_called_once_with(track.video)

    def test_transcript_video_unknown_argument(self, mock_transcript):
        """
        With an id matching no video, nothing is transcripted.
        """
        output = self._run_command(video_id=1)

        self.assertListEqual(output, ["No video matches the provided id: 1"])
        mock_transcript.assert_not_called()

    def test_transcript_video_argument(self, mock_transcript):
        """
        With an id, the matching video is transcripted, whatever the others.
        """
        VideoFactory(upload_state=defaults.READY)
        requested_video = VideoFactory(upload_state=defaults.READY)
        VideoFactory(upload_state=defaults.READY)

        output = self._run_command(video_id=requested_video.id)

        self.assertListEqual(output, self._job_started_lines(requested_video))
        mock_transcript.assert_called_once_with(requested_video)

    def test_transcript_video_argument_not_ready(self, mock_transcript):
        """
        With an id, a video which is not ready is not transcripted.
        """
        pending_video = VideoFactory(upload_state=defaults.PENDING)

        output = self._run_command(video_id=pending_video.id)

        self.assertListEqual(output, [f"Video {pending_video.id} is not ready"])
        mock_transcript.assert_not_called()

    def test_transcript_video_argument_already_transcript(self, mock_transcript):
        """
        With an id, a video already having a transcript is not transcripted.
        """
        track = TimedTextTrackFactory(
            video=VideoFactory(upload_state=defaults.READY),
            mode=TimedTextTrack.TRANSCRIPT,
        )

        output = self._run_command(video_id=track.video.id)

        self.assertListEqual(
            output,
            [f"Transcript already exists for video {track.video.id}"],
        )
        mock_transcript.assert_not_called()

    def test_transcript_video_argument_deleted_transcript(self, mock_transcript):
        """
        With an id, a video whose transcript was deleted is transcripted.
        """
        track = TimedTextTrackFactory(
            video=VideoFactory(upload_state=defaults.READY),
            mode=TimedTextTrack.TRANSCRIPT,
        )
        track.delete()
        # The deleted track is still stored, it is only hidden by default.
        self.assertEqual(TimedTextTrack.objects.all(force_visibility=True).count(), 1)

        output = self._run_command(video_id=track.video.id)

        self.assertListEqual(output, self._job_started_lines(track.video))
        mock_transcript.assert_called_once_with(track.video)
+ with patch.object( channel_layers_utils, "dispatch_timed_text_track" - ) as mock_dispatch_timed_text_track, mock.patch.object( + ) as mock_dispatch_timed_text_track, patch.object( channel_layers_utils, "dispatch_video" ) as mock_dispatch_video: - transcription_ended_callback(transcripted_video, language, vtt_path) + transcript_utils.transcription_ended_callback( + transcripted_video, language, vtt_path + ) timed_text_track = video.timedtexttracks.get() self.assertEqual(timed_text_track.language, language) @@ -65,3 +72,92 @@ def test_transcription_ended_callback(self): mock_dispatch_timed_text_track.assert_called_once_with(timed_text_track) mock_dispatch_video.assert_called_once_with(video) + + @patch.object(transcript_utils, "launch_video_transcript") + def test_transcript_video_no_video(self, mock_launch_video_transcript): + """ + Should not call the launch_video_transcript function + if there is no video to transcript. + """ + + with self.assertRaises(transcript_utils.TranscriptError) as context: + transcript_utils.transcript(None) + + self.assertEqual(str(context.exception), "No video to transcript") + mock_launch_video_transcript.delay.assert_not_called() + + @patch.object(transcript_utils, "launch_video_transcript") + def test_transcript_video_already_transcript(self, mock_launch_video_transcript): + """ + Should not call the launch_video_transcript function + if the video already has a transcript. 
+ """ + timed_text_track = TimedTextTrackFactory( + video=VideoFactory(upload_state=defaults.READY), + language=settings.LANGUAGES[0][0], + mode=TimedTextTrack.TRANSCRIPT, + ) + + with self.assertRaises(transcript_utils.TranscriptError) as context: + transcript_utils.transcript(timed_text_track.video) + + self.assertEqual( + str(context.exception), + f"A transcript already exists for video {timed_text_track.video.id}", + ) + mock_launch_video_transcript.delay.assert_not_called() + + @patch.object(transcript_utils, "launch_video_transcript") + def test_transcript_video_peertube_pipeline(self, mock_launch_video_transcript): + """ + Should call the launch_video_transcript function + if the video pipeline is peertube. + """ + video = VideoFactory(transcode_pipeline=defaults.PEERTUBE_PIPELINE) + + transcript_utils.transcript(video) + + mock_launch_video_transcript.delay.assert_called_once_with( + video_pk=video.id, + stamp=video.uploaded_on_stamp(), + domain="https://example.com", + ) + self.assertEqual(video.timedtexttracks.count(), 1) + + @patch.object(transcript_utils, "launch_video_transcript") + def test_transcript_video_not_peertube_pipeline(self, mock_launch_video_transcript): + """ + Should call the launch_video_transcript function + if the video pipeline is not peertube. 
+ """ + video = VideoFactory(transcode_pipeline=defaults.AWS_PIPELINE) + + transcript_utils.transcript(video) + + mock_launch_video_transcript.delay.assert_called_once_with( + video_pk=video.id, + stamp=video.uploaded_on_stamp(), + domain="https://example.com", + video_url=f"https://example.com/api/videos/{video.id}/transcript-source/", + ) + self.assertEqual(video.timedtexttracks.count(), 1) + + @patch.object(transcript_utils, "launch_video_transcript") + @override_settings(TRANSCODING_CALLBACK_DOMAIN="https://callback.com") + def test_transcript_video_callback_domain_setting( + self, mock_launch_video_transcript + ): + """ + Should call the launch_video_transcript function + with the callback domain setting. + """ + video = VideoFactory(transcode_pipeline=defaults.PEERTUBE_PIPELINE) + + transcript_utils.transcript(video) + + mock_launch_video_transcript.delay.assert_called_once_with( + video_pk=video.id, + stamp=video.uploaded_on_stamp(), + domain="https://callback.com", + ) + self.assertEqual(video.timedtexttracks.count(), 1) diff --git a/src/backend/marsha/core/utils/transcript_utils.py b/src/backend/marsha/core/utils/transcript_utils.py index 866bc286af..c6180ab395 100644 --- a/src/backend/marsha/core/utils/transcript_utils.py +++ b/src/backend/marsha/core/utils/transcript_utils.py @@ -1,14 +1,60 @@ """Utils related to transcoding""" +from django.conf import settings +from django.contrib.sites.models import Site +from django.db import IntegrityError +from django.urls import reverse + from django_peertube_runner_connector.models import Video as TranscriptedVideo from marsha.core import defaults from marsha.core.models import TimedTextTrack, Video from marsha.core.storage.storage_class import video_storage +from marsha.core.tasks.video import launch_video_transcript from marsha.core.utils.time_utils import to_datetime from marsha.websocket.utils import channel_layers_utils +class TranscriptError(Exception): + """Error raised when an error occurs during the 
class TranscriptError(Exception):
    """Error raised when an error occurs during the transcript process"""


def transcript(video):
    """Create a transcript for a video.

    A timed text track in transcript mode, using the first configured language
    and the PROCESSING upload state, is created for the video. The
    `launch_video_transcript` task is then queued through its `delay` method.

    The callback domain passed to the task is the
    `TRANSCODING_CALLBACK_DOMAIN` setting when it is set, otherwise the domain
    of the current site over https. For a video not using the peertube
    pipeline, the url of its transcript source is passed to the task as well.

    Parameters
    ----------
    video : Video
        The video to transcript.

    Raises
    ------
    TranscriptError
        When no video is provided, or when creating the timed text track
        raises an IntegrityError, which is reported as the transcript already
        existing.
    """
    # Imported here so this function does not depend on an edit of the
    # module-level imports.
    from django.db import transaction  # pylint: disable=import-outside-toplevel

    if not video:
        raise TranscriptError("No video to transcript")

    try:
        # The insert runs in its own atomic block: when the caller is already
        # inside a transaction, a failed insert then only rolls back to the
        # savepoint. Without it the outer transaction is left in an unusable
        # state once the IntegrityError has been caught.
        with transaction.atomic():
            TimedTextTrack.objects.create(
                video=video,
                language=settings.LANGUAGES[0][0],
                mode=TimedTextTrack.TRANSCRIPT,
                upload_state=defaults.PROCESSING,
            )
    except IntegrityError as e:
        raise TranscriptError(
            f"A transcript already exists for video {video.id}"
        ) from e

    domain = (
        settings.TRANSCODING_CALLBACK_DOMAIN
        or f"https://{Site.objects.get_current().domain}"
    )

    transcript_args = {
        "video_pk": video.id,
        "stamp": video.uploaded_on_stamp(),
        "domain": domain,
    }

    if video.transcode_pipeline != defaults.PEERTUBE_PIPELINE:
        transcript_args["video_url"] = domain + reverse(
            "videos-transcript-source", kwargs={"pk": video.id}
        )

    # NOTE(review): if queuing the task fails, the track created above stays
    # in PROCESSING state and a later call for the same video is presumably
    # rejected as "already exists" — confirm whether a cleanup is wanted.
    launch_video_transcript.delay(**transcript_args)