Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -261,22 +261,16 @@ def test_migrate_transcripts_exception_logging(self):
u'[Transcript migration] process for ge transcript started'),
(LOGGER_NAME,
'ERROR',
'[Transcript migration] Exception: u"SON(['
'(\'category\', \'asset\'), (\'name\', u\'not_found.srt\'),'
' (\'course\', u\'{}\'), (\'tag\', \'c4x\'), (\'org\', u\'{}\'),'
' (\'revision\', None)])"'.format(self.course_2.id.course, self.course_2.id.org)),
"[Transcript migration] Exception: u'No transcript for `ge` language'"),
(LOGGER_NAME,
'INFO',
u'[Transcript migration] process for course {} ended. Processed 1 transcripts'.format(
unicode(self.course_2.id)
)),
(LOGGER_NAME,
'INFO',
"[Transcript migration] Result: Failed: language ge of video test_edx_video_id_2 with exception SON(["
"('category', 'asset'), ('name', u'not_found.srt'), ('course', u'{}'),"
" ('tag', 'c4x'), ('org', u'{}'), ('revision', None)])".format(
self.course_2.id.course, self.course_2.id.org)
)
"[Transcript migration] Result: Failed: language ge of video test_edx_video_id_2 with exception "
"No transcript for `ge` language")
)

with LogCapture(LOGGER_NAME, level=logging.INFO) as logger:
Expand Down
75 changes: 59 additions & 16 deletions cms/djangoapps/contentstore/tests/test_transcripts_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -744,29 +744,34 @@ def setUp(self):
edx_video_id=u'1234-5678-90'
)

def create_transcript(self, subs_id, language=u'en', filename='video.srt'):
def create_transcript(self, subs_id, language=u'en', filename='video.srt', youtube_id_1_0='', html5_sources=None):
"""
create transcript.
"""
transcripts = {}
if language != u'en':
transcripts = {language: filename}

html5_sources = html5_sources or []
self.video = ItemFactory.create(
category='video',
parent_location=self.vertical.location,
sub=subs_id,
youtube_id_1_0=youtube_id_1_0,
transcripts=transcripts,
edx_video_id=u'1234-5678-90'
edx_video_id=u'1234-5678-90',
html5_sources=html5_sources
)

if subs_id:
transcripts_utils.save_subs_to_store(
self.subs_sjson,
subs_id,
self.video,
language=language,
)
possible_subs = [subs_id, youtube_id_1_0] + transcripts_utils.get_html5_ids(html5_sources)
for possible_sub in possible_subs:
if possible_sub:
transcripts_utils.save_subs_to_store(
self.subs_sjson,
possible_sub,
self.video,
language=language,
)

def create_srt_file(self, content):
"""
Expand Down Expand Up @@ -812,31 +817,69 @@ def test_get_transcript_not_found(self, lang):
)

@ddt.data(
# video.sub transcript
{
'language': u'en',
'subs_id': 'video_101',
'youtube_id_1_0': '',
'html5_sources': [],
'expected_filename': 'en_video_101.srt',
},
# if video.sub is present, rest will be skipped.
{
'language': u'en',
'subs_id': 'video_101',
'filename': 'en_video_101.srt',
'youtube_id_1_0': 'test_yt_id',
'html5_sources': ['www.abc.com/foo.mp4'],
'expected_filename': 'en_video_101.srt',
},
# video.youtube_id_1_0 transcript
{
'language': u'en',
'subs_id': '',
'youtube_id_1_0': 'test_yt_id',
'html5_sources': [],
'expected_filename': 'en_test_yt_id.srt',
},
# video.html5_sources transcript
{
'language': u'en',
'subs_id': '',
'youtube_id_1_0': '',
'html5_sources': ['www.abc.com/foo.mp4'],
'expected_filename': 'en_foo.srt',
},
# non-english transcript
{
'language': u'ur',
'subs_id': '',
'filename': 'ur_video_101.srt',
'youtube_id_1_0': '',
'html5_sources': [],
'expected_filename': 'ur_video_101.srt',
},
)
@ddt.unpack
def test_get_transcript_from_content_store(self, language, subs_id, filename):
def test_get_transcript_from_contentstore(
self,
language,
subs_id,
youtube_id_1_0,
html5_sources,
expected_filename
):
"""
Verify that `get_transcript` function returns correct data when transcript is in content store.
"""
self.upload_file(self.create_srt_file(self.subs_srt), self.video.location, filename)
self.create_transcript(subs_id, language, filename)
content, filename, mimetype = transcripts_utils.get_transcript(
base_filename = 'video_101.srt'
self.upload_file(self.create_srt_file(self.subs_srt), self.video.location, base_filename)
self.create_transcript(subs_id, language, base_filename, youtube_id_1_0, html5_sources)
content, file_name, mimetype = transcripts_utils.get_transcript(
self.video,
language
)

self.assertEqual(content, self.subs[language])
self.assertEqual(filename, filename)
self.assertEqual(file_name, expected_filename)
self.assertEqual(mimetype, self.srt_mime_type)

def test_get_transcript_from_content_store_for_ur(self):
Expand Down
39 changes: 20 additions & 19 deletions common/lib/xmodule/xmodule/video_module/transcripts_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,7 +859,7 @@ def get_transcript_from_val(edx_video_id, lang=None, output_format=Transcript.SR
"""
Get video transcript from edx-val.
Arguments:
edx_video_id (unicode): course identifier
edx_video_id (unicode): video identifier
lang (unicode): transcript language
output_format (unicode): transcript output format
Returns:
Expand Down Expand Up @@ -923,31 +923,32 @@ def get_transcript_from_contentstore(video, language, output_format, transcripts
Returns:
tuple containing content, filename, mimetype
"""
input_format, base_name, transcript_content = None, None, None
if output_format not in (Transcript.SRT, Transcript.SJSON, Transcript.TXT):
raise NotFoundError('Invalid transcript format `{output_format}`'.format(output_format=output_format))

sub, other_languages = transcripts_info['sub'], transcripts_info['transcripts']
transcripts = dict(other_languages)

# this is sent in case of a translation dispatch and we need to use it as our subs_id.
if youtube_id:
transcripts['en'] = youtube_id
elif sub:
transcripts['en'] = sub
elif video.youtube_id_1_0:
transcripts['en'] = video.youtube_id_1_0
elif language == u'en':
raise NotFoundError('No transcript for `en` language')

try:
input_format, base_name, transcript_content = get_transcript_for_video(
video.location,
subs_id=transcripts.get('en'),
file_name=transcripts[language],
language=language
)
except KeyError:
raise NotFoundError
possible_sub_ids = [youtube_id, sub, video.youtube_id_1_0] + get_html5_ids(video.html5_sources)
for sub_id in possible_sub_ids:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about `for sub_id in set(possible_sub_ids):` to avoid duplicates?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A `set` does not preserve the order of its elements, which is not what we want here: the candidate sub ids have to be tried in this specific priority order.

try:
transcripts[u'en'] = sub_id
input_format, base_name, transcript_content = get_transcript_for_video(
video.location,
subs_id=sub_id,
file_name=transcripts[language],
language=language
)
break
except (KeyError, NotFoundError):
continue

if transcript_content is None:
raise NotFoundError('No transcript for `{lang}` language'.format(
lang=language
))

# add language prefix to transcript file only if language is not None
language_prefix = '{}_'.format(language) if language else ''
Expand Down