diff --git a/test/test_utils.py b/test/test_utils.py index 0896f41506a..74a7792fc04 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1401,8 +1401,49 @@ def test_get_element_by_class(self): ''' self.assertEqual(get_element_by_class('foo', html), 'nice') + self.assertEqual(get_element_by_class('foo', html, include_tag=True), 'nice') self.assertEqual(get_element_by_class('no-such-class', html), None) + html = ''' + + ''' + + self.assertEqual(get_element_by_class('foo', html), None) + self.assertEqual(get_element_by_class('foo', html, include_tag=True), '') + + html = ''' + + ''' + + self.assertEqual(get_element_by_class('foo', html), '') + self.assertEqual(get_element_by_class('foo', html, include_tag=True), '') + + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('content-section__wrap', html), 'nice') + self.assertEqual(get_element_by_class('content-section__wrap', html, include_tag=True), 'nice') + + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('-test-hyphen', html), 'nice') + + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('_test_underscore', html), 'nice') + + html = ''' + nice + ''' + + self.assertEqual(get_element_by_class('ä-umlaut', html), 'nice') + self.assertEqual(get_element_by_class('↑-unicode', html), 'nice') + def test_get_element_by_attribute(self): html = ''' nice diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index c79c58e8281..21dfe850dd7 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -2,8 +2,10 @@ from .common import InfoExtractor from ..utils import ( - unified_strdate, clean_html, + extract_attributes, + get_element_by_class, + unified_strdate, ) @@ -40,19 +42,23 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://archive.org/embed/' + video_id, video_id) - jwplayer_playlist = self._parse_json(self._search_regex( - 
r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", - webpage, 'jwplayer playlist'), video_id) + input_element_with_playlist = get_element_by_class( + 'js-play8-playlist', webpage, include_tag=True) + jwplayer_playlist = self._parse_json(extract_attributes( + input_element_with_playlist)['value'], video_id) info = self._parse_jwplayer_data( {'playlist': jwplayer_playlist}, video_id, base_url=url) def get_optional(metadata, field): return metadata.get(field, [None])[0] - metadata = self._download_json( + json_metadata = self._download_json( 'http://archive.org/details/' + video_id, video_id, query={ 'output': 'json', - })['metadata'] + }, fatal=False) + metadata = (json_metadata.get('metadata', {}) + if isinstance(json_metadata, dict) + else {}) info.update({ 'title': get_optional(metadata, 'title') or info.get('title'), 'description': clean_html(get_optional(metadata, 'description')), diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index f6204692a81..4149f4dc5f9 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1926,32 +1926,55 @@ def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): return n.attrib[key] -def get_element_by_id(id, html): - """Return the content of the tag with the specified ID in the passed HTML document""" - return get_element_by_attribute('id', id, html) +def get_element_by_id(id, html, include_tag=False): + """ + Return the content of the tag with the specified ID in the passed HTML document. + + The whole element, including its tag, is returned when `include_tag` is `True`. + """ + return get_element_by_attribute('id', id, html, include_tag=include_tag) + +def get_element_by_class(class_name, html, include_tag=False): + """ + Return the content of the first tag with the specified class in the passed HTML document. 
-def get_element_by_class(class_name, html): - """Return the content of the first tag with the specified class in the passed HTML document""" - retval = get_elements_by_class(class_name, html) + The whole element, including its tag, is returned when `include_tag` is `True`. + """ + retval = get_elements_by_class(class_name, html, include_tag) return retval[0] if retval else None -def get_element_by_attribute(attribute, value, html, escape_value=True): - retval = get_elements_by_attribute(attribute, value, html, escape_value) +def get_element_by_attribute(attribute, value, html, escape_value=True, + include_tag=False): + """ + Return the content of the first tag with the specified attribute in the passed HTML document. + + The whole element, including its tag, is returned when `include_tag` is `True`. + """ + retval = get_elements_by_attribute(attribute, value, html, escape_value, + include_tag) return retval[0] if retval else None -def get_elements_by_class(class_name, html): - """Return the content of all tags with the specified class in the passed HTML document as a list""" +def get_elements_by_class(class_name, html, include_tag=False): + """ + Return the content of all tags with the specified class in the passed HTML document as a list. + + The whole elements, including their tags, are returned when `include_tag` is `True`. + """ return get_elements_by_attribute( - 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), - html, escape_value=False) + 'class', r'[^\'"]*(? + \s*(?:\/\s*>|> (?P.*?) - + ) ''' % (re.escape(attribute), value), html): - res = m.group('content') + res = m.group(0) if include_tag else m.group('content') + if res is None: + continue if res.startswith('"') or res.startswith("'"): res = res[1:-1] @@ -1981,7 +2006,10 @@ def __init__(self): compat_HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): - self.attrs = dict(attrs) + # Make sure we're looking at the first attributes. Later ones are from + # embedded elements. 
+ if not self.attrs: + self.attrs = dict(attrs) def extract_attributes(html_element):