diff --git a/test/test_utils.py b/test/test_utils.py
index 0896f41506a..74a7792fc04 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1401,8 +1401,49 @@ def test_get_element_by_class(self):
'''
self.assertEqual(get_element_by_class('foo', html), 'nice')
+ self.assertEqual(get_element_by_class('foo', html, include_tag=True), 'nice')
self.assertEqual(get_element_by_class('no-such-class', html), None)
+ html = '''
+
+ '''
+
+ self.assertEqual(get_element_by_class('foo', html), None)
+ self.assertEqual(get_element_by_class('foo', html, include_tag=True), '')
+
+ html = '''
+
+ '''
+
+ self.assertEqual(get_element_by_class('foo', html), '')
+ self.assertEqual(get_element_by_class('foo', html, include_tag=True), '')
+
+ html = '''
+ nice
+ '''
+
+ self.assertEqual(get_element_by_class('content-section__wrap', html), 'nice')
+ self.assertEqual(get_element_by_class('content-section__wrap', html, include_tag=True), 'nice')
+
+ html = '''
+ nice
+ '''
+
+ self.assertEqual(get_element_by_class('-test-hyphen', html), 'nice')
+
+ html = '''
+ nice
+ '''
+
+ self.assertEqual(get_element_by_class('_test_underscore', html), 'nice')
+
+ html = '''
+ nice
+ '''
+
+ self.assertEqual(get_element_by_class('ä-umlaut', html), 'nice')
+ self.assertEqual(get_element_by_class('↑-unicode', html), 'nice')
+
def test_get_element_by_attribute(self):
html = '''
nice
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index c79c58e8281..21dfe850dd7 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -2,8 +2,10 @@
from .common import InfoExtractor
from ..utils import (
- unified_strdate,
clean_html,
+ extract_attributes,
+ get_element_by_class,
+ unified_strdate,
)
@@ -40,19 +42,23 @@ def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'http://archive.org/embed/' + video_id, video_id)
- jwplayer_playlist = self._parse_json(self._search_regex(
- r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)",
- webpage, 'jwplayer playlist'), video_id)
+ input_element_with_playlist = get_element_by_class(
+ 'js-play8-playlist', webpage, include_tag=True)
+ jwplayer_playlist = self._parse_json(extract_attributes(
+ input_element_with_playlist)['value'], video_id)
info = self._parse_jwplayer_data(
{'playlist': jwplayer_playlist}, video_id, base_url=url)
def get_optional(metadata, field):
return metadata.get(field, [None])[0]
- metadata = self._download_json(
+ json_metadata = self._download_json(
'http://archive.org/details/' + video_id, video_id, query={
'output': 'json',
- })['metadata']
+ }, fatal=False)
+ metadata = (json_metadata.get('metadata', {})
+ if isinstance(json_metadata, dict)
+ else {})
info.update({
'title': get_optional(metadata, 'title') or info.get('title'),
'description': clean_html(get_optional(metadata, 'description')),
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index f6204692a81..4149f4dc5f9 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1926,32 +1926,55 @@ def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
return n.attrib[key]
-def get_element_by_id(id, html):
- """Return the content of the tag with the specified ID in the passed HTML document"""
- return get_element_by_attribute('id', id, html)
+def get_element_by_id(id, html, include_tag=False):
+ """
+ Return the content of the tag with the specified ID in the passed HTML document.
+
+ The whole element, including its tag, is returned when `include_tag` is `True`.
+ """
+ return get_element_by_attribute('id', id, html, include_tag)
+
+def get_element_by_class(class_name, html, include_tag=False):
+ """
+ Return the content of the first tag with the specified class in the passed HTML document.
-def get_element_by_class(class_name, html):
- """Return the content of the first tag with the specified class in the passed HTML document"""
- retval = get_elements_by_class(class_name, html)
+ The whole element, including its tag, is returned when `include_tag` is `True`.
+ """
+ retval = get_elements_by_class(class_name, html, include_tag)
return retval[0] if retval else None
-def get_element_by_attribute(attribute, value, html, escape_value=True):
- retval = get_elements_by_attribute(attribute, value, html, escape_value)
+def get_element_by_attribute(attribute, value, html, escape_value=True,
+ include_tag=False):
+ """
+ Return the content of the first tag with the specified attribute in the passed HTML document.
+
+ The whole element, including its tag, is returned when `include_tag` is `True`.
+ """
+ retval = get_elements_by_attribute(attribute, value, html, escape_value,
+ include_tag)
return retval[0] if retval else None
-def get_elements_by_class(class_name, html):
- """Return the content of all tags with the specified class in the passed HTML document as a list"""
+def get_elements_by_class(class_name, html, include_tag=False):
+ """
+ Return the content of all tags with the specified class in the passed HTML document as a list.
+
+ The whole of each element, including its tag, is returned when `include_tag` is `True`.
+ """
return get_elements_by_attribute(
- 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
- html, escape_value=False)
+ 'class', r'[^\'"]*(?
+ \s*(?:\/\s*>|>
(?P.*?)
- \1>
+ \1>)
''' % (re.escape(attribute), value), html):
- res = m.group('content')
+ res = m.group(0) if include_tag else m.group('content')
+ if res is None:
+ continue
if res.startswith('"') or res.startswith("'"):
res = res[1:-1]
@@ -1981,7 +2006,10 @@ def __init__(self):
compat_HTMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
- self.attrs = dict(attrs)
+ # Make sure we're looking at the first attributes. Later ones are from
+ # embedded elements.
+ if not self.attrs:
+ self.attrs = dict(attrs)
def extract_attributes(html_element):