Skip to content

Commit

Permalink
[bot] AutoMerging: merge all upstream's changes:
Browse files Browse the repository at this point in the history
* https://github.com/ytdl-org/youtube-dl:
  [InfoExtractor] Misc yt-dlp back-ports, etc * add _yes_playlist() method * avoid crash using _NETRC_MACHINE * use _search_json() in _search_nextjs_data() * _search_nextjs_data() default is JSON, not text * test for above
  [compat] Avoid type comparison in `compat_ord` NB This isn't actually a compat fn; it should be utils.int_from_int_or_char
  [utils] Split out traversal.py dummy and traversal tests
  [compat] Improve compat_etree_iterfind for Py2.6 Adapted from https://raw.githubusercontent.com/python/cpython/2.7/Lib/xml/etree/ElementPath.py
  [utils] Update traverse_obj() from yt-dlp * remove `is_user_input` option per yt-dlp/yt-dlp#8673 * support traversal of compat_xml_etree_ElementTree_Element per yt-dlp/yt-dlp#8911 * allow un/branching using all and any per yt-dlp/yt-dlp#9571 * support traversal of compat_cookies.Morsel and multiple types in `set()` keys per yt-dlp/yt-dlp#9577 thx Grub4k for these * also, move traversal tests to a separate class * allow for unordered dicts in tests for Py<3.7
  • Loading branch information
github-actions[bot] committed May 30, 2024
2 parents 19f39f2 + 2192474 commit 26724e4
Show file tree
Hide file tree
Showing 7 changed files with 856 additions and 412 deletions.
3 changes: 3 additions & 0 deletions test/test_InfoExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ def test_search_nextjs_data(self):
'''
search = self.ie._search_nextjs_data(html, 'testID')
self.assertEqual(search['props']['pageProps']['video']['id'], 'testid')
search = self.ie._search_nextjs_data(
'no next.js data here, move along', 'testID', default={'status': 0})
self.assertEqual(search['status'], 0)

def test_search_nuxt_data(self):
html = '''
Expand Down
509 changes: 509 additions & 0 deletions test/test_traversal.py

Large diffs are not rendered by default.

362 changes: 0 additions & 362 deletions test/test_utils.py

Large diffs are not rendered by default.

219 changes: 217 additions & 2 deletions youtube_dl/compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2719,8 +2719,222 @@ def compat_xpath(xpath):
if isinstance(xpath, compat_str):
xpath = xpath.encode('ascii')
return xpath

# further code below based on CPython 2.7 source
import functools

# Splits a path expression into (special, tag) pairs: group (1) captures
# quoted strings and navigation specials, group (2) captures tag names.
_xpath_tokenizer_re = re.compile(r'''(?x)
    (                                   # (1)
        '[^']*'|"[^"]*"|                # quoted strings, or
        ::|//?|\.\.|\(\)|[/.*:[\]()@=]  # navigation specials
    )|                                  # or (2)
    ((?:\{[^}]+\})?[^/[\]()@=\s]+)|     # token: optional {ns}, no specials
    \s+                                 # or white space
''')


def _xpath_tokenizer(pattern, namespaces=None):
    """Yield (special, tag) token pairs for the path expression `pattern`.

    A tag written as `prefix:name` is rewritten to `{uri}name` using the
    `namespaces` prefix map.

    Raises SyntaxError if a prefix is used that `namespaces` does not
    define (including when no map was supplied at all).
    """
    for token in _xpath_tokenizer_re.findall(pattern):
        tag = token[1]
        if tag and tag[0] != "{" and ":" in tag:
            # split first, so that `prefix` is always bound when the
            # error message below is formatted
            prefix, uri = tag.split(":", 1)
            try:
                if not namespaces:
                    raise KeyError
                yield token[0], "{%s}%s" % (namespaces[prefix], uri)
            except KeyError:
                raise SyntaxError("prefix %r not found in prefix map" % prefix)
        else:
            yield token

def _get_parent_map(context):
    """Return the child -> parent mapping for the tree under `context.root`.

    The mapping is built on first use and stored on `context.parent_map`,
    so later calls with the same context return the same dict.
    """
    mapping = context.parent_map
    if mapping is not None:
        return mapping
    mapping = {}
    context.parent_map = mapping
    mapping.update(
        (child, parent)
        for parent in context.root.getiterator()
        for child in parent)
    return mapping

def _select(context, result, filter_fn=lambda *_: True):
    """Yield each child of each item in `result` accepted by `filter_fn`.

    `filter_fn` is called as filter_fn(child, parent); the default accepts
    every child.  `context` is not used here.
    """
    for parent in result:
        accepted = (child for child in parent if filter_fn(child, parent))
        for child in accepted:
            yield child

def _prepare_child(next_, token):
    """Build the selector step for a plain tag name.

    The returned callable takes (context, result) and yields the direct
    children of each item in `result` whose tag equals token[1].
    """
    wanted = token[1]

    def select(context, result):
        for parent in result:
            for child in parent:
                if child.tag == wanted:
                    yield child
    return select

def _prepare_star(next_, token):
    """Build the selector step for "*": every direct child is selected."""
    # _select's default filter accepts all children, so it is the step itself
    return _select

def _prepare_self(next_, token):
    """Build the selector step for ".": the current items pass through."""
    def select(_, result):
        return (item for item in result)
    return select

def _prepare_descendant(next_, token):
    """Build the selector step for "//".

    The token after "//" is consumed from `next_`; it must be "*" or a
    plain tag name, otherwise SyntaxError is raised.  The returned
    callable yields, for each item in `result`, everything produced by
    item.getiterator(tag) apart from the item itself.
    """
    following = next(next_)
    kind = following[0]
    if kind == "*":
        tag = "*"
    elif kind:
        raise SyntaxError("invalid descendant")
    else:
        tag = following[1]

    def select(context, result):
        for ancestor in result:
            for node in ancestor.getiterator(tag):
                if node is ancestor:
                    continue
                yield node
    return select

def _prepare_parent(next_, token):
    """Build the selector step for "..".

    The returned callable yields the parent of each item in `result`,
    each parent at most once; items with no recorded parent are skipped.
    """
    def select(context, result):
        # FIXME: raise error if .. is applied at toplevel?
        parents = _get_parent_map(context)
        seen = {}
        for child in result:
            try:
                parent = parents[child]
            except KeyError:
                continue
            if parent in seen:
                continue
            seen[parent] = None
            yield parent
    return select

def _prepare_predicate(next_, token):
    """Build the selector step for a "[...]" predicate.

    Tokens are consumed from `next_` up to the closing "]" and reduced to
    a signature string that identifies the predicate form:
    [@attribute], [@attribute='value'], [tag], [tag='value'], [index],
    [last()] or [last()-index].

    Returns a callable taking (context, result) that yields the items of
    `result` satisfying the predicate.

    Raises SyntaxError if the predicate is not closed by "]" or is not
    one of the supported forms.
    """
    signature = []
    predicate = []
    for token in next_:
        if token[0] == "]":
            break
        if token[0] and token[0][:1] in "'\"":
            # quoted string: normalise the signature char, strip the quotes
            token = "'", token[0][1:-1]
        signature.append(token[0] or "-")
        predicate.append(token[1])
    else:
        # tokens ran out before the closing "]" was seen
        raise SyntaxError("invalid predicate")

    def select(context, result, filter_fn=lambda _: True):
        for elem in result:
            if filter_fn(elem):
                yield elem

    signature = "".join(signature)
    # use signature to determine predicate type
    if signature == "@-":
        # [@attribute] predicate
        key = predicate[1]
        return functools.partial(
            select, filter_fn=lambda el: el.get(key) is not None)
    if signature == "@-='":
        # [@attribute='value']
        key = predicate[1]
        value = predicate[-1]
        return functools.partial(
            select, filter_fn=lambda el: el.get(key) == value)
    if signature == "-" and not re.match(r"\d+$", predicate[0]):
        # [tag]
        tag = predicate[0]
        return functools.partial(
            select, filter_fn=lambda el: el.find(tag) is not None)
    if signature == "-='" and not re.match(r"\d+$", predicate[0]):
        # [tag='value']
        tag = predicate[0]
        value = predicate[-1]

        def itertext(el):
            # Stand-in for Element.itertext(): inner text in document
            # order, including the tail text that follows each subelement
            if callable(el.tag):
                # comments and processing instructions carry a factory
                # function as their tag; they contribute no text
                return
            if el.text:
                yield el.text
            for child in el:
                for text in itertext(child):
                    yield text
                if child.tail:
                    yield child.tail

        def select_by_text(context, result):
            for elem in result:
                for e in elem.findall(tag):
                    if "".join(itertext(e)) == value:
                        yield elem
                        break
        return select_by_text
    if signature == "-" or signature == "-()" or signature == "-()-":
        # [index] or [last()] or [last()-index]
        if signature == "-":
            index = int(predicate[0]) - 1
        else:
            if predicate[0] != "last":
                raise SyntaxError("unsupported function")
            if signature == "-()-":
                try:
                    index = int(predicate[2]) - 1
                except ValueError:
                    raise SyntaxError("unsupported expression")
            else:
                index = -1

        def select_by_index(context, result):
            parent_map = _get_parent_map(context)
            for elem in result:
                try:
                    parent = parent_map[elem]
                    # FIXME: what if the selector is "*" ?
                    elems = list(parent.findall(elem.tag))
                    if elems[index] is elem:
                        yield elem
                except (IndexError, KeyError):
                    pass
        return select_by_index
    raise SyntaxError("invalid predicate")

# Dispatch table used by compat_etree_iterfind(): the first field of a token
# selects the factory that builds the corresponding selector step.  Each
# factory is called as factory(token_iterator, token).
ops = {
"": _prepare_child,
"*": _prepare_star,
".": _prepare_self,
"..": _prepare_parent,
"//": _prepare_descendant,
"[": _prepare_predicate,
}

# Compiled selectors memoised by compat_etree_iterfind(), which empties the
# cache once it holds more than 100 entries.
_cache = {}

class _SelectorContext:
    """Per-search state handed to every selector step."""

    # child -> parent mapping; stays None until _get_parent_map() fills it
    parent_map = None

    def __init__(self, root):
        # element the search was started from
        self.root = root

##
# Generate all matching objects.

def compat_etree_iterfind(elem, path, namespaces=None):
    """Return an iterable of the elements under `elem` that match `path`.

    `namespaces` is an optional prefix -> URI map used to resolve
    `prefix:tag` names in `path`.

    Raises SyntaxError for an absolute path (leading "/") or for a path
    that cannot be compiled.
    """
    # compile selector pattern
    if path[-1:] == "/":
        path = path + "*"  # implicit all (FIXME: keep this?)
    # A compiled selector has the namespace URIs baked in, so the prefix
    # map has to be part of the cache key as well as the path
    cache_key = (
        path,
        None if namespaces is None else tuple(sorted(namespaces.items())))
    try:
        selector = _cache[cache_key]
    except KeyError:
        if len(_cache) > 100:
            _cache.clear()
        if path[:1] == "/":
            raise SyntaxError("cannot use absolute path on element")
        tokens = _xpath_tokenizer(path, namespaces)
        selector = []
        for token in tokens:
            if token[0] == "/":
                continue
            try:
                # a factory may consume further tokens from the iterator
                selector.append(ops[token[0]](tokens, token))
            except StopIteration:
                raise SyntaxError("invalid path")
        _cache[cache_key] = selector
    # execute selector pattern
    result = [elem]
    context = _SelectorContext(elem)
    for select in selector:
        result = select(context, result)
    return result

# end of code based on CPython 2.7 source


else:
compat_xpath = lambda xpath: xpath
# Accept the optional prefix map so that this native wrapper can be called
# the same way as the fallback compat_etree_iterfind(elem, path, namespaces)
compat_etree_iterfind = lambda element, match, namespaces=None: element.iterfind(match, namespaces)


compat_os_name = os._name if os.name == 'java' else os.name
Expand Down Expand Up @@ -2756,7 +2970,7 @@ def compat_shlex_split(s, comments=False, posix=True):


def compat_ord(c):
    """Return `c` unchanged if it is already an int, else ord(c).

    Lets callers handle items of both byte strings that index to ints and
    strings that index to one-character strings.
    """
    # isinstance() rather than an exact type comparison, so that int
    # subclasses are passed through too
    if isinstance(c, int):
        return c
    return ord(c)
Expand Down Expand Up @@ -2955,7 +3169,7 @@ def __enter__(self):
return self

def __exit__(self, exc_type, exc_val, exc_tb):
    """Return True (suppressing the exception) if its class is one of
    self._exceptions, otherwise False."""
    # Test the exception class, not the value: exc_type is None only when
    # the block exited normally, whereas exc_val is presumably not always
    # an instance of exc_type on old Pythons -- TODO confirm
    return exc_type is not None and issubclass(exc_type, self._exceptions or tuple())


# subprocess.Popen context manager
Expand Down Expand Up @@ -3308,6 +3522,7 @@ def compat_datetime_timedelta_total_seconds(td):
'compat_contextlib_suppress',
'compat_ctypes_WINFUNCTYPE',
'compat_etree_fromstring',
'compat_etree_iterfind',
'compat_filter',
'compat_get_terminal_size',
'compat_getenv',
Expand Down
63 changes: 47 additions & 16 deletions youtube_dl/extractor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1169,18 +1169,18 @@ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=Tr
def _get_netrc_login_info(self, netrc_machine=None):
username = None
password = None
netrc_machine = netrc_machine or self._NETRC_MACHINE

if self._downloader.params.get('usenetrc', False):
try:
netrc_machine = netrc_machine or self._NETRC_MACHINE
info = netrc.netrc().authenticators(netrc_machine)
if info is not None:
username = info[0]
password = info[2]
else:
raise netrc.NetrcParseError(
'No authenticators for %s' % netrc_machine)
except (IOError, netrc.NetrcParseError) as err:
except (AttributeError, IOError, netrc.NetrcParseError) as err:
self._downloader.report_warning(
'parsing .netrc: %s' % error_to_compat_str(err))

Expand Down Expand Up @@ -1490,14 +1490,18 @@ def extract_video_object(e):
return dict((k, v) for k, v in info.items() if v is not None)

def _search_nextjs_data(self, webpage, video_id, **kw):
    """Find the __NEXT_DATA__ <script> element in `webpage` and return
    whatever self._search_json() extracts from it.

    Keyword arguments are forwarded to self._search_json(); the expected
    ones are transform_source, fatal and default.  A legacy default of
    the string '{}' is translated to an empty dict.
    """
    # ..., *, transform_source=None, fatal=True, default=NO_DEFAULT

    # TODO: remove this backward compat
    default = kw.get('default', NO_DEFAULT)
    if default == '{}':
        kw['default'] = {}
    # normalise the (possibly modified) mapping before it is expanded
    # with ** below
    kw = compat_kwargs(kw)

    return self._search_json(
        r'''<script\s[^>]*?\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>''',
        webpage, 'next.js data', video_id, end_pattern='</script>',
        **kw)

def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
Expand Down Expand Up @@ -3296,12 +3300,16 @@ def _merge_subtitle_items(subtitle_list1, subtitle_list2):
return ret

@classmethod
def _merge_subtitles(cls, subtitle_dict1, *subtitle_dicts, **kwargs):
    """ Merge subtitle dictionaries, language by language.

    Without `target`, a copy of subtitle_dict1 is made and the remaining
    dictionaries are merged into it.  With the keyword-only `target`
    (any dict, including an empty one), every dictionary given is merged
    into `target` in place.  The merged dict is returned.
    """

    # ..., * , target=None
    target = kwargs.get('target')
    if target is None:
        target = dict(subtitle_dict1)
    else:
        # compare with None rather than truthiness so that an empty
        # target is still filled in place, and merge the first dict too
        subtitle_dicts = (subtitle_dict1, ) + subtitle_dicts

    for subtitle_dict in subtitle_dicts:
        for lang in subtitle_dict:
            target[lang] = cls._merge_subtitle_items(target.get(lang, []), subtitle_dict[lang])
    return target

def extract_automatic_captions(self, *args, **kwargs):
if (self._downloader.params.get('writeautomaticsub', False)
Expand Down Expand Up @@ -3334,6 +3342,29 @@ def _generic_id(self, url):
def _generic_title(self, url):
return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])

def _yes_playlist(self, playlist_id, video_id, *args, **kwargs):
    """Decide whether to process the playlist (True) or only the video (False).

    If either ID is missing the answer is `not video_id`.  Otherwise a
    'force_noplaylist' entry in the smuggled data decides; failing that,
    the 'noplaylist' param decides and the choice is reported with
    self.to_screen().
    """
    # smuggled_data=None, *, playlist_label='playlist', video_label='video'
    if len(args) == 1:
        smuggled_data = args[0]
    else:
        smuggled_data = kwargs.get('smuggled_data')
    playlist_label = kwargs.get('playlist_label', 'playlist')
    video_label = kwargs.get('video_label', 'video')

    if not (playlist_id and video_id):
        return not video_id

    forced = (smuggled_data or {}).get('force_noplaylist')
    if forced is not None:
        return not forced

    # an ID of True means "present but unnamed": nothing is appended
    video_suffix = '' if video_id is True else ' ' + video_id
    noplaylist = self.get_param('noplaylist')
    if noplaylist:
        message = 'Downloading just the {0}{1} because of --no-playlist'.format(
            video_label, video_suffix)
    else:
        playlist_suffix = '' if playlist_id is True else ' ' + playlist_id
        message = 'Downloading {0}{1} - add --no-playlist to download just the {2}{3}'.format(
            playlist_label, playlist_suffix, video_label, video_suffix)
    self.to_screen(message)
    return not noplaylist


class SearchInfoExtractor(InfoExtractor):
"""
Expand Down
10 changes: 10 additions & 0 deletions youtube_dl/traversal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# coding: utf-8

# TODO: move these utils.fns here and move import to utils
# flake8: noqa
from .utils import (
dict_get,
get_first,
T,
traverse_obj,
)
Loading

0 comments on commit 26724e4

Please sign in to comment.