[utils,ie] Misc yt-dlp back-ports, etc #32799

Merged · 5 commits · May 30, 2024
3 changes: 3 additions & 0 deletions test/test_InfoExtractor.py
@@ -153,6 +153,9 @@ def test_search_nextjs_data(self):
'''
search = self.ie._search_nextjs_data(html, 'testID')
self.assertEqual(search['props']['pageProps']['video']['id'], 'testid')
search = self.ie._search_nextjs_data(
'no next.js data here, move along', 'testID', default={'status': 0})
self.assertEqual(search['status'], 0)

def test_search_nuxt_data(self):
html = '''
509 changes: 509 additions & 0 deletions test/test_traversal.py

Large diffs are not rendered by default.

362 changes: 0 additions & 362 deletions test/test_utils.py

Large diffs are not rendered by default.

219 changes: 217 additions & 2 deletions youtube_dl/compat.py
@@ -2719,8 +2719,222 @@ def compat_xpath(xpath):
if isinstance(xpath, compat_str):
xpath = xpath.encode('ascii')
return xpath

# further code below based on CPython 2.7 source
import functools

_xpath_tokenizer_re = re.compile(r'''(?x)
( # (1)
'[^']*'|"[^"]*"| # quoted strings, or
::|//?|\.\.|\(\)|[/.*:[\]()@=] # navigation specials
)| # or (2)
((?:\{[^}]+\})?[^/[\]()@=\s]+)| # token: optional {ns}, no specials
\s+ # or white space
''')

def _xpath_tokenizer(pattern, namespaces=None):
for token in _xpath_tokenizer_re.findall(pattern):
tag = token[1]
if tag and tag[0] != "{" and ":" in tag:
try:
prefix, uri = tag.split(":", 1)
if not namespaces:
raise KeyError
yield token[0], "{%s}%s" % (namespaces[prefix], uri)
except KeyError:
raise SyntaxError("prefix %r not found in prefix map" % prefix)
else:
yield token

def _get_parent_map(context):
parent_map = context.parent_map
if parent_map is None:
context.parent_map = parent_map = {}
for p in context.root.getiterator():
for e in p:
parent_map[e] = p
return parent_map

def _select(context, result, filter_fn=lambda *_: True):
for elem in result:
for e in elem:
if filter_fn(e, elem):
yield e

def _prepare_child(next_, token):
tag = token[1]
return functools.partial(_select, filter_fn=lambda e, _: e.tag == tag)

def _prepare_star(next_, token):
return _select

def _prepare_self(next_, token):
return lambda _, result: (e for e in result)

def _prepare_descendant(next_, token):
token = next(next_)
if token[0] == "*":
tag = "*"
elif not token[0]:
tag = token[1]
else:
raise SyntaxError("invalid descendant")

def select(context, result):
for elem in result:
for e in elem.getiterator(tag):
if e is not elem:
yield e
return select

def _prepare_parent(next_, token):
def select(context, result):
# FIXME: raise error if .. is applied at toplevel?
parent_map = _get_parent_map(context)
result_map = {}
for elem in result:
if elem in parent_map:
parent = parent_map[elem]
if parent not in result_map:
result_map[parent] = None
yield parent
return select

def _prepare_predicate(next_, token):
signature = []
predicate = []
for token in next_:
if token[0] == "]":
break
if token[0] and token[0][:1] in "'\"":
token = "'", token[0][1:-1]
signature.append(token[0] or "-")
predicate.append(token[1])

def select(context, result, filter_fn=lambda _: True):
for elem in result:
if filter_fn(elem):
yield elem

signature = "".join(signature)
# use signature to determine predicate type
if signature == "@-":
# [@attribute] predicate
key = predicate[1]
return functools.partial(
select, filter_fn=lambda el: el.get(key) is not None)
if signature == "@-='":
# [@attribute='value']
key = predicate[1]
value = predicate[-1]
return functools.partial(
select, filter_fn=lambda el: el.get(key) == value)
if signature == "-" and not re.match(r"\d+$", predicate[0]):
# [tag]
tag = predicate[0]
return functools.partial(
select, filter_fn=lambda el: el.find(tag) is not None)
if signature == "-='" and not re.match(r"\d+$", predicate[0]):
# [tag='value']
tag = predicate[0]
value = predicate[-1]

def itertext(el):
for e in el.getiterator():
e = e.text
if e:
yield e

def select(context, result):
for elem in result:
for e in elem.findall(tag):
if "".join(itertext(e)) == value:
yield elem
break
return select
if signature == "-" or signature == "-()" or signature == "-()-":
# [index] or [last()] or [last()-index]
if signature == "-":
index = int(predicate[0]) - 1
else:
if predicate[0] != "last":
raise SyntaxError("unsupported function")
if signature == "-()-":
try:
index = int(predicate[2]) - 1
except ValueError:
raise SyntaxError("unsupported expression")
else:
index = -1

def select(context, result):
parent_map = _get_parent_map(context)
for elem in result:
try:
parent = parent_map[elem]
# FIXME: what if the selector is "*" ?
elems = list(parent.findall(elem.tag))
if elems[index] is elem:
yield elem
except (IndexError, KeyError):
pass
return select
raise SyntaxError("invalid predicate")

ops = {
"": _prepare_child,
"*": _prepare_star,
".": _prepare_self,
"..": _prepare_parent,
"//": _prepare_descendant,
"[": _prepare_predicate,
}

_cache = {}

class _SelectorContext:
parent_map = None

def __init__(self, root):
self.root = root

##
# Generate all matching objects.

def compat_etree_iterfind(elem, path, namespaces=None):
# compile selector pattern
if path[-1:] == "/":
path = path + "*" # implicit all (FIXME: keep this?)
try:
selector = _cache[path]
except KeyError:
if len(_cache) > 100:
_cache.clear()
if path[:1] == "/":
raise SyntaxError("cannot use absolute path on element")
tokens = _xpath_tokenizer(path, namespaces)
selector = []
for token in tokens:
if token[0] == "/":
continue
try:
selector.append(ops[token[0]](tokens, token))
except StopIteration:
raise SyntaxError("invalid path")
_cache[path] = selector
# execute selector pattern
result = [elem]
context = _SelectorContext(elem)
for select in selector:
result = select(context, result)
return result

# end of code based on CPython 2.7 source


else:
compat_xpath = lambda xpath: xpath
compat_etree_iterfind = lambda element, match: element.iterfind(match)


compat_os_name = os._name if os.name == 'java' else os.name
@@ -2756,7 +2970,7 @@ def compat_shlex_split(s, comments=False, posix=True):


def compat_ord(c):
if type(c) is int:
if isinstance(c, int):
return c
else:
return ord(c)
@@ -2955,7 +3169,7 @@ def __enter__(self):
return self

def __exit__(self, exc_type, exc_val, exc_tb):
return exc_val is not None and isinstance(exc_val, self._exceptions or tuple())
return exc_type is not None and issubclass(exc_type, self._exceptions or tuple())


# subprocess.Popen context manager
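For context, a minimal usage sketch of compat_contextlib_suppress (illustrative only, not part of the diff; the parsed literal is made up):

from youtube_dl.compat import compat_contextlib_suppress

with compat_contextlib_suppress(ValueError, TypeError):
    int('not a number')  # raises ValueError, which the context manager swallows
# execution continues here; exception types not listed above still propagate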
@@ -3308,6 +3522,7 @@ def compat_datetime_timedelta_total_seconds(td):
'compat_contextlib_suppress',
'compat_ctypes_WINFUNCTYPE',
'compat_etree_fromstring',
'compat_etree_iterfind',
'compat_filter',
'compat_get_terminal_size',
'compat_getenv',
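A brief usage sketch of the backported compat_etree_iterfind (illustrative only; the XML document below is made up):

from youtube_dl.compat import compat_etree_fromstring, compat_etree_iterfind

doc = compat_etree_fromstring('<root><a><b attr="x">1</b></a><a><b>2</b></a></root>')
# iterate over every <b> descendant that carries an "attr" attribute
for el in compat_etree_iterfind(doc, './/b[@attr]'):
    print(el.text)  # prints: 1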
63 changes: 47 additions & 16 deletions youtube_dl/extractor/common.py
@@ -1169,18 +1169,18 @@ def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=Tr
def _get_netrc_login_info(self, netrc_machine=None):
username = None
password = None
netrc_machine = netrc_machine or self._NETRC_MACHINE

if self._downloader.params.get('usenetrc', False):
try:
netrc_machine = netrc_machine or self._NETRC_MACHINE
info = netrc.netrc().authenticators(netrc_machine)
if info is not None:
username = info[0]
password = info[2]
else:
raise netrc.NetrcParseError(
'No authenticators for %s' % netrc_machine)
except (IOError, netrc.NetrcParseError) as err:
except (AttributeError, IOError, netrc.NetrcParseError) as err:
self._downloader.report_warning(
'parsing .netrc: %s' % error_to_compat_str(err))
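A short sketch of what this helper resolves (illustrative only; the machine name and credentials are made up and the call runs inside an extractor method):

# with --netrc enabled and a ~/.netrc line such as
#   machine example login user@example.com password hunter2
# the helper returns the credentials for the extractor's netrc machine
username, password = self._get_netrc_login_info('example')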

@@ -1490,14 +1490,18 @@ def extract_video_object(e):
return dict((k, v) for k, v in info.items() if v is not None)

def _search_nextjs_data(self, webpage, video_id, **kw):
nkw = dict((k, v) for k, v in kw.items() if k in ('transform_source', 'fatal'))
kw.pop('transform_source', None)
next_data = self._search_regex(
r'''<script[^>]+\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>(?P<nd>[^<]+)</script>''',
webpage, 'next.js data', group='nd', **kw)
if not next_data:
return {}
return self._parse_json(next_data, video_id, **nkw)
# ..., *, transform_source=None, fatal=True, default=NO_DEFAULT

# TODO: remove this backward compat
default = kw.get('default', NO_DEFAULT)
if default == '{}':
kw['default'] = {}
kw = compat_kwargs(kw)

return self._search_json(
r'''<script\s[^>]*?\bid\s*=\s*('|")__NEXT_DATA__\1[^>]*>''',
webpage, 'next.js data', video_id, end_pattern='</script>',
**kw)

def _search_nuxt_data(self, webpage, video_id, *args, **kwargs):
"""Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function"""
@@ -3296,12 +3300,16 @@ def _merge_subtitle_items(subtitle_list1, subtitle_list2):
return ret

@classmethod
def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
""" Merge two subtitle dictionaries, language by language. """
ret = dict(subtitle_dict1)
for lang in subtitle_dict2:
ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
return ret
def _merge_subtitles(cls, subtitle_dict1, *subtitle_dicts, **kwargs):
""" Merge subtitle dictionaries, language by language. """

# ..., * , target=None
target = kwargs.get('target') or dict(subtitle_dict1)

for subtitle_dict in subtitle_dicts:
for lang in subtitle_dict:
target[lang] = cls._merge_subtitle_items(target.get(lang, []), subtitle_dict[lang])
return target

def extract_automatic_captions(self, *args, **kwargs):
if (self._downloader.params.get('writeautomaticsub', False)
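A small sketch of the widened _merge_subtitles signature (illustrative data only, not from the diff): any number of subtitle dicts can now be merged in a single call, language by language.

from youtube_dl.extractor.common import InfoExtractor

subs_a = {'en': [{'ext': 'vtt', 'url': 'https://example.com/en-1.vtt'}]}
subs_b = {'en': [{'ext': 'vtt', 'url': 'https://example.com/en-2.vtt'}],
          'de': [{'ext': 'vtt', 'url': 'https://example.com/de-1.vtt'}]}

# merged language by language; items with distinct URLs are kept
merged = InfoExtractor._merge_subtitles(subs_a, subs_b)
assert sorted(merged) == ['de', 'en'] and len(merged['en']) == 2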
@@ -3334,6 +3342,29 @@ def _generic_id(self, url):
def _generic_title(self, url):
return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])

def _yes_playlist(self, playlist_id, video_id, *args, **kwargs):
# smuggled_data=None, *, playlist_label='playlist', video_label='video'
smuggled_data = args[0] if len(args) == 1 else kwargs.get('smuggled_data')
playlist_label = kwargs.get('playlist_label', 'playlist')
video_label = kwargs.get('video_label', 'video')

if not playlist_id or not video_id:
return not video_id

no_playlist = (smuggled_data or {}).get('force_noplaylist')
if no_playlist is not None:
return not no_playlist

video_id = '' if video_id is True else ' ' + video_id
noplaylist = self.get_param('noplaylist')
self.to_screen(
'Downloading just the {0}{1} because of --no-playlist'.format(video_label, video_id)
if noplaylist else
'Downloading {0}{1} - add --no-playlist to download just the {2}{3}'.format(
playlist_label, '' if playlist_id is True else ' ' + playlist_id,
video_label, video_id))
return not noplaylist


class SearchInfoExtractor(InfoExtractor):
"""
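A hedged sketch of how the new _yes_playlist helper is typically called from an extractor's _real_extract (the surrounding extractor and its _extract_playlist/_extract_video helpers are hypothetical):

import re

from youtube_dl.utils import unsmuggle_url


def _real_extract(self, url):  # body of a hypothetical extractor method
    url, smuggled_data = unsmuggle_url(url, {})
    mobj = re.match(self._VALID_URL, url)
    playlist_id, video_id = mobj.group('playlist_id'), mobj.group('id')

    if self._yes_playlist(playlist_id, video_id, smuggled_data):
        # default: download the whole playlist (unless --no-playlist was given)
        return self._extract_playlist(playlist_id)
    return self._extract_video(video_id)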
10 changes: 10 additions & 0 deletions youtube_dl/traversal.py
@@ -0,0 +1,10 @@
# coding: utf-8

# TODO: move these utils.fns here and move import to utils
# flake8: noqa
from .utils import (
dict_get,
get_first,
T,
traverse_obj,
)
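The new module only re-exports the traversal helpers for now; a minimal sketch of what they do (the data below is made up):

from youtube_dl.traversal import T, traverse_obj

data = {'items': [{'id': 'a', 'duration': '120'}, {'id': None}]}

# branch over the list with Ellipsis and collect every non-None 'id'
ids = traverse_obj(data, ('items', Ellipsis, 'id'))  # ['a']
# apply a transform to the matched value with T(...)
duration = traverse_obj(data, ('items', 0, 'duration', T(int)))  # 120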