From f33e312c58c2eebaa5fbd4bcaac5dbe32ddfe156 Mon Sep 17 00:00:00 2001 From: df Date: Sun, 10 Oct 2021 12:42:51 +0100 Subject: [PATCH] Detect extension from any RFC Content-Disposition syntax Add support for unquoted token and RFC 5987 extended parameter syntax --- test/test_utils.py | 21 +++++++++++++++++++++ youtube_dl/utils.py | 20 +++++++++++++++++--- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 14607f6b8cb..a798d8553d8 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1505,6 +1505,27 @@ def headers(self): 'Content-Type': b'audio/mp3', }) self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition and unquoted filename + urlh = UrlHandle({ + 'Content-Disposition': b'attachment; filename=unquoted_filename_token.mp3', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition including spacing and uppercase + urlh = UrlHandle({ + 'Content-Disposition': b'ATTACHMENT; FileName = unquoted_filename_token.mp3', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition and extended filename parameter syntax + urlh = UrlHandle({ + 'Content-Disposition': b"attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3", + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with Content-Disposition and both filename parameter syntaxes + urlh = UrlHandle({ + 'Content-Disposition': b'''attachment; filename="should ignore.mp4"; + FileName* = iso8859-15''costs%201%A4%20filename.mp3''', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') if __name__ == '__main__': diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 90eb9f93c44..85469cf276e 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -58,9 +58,10 @@ compat_struct_unpack, compat_urllib_error, compat_urllib_parse, + compat_urllib_parse_unquote, + compat_urllib_parse_unquote_plus, 
compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, - compat_urllib_parse_unquote_plus, compat_urllib_request, compat_urlparse, compat_xpath, @@ -4309,9 +4310,22 @@ def encode_compat_str_or_none(x, encoding='iso-8859-1', errors='ignore'): cd = encode_compat_str_or_none(getheader('Content-Disposition')) if cd: - m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) + m = re.match(r'''(?xi) + attachment;\s* + (?:filename\s*=[^;]+?;\s*)? # possible initial filename=...;, ignored + filename(?P<x>\*)?\s*=\s* # filename/filename* = + (?(x)(?P<charset>\S+?)'[\w-]*'|(?P<q>")?) # if * then charset'...' else maybe " + (?P<filename>(?(q)[^"]+(?=")|\S+)) # actual name of file + ''', cd) if m: - e = determine_ext(m.group('filename'), default_ext=None) + m = m.groupdict() + filename = m.get('filename') + if m.get('x'): + try: + filename = compat_urllib_parse_unquote(filename, encoding=m.get('charset', 'utf-8')) + except LookupError: # unrecognised character set name + pass + e = determine_ext(filename, default_ext=None) if e: return e