Skip to content

Commit

Permalink
Detect extension from any RFC Content-Disposition syntax
Browse files: browse the repository at this point in the history
Add support for unquoted token and RFC 5987 extended parameter syntax
  • Loading branch information
dirkf committed Oct 10, 2021
1 parent f798b40 commit f33e312
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 3 deletions.
21 changes: 21 additions & 0 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1505,6 +1505,27 @@ def headers(self):
'Content-Type': b'audio/mp3',
})
self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
# header with Content-Disposition and unquoted filename
urlh = UrlHandle({
'Content-Disposition': b'attachment; filename=unquoted_filename_token.mp3',
})
self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
# header with Content-Disposition including spacing and uppercase
urlh = UrlHandle({
'Content-Disposition': b'ATTACHMENT; FileName = unquoted_filename_token.mp3',
})
self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
# header with Content-Disposition and extended filename parameter syntax
urlh = UrlHandle({
'Content-Disposition': b"attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3",
})
self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
# header with Content-Disposition and both filename parameter syntaxes
urlh = UrlHandle({
'Content-Disposition': b'''attachment; filename="should ignore.mp4";
FileName* = iso8859-15''costs%201%A4%20filename.mp3''',
})
self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')


if __name__ == '__main__':
Expand Down
20 changes: 17 additions & 3 deletions youtube_dl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,10 @@
compat_struct_unpack,
compat_urllib_error,
compat_urllib_parse,
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
compat_urllib_parse_urlencode,
compat_urllib_parse_urlparse,
compat_urllib_parse_unquote_plus,
compat_urllib_request,
compat_urlparse,
compat_xpath,
Expand Down Expand Up @@ -4309,9 +4310,22 @@ def encode_compat_str_or_none(x, encoding='iso-8859-1', errors='ignore'):

cd = encode_compat_str_or_none(getheader('Content-Disposition'))
if cd:
m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
m = re.match(r'''(?xi)
attachment;\s*
(?:filename\s*=[^;]+?;\s*)? # possible initial filename=...;, ignored
filename(?P<x>\*)?\s*=\s* # filename/filename* =
(?(x)(?P<charset>\S+?)'[\w-]*'|(?P<q>")?) # if * then charset'...' else maybe "
(?P<filename>(?(q)[^"]+(?=")|\S+)) # actual name of file
''', cd)
if m:
e = determine_ext(m.group('filename'), default_ext=None)
m = m.groupdict()
filename = m.get('filename')
if m.get('x'):
try:
filename = compat_urllib_parse_unquote(filename, encoding=m.get('charset', 'utf-8'))
except LookupError: # unrecognised character set name
pass
e = determine_ext(filename, default_ext=None)
if e:
return e

Expand Down

0 comments on commit f33e312

Please sign in to comment.