From f33e312c58c2eebaa5fbd4bcaac5dbe32ddfe156 Mon Sep 17 00:00:00 2001
From: df <fieldhouse@gmx.net>
Date: Sun, 10 Oct 2021 12:42:51 +0100
Subject: [PATCH] Detect extension from any RFC Content-Disposition syntax

Add support for unquoted token and RFC 5987 extended parameter syntax
---
 test/test_utils.py  | 21 +++++++++++++++++++++
 youtube_dl/utils.py | 20 +++++++++++++++++---
 2 files changed, 38 insertions(+), 3 deletions(-)
diff --git a/test/test_utils.py b/test/test_utils.py
index 14607f6b8cb..a798d8553d8 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1505,6 +1505,27 @@ def headers(self):
             'Content-Type': b'audio/mp3',
         })
         self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition and unquoted filename
+        urlh = UrlHandle({
+            'Content-Disposition': b'attachment; filename=unquoted_filename_token.mp3',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition including spacing and uppercase
+        urlh = UrlHandle({
+            'Content-Disposition': b'ATTACHMENT; FileName = unquoted_filename_token.mp3',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition and extended filename parameter syntax
+        urlh = UrlHandle({
+            'Content-Disposition': b"attachment; filename*=iso8859-15''costs%201%A4%20filename.mp3",
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
+        # header with Content-Disposition and both filename parameter syntaxes
+        urlh = UrlHandle({
+            'Content-Disposition': b'''attachment; filename="should ignore.mp4";
+             FileName* = iso8859-15''costs%201%A4%20filename.mp3''',
+        })
+        self.assertEqual(urlhandle_detect_ext(urlh), 'mp3')
 
 
 if __name__ == '__main__':
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 90eb9f93c44..85469cf276e 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -58,9 +58,10 @@
     compat_struct_unpack,
     compat_urllib_error,
     compat_urllib_parse,
+    compat_urllib_parse_unquote,
+    compat_urllib_parse_unquote_plus,
     compat_urllib_parse_urlencode,
     compat_urllib_parse_urlparse,
-    compat_urllib_parse_unquote_plus,
     compat_urllib_request,
     compat_urlparse,
     compat_xpath,
@@ -4309,9 +4310,22 @@ def encode_compat_str_or_none(x, encoding='iso-8859-1', errors='ignore'):
 
     cd = encode_compat_str_or_none(getheader('Content-Disposition'))
     if cd:
-        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
+        m = re.match(r'''(?xi)
+            attachment;\s*
+            (?:filename\s*=[^;]+?;\s*)?                    # possible initial filename=...;, ignored
+            filename(?P<x>\*)?\s*=\s*                      # filename/filename* =
+                (?(x)(?P<charset>\S+?)'[\w-]*'|(?P<q>")?)  # if * then charset'...' else maybe "
+                (?P<filename>(?(q)[^"]+(?=")|[^\s;]+))     # name of file; bare value ends at ; or space
+            ''', cd)
         if m:
-            e = determine_ext(m.group('filename'), default_ext=None)
+            m = m.groupdict()
+            filename = m.get('filename')
+            if m.get('x'):
+                try:
+                    filename = compat_urllib_parse_unquote(filename, encoding=m.get('charset', 'utf-8'))
+                except LookupError:  # unrecognised character set name
+                    pass
+            e = determine_ext(filename, default_ext=None)
             if e:
                 return e