From 1e222005ba6bb9c288dd3a2b777ca5664b220c81 Mon Sep 17 00:00:00 2001 From: df Date: Sun, 29 Aug 2021 05:34:20 +0100 Subject: [PATCH] Fix urlhandle_detect_ext() non-ASCII error in Py2, with test --- test/test_utils.py | 25 +++++++++++++++++++++++++ youtube_dl/utils.py | 8 ++++++-- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index 6f894579277..44a4f6ff7ef 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -105,6 +105,7 @@ cli_valueless_option, cli_bool_option, parse_codecs, + urlhandle_detect_ext, ) from youtube_dl.compat import ( compat_chr, @@ -1475,6 +1476,30 @@ def test_clean_podcast_url(self): self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + def test_urlhandle_detect_ext(self): + + class UrlHandle(object): + _info = {} + + def __init__(self, info): + self._info = info + + @property + def headers(self): + return self._info + + # header with non-ASCII character and contradictory Content-Type + urlh = UrlHandle({ + 'Content-Disposition': b'attachment; filename="Epis\xf3dio contains non-ASCI ISO 8859-1 character.mp3"', + 'Content-Type': b'audio/aac', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + # header with no Content-Disposition + urlh = UrlHandle({ + 'Content-Type': b'audio/mp3', + }) + self.assertEqual(urlhandle_detect_ext(urlh), 'mp3') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index eaf86bb441d..5dde9768daa 
100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -4288,7 +4288,10 @@ def parse_codecs(codecs_str): def urlhandle_detect_ext(url_handle): getheader = url_handle.headers.get - cd = getheader('Content-Disposition') + def encode_compat_str_or_none(x, encoding='iso-8859-1', errors='ignore'): + return encode_compat_str(x, encoding=encoding, errors=errors) if x else None + + cd = encode_compat_str_or_none(getheader('Content-Disposition')) if cd: m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) if m: @@ -4296,7 +4299,8 @@ def urlhandle_detect_ext(url_handle): if e: return e - return mimetype2ext(getheader('Content-Type')) + ct = encode_compat_str_or_none(getheader('Content-Type')) + return mimetype2ext(ct) def encode_data_uri(data, mime_type):