Skip to content

Commit

Permalink
[core] Improve HTTP redirect handling (#7094)
Browse files Browse the repository at this point in the history
Aligns HTTP redirect handling with what browsers commonly do and RFC standards. 

Fixes issues afac4ca missed.

Authored by: coletdjnz
  • Loading branch information
coletdjnz authored May 27, 2023
1 parent 66468bb commit 08916a4
Show file tree
Hide file tree
Showing 3 changed files with 283 additions and 74 deletions.
6 changes: 0 additions & 6 deletions test/test_YoutubeDL.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

import copy
import json
import urllib.error

from test.helper import FakeYDL, assertRegexpMatches
from yt_dlp import YoutubeDL
Expand Down Expand Up @@ -1097,11 +1096,6 @@ def test_selection(params, expected_ids, evaluate_all=False):
test_selection({'playlist_items': '-15::2'}, INDICES[1::2], True)
test_selection({'playlist_items': '-15::15'}, [], True)

def test_urlopen_no_file_protocol(self):
# see https://github.com/ytdl-org/youtube-dl/issues/8227
ydl = YDL()
self.assertRaises(urllib.error.URLError, ydl.urlopen, 'file:///etc/passwd')

def test_do_not_override_ie_key_in_url_transparent(self):
ydl = YDL()

Expand Down
292 changes: 262 additions & 30 deletions test/test_http.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,40 +7,163 @@

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


import gzip
import http.cookiejar
import http.server
import io
import pathlib
import ssl
import tempfile
import threading
import urllib.error
import urllib.request

from test.helper import http_server_port
from yt_dlp import YoutubeDL
from yt_dlp.utils import sanitized_Request, urlencode_postdata

from .helper import FakeYDL

TEST_DIR = os.path.dirname(os.path.abspath(__file__))


class HTTPTestRequestHandler(http.server.BaseHTTPRequestHandler):
protocol_version = 'HTTP/1.1'

def log_message(self, format, *args):
pass

def _headers(self):
payload = str(self.headers).encode('utf-8')
self.send_response(200)
self.send_header('Content-Type', 'application/json')
self.send_header('Content-Length', str(len(payload)))
self.end_headers()
self.wfile.write(payload)

def _redirect(self):
self.send_response(int(self.path[len('/redirect_'):]))
self.send_header('Location', '/method')
self.send_header('Content-Length', '0')
self.end_headers()

def _method(self, method, payload=None):
self.send_response(200)
self.send_header('Content-Length', str(len(payload or '')))
self.send_header('Method', method)
self.end_headers()
if payload:
self.wfile.write(payload)

def _status(self, status):
payload = f'<html>{status} NOT FOUND</html>'.encode()
self.send_response(int(status))
self.send_header('Content-Type', 'text/html; charset=utf-8')
self.send_header('Content-Length', str(len(payload)))
self.end_headers()
self.wfile.write(payload)

def _read_data(self):
if 'Content-Length' in self.headers:
return self.rfile.read(int(self.headers['Content-Length']))

def do_POST(self):
data = self._read_data()
if self.path.startswith('/redirect_'):
self._redirect()
elif self.path.startswith('/method'):
self._method('POST', data)
elif self.path.startswith('/headers'):
self._headers()
else:
self._status(404)

def do_HEAD(self):
if self.path.startswith('/redirect_'):
self._redirect()
elif self.path.startswith('/method'):
self._method('HEAD')
else:
self._status(404)

def do_PUT(self):
data = self._read_data()
if self.path.startswith('/redirect_'):
self._redirect()
elif self.path.startswith('/method'):
self._method('PUT', data)
else:
self._status(404)

def do_GET(self):
if self.path == '/video.html':
payload = b'<html><video src="/vid.mp4" /></html>'
self.send_response(200)
self.send_header('Content-Type', 'text/html; charset=utf-8')
self.send_header('Content-Length', str(len(payload))) # required for persistent connections
self.end_headers()
self.wfile.write(b'<html><video src="/vid.mp4" /></html>')
self.wfile.write(payload)
elif self.path == '/vid.mp4':
payload = b'\x00\x00\x00\x00\x20\x66\x74[video]'
self.send_response(200)
self.send_header('Content-Type', 'video/mp4')
self.send_header('Content-Length', str(len(payload)))
self.end_headers()
self.wfile.write(b'\x00\x00\x00\x00\x20\x66\x74[video]')
self.wfile.write(payload)
elif self.path == '/%E4%B8%AD%E6%96%87.html':
payload = b'<html><video src="/vid.mp4" /></html>'
self.send_response(200)
self.send_header('Content-Type', 'text/html; charset=utf-8')
self.send_header('Content-Length', str(len(payload)))
self.end_headers()
self.wfile.write(payload)
elif self.path == '/%c7%9f':
payload = b'<html><video src="/vid.mp4" /></html>'
self.send_response(200)
self.send_header('Content-Type', 'text/html; charset=utf-8')
self.send_header('Content-Length', str(len(payload)))
self.end_headers()
self.wfile.write(payload)
elif self.path.startswith('/redirect_'):
self._redirect()
elif self.path.startswith('/method'):
self._method('GET')
elif self.path.startswith('/headers'):
self._headers()
elif self.path == '/trailing_garbage':
payload = b'<html><video src="/vid.mp4" /></html>'
self.send_response(200)
self.send_header('Content-Type', 'text/html; charset=utf-8')
self.send_header('Content-Encoding', 'gzip')
buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode='wb') as f:
f.write(payload)
compressed = buf.getvalue() + b'trailing garbage'
self.send_header('Content-Length', str(len(compressed)))
self.end_headers()
self.wfile.write(compressed)
elif self.path == '/302-non-ascii-redirect':
new_url = f'http://127.0.0.1:{http_server_port(self.server)}/中文.html'
self.send_response(301)
self.send_header('Location', new_url)
self.send_header('Content-Length', '0')
self.end_headers()
self.wfile.write(b'<html><video src="/vid.mp4" /></html>')
else:
assert False
self._status(404)

def send_header(self, keyword, value):
"""
Forcibly allow HTTP server to send non percent-encoded non-ASCII characters in headers.
This is against what is defined in RFC 3986, however we need to test we support this
since some sites incorrectly do this.
"""
if keyword.lower() == 'connection':
return super().send_header(keyword, value)

if not hasattr(self, '_headers_buffer'):
self._headers_buffer = []

self._headers_buffer.append(f'{keyword}: {value}\r\n'.encode())


class FakeLogger:
Expand All @@ -56,36 +179,128 @@ def error(self, msg):

class TestHTTP(unittest.TestCase):
def setUp(self):
self.httpd = http.server.HTTPServer(
# HTTP server
self.http_httpd = http.server.ThreadingHTTPServer(
('127.0.0.1', 0), HTTPTestRequestHandler)
self.port = http_server_port(self.httpd)
self.server_thread = threading.Thread(target=self.httpd.serve_forever)
self.server_thread.daemon = True
self.server_thread.start()


class TestHTTPS(unittest.TestCase):
def setUp(self):
self.http_port = http_server_port(self.http_httpd)
self.http_server_thread = threading.Thread(target=self.http_httpd.serve_forever)
# FIXME: we should probably stop the http server thread after each test
# See: https://github.com/yt-dlp/yt-dlp/pull/7094#discussion_r1199746041
self.http_server_thread.daemon = True
self.http_server_thread.start()

# HTTPS server
certfn = os.path.join(TEST_DIR, 'testcert.pem')
self.httpd = http.server.HTTPServer(
self.https_httpd = http.server.ThreadingHTTPServer(
('127.0.0.1', 0), HTTPTestRequestHandler)
sslctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
sslctx.load_cert_chain(certfn, None)
self.httpd.socket = sslctx.wrap_socket(self.httpd.socket, server_side=True)
self.port = http_server_port(self.httpd)
self.server_thread = threading.Thread(target=self.httpd.serve_forever)
self.server_thread.daemon = True
self.server_thread.start()
self.https_httpd.socket = sslctx.wrap_socket(self.https_httpd.socket, server_side=True)
self.https_port = http_server_port(self.https_httpd)
self.https_server_thread = threading.Thread(target=self.https_httpd.serve_forever)
self.https_server_thread.daemon = True
self.https_server_thread.start()

def test_nocheckcertificate(self):
ydl = YoutubeDL({'logger': FakeLogger()})
self.assertRaises(
Exception,
ydl.extract_info, 'https://127.0.0.1:%d/video.html' % self.port)

ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True})
r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port)
self.assertEqual(r['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)
with FakeYDL({'logger': FakeLogger()}) as ydl:
with self.assertRaises(urllib.error.URLError):
ydl.urlopen(sanitized_Request(f'https://127.0.0.1:{self.https_port}/headers'))

with FakeYDL({'logger': FakeLogger(), 'nocheckcertificate': True}) as ydl:
r = ydl.urlopen(sanitized_Request(f'https://127.0.0.1:{self.https_port}/headers'))
self.assertEqual(r.status, 200)
r.close()

def test_percent_encode(self):
with FakeYDL() as ydl:
# Unicode characters should be encoded with uppercase percent-encoding
res = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/中文.html'))
self.assertEqual(res.status, 200)
res.close()
# don't normalize existing percent encodings
res = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/%c7%9f'))
self.assertEqual(res.status, 200)
res.close()

def test_unicode_path_redirection(self):
with FakeYDL() as ydl:
r = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/302-non-ascii-redirect'))
self.assertEqual(r.url, f'http://127.0.0.1:{self.http_port}/%E4%B8%AD%E6%96%87.html')
r.close()

def test_redirect(self):
with FakeYDL() as ydl:
def do_req(redirect_status, method):
data = b'testdata' if method in ('POST', 'PUT') else None
res = ydl.urlopen(sanitized_Request(
f'http://127.0.0.1:{self.http_port}/redirect_{redirect_status}', method=method, data=data))
return res.read().decode('utf-8'), res.headers.get('method', '')

# A 303 must either use GET or HEAD for subsequent request
self.assertEqual(do_req(303, 'POST'), ('', 'GET'))
self.assertEqual(do_req(303, 'HEAD'), ('', 'HEAD'))

self.assertEqual(do_req(303, 'PUT'), ('', 'GET'))

# 301 and 302 turn POST only into a GET
self.assertEqual(do_req(301, 'POST'), ('', 'GET'))
self.assertEqual(do_req(301, 'HEAD'), ('', 'HEAD'))
self.assertEqual(do_req(302, 'POST'), ('', 'GET'))
self.assertEqual(do_req(302, 'HEAD'), ('', 'HEAD'))

self.assertEqual(do_req(301, 'PUT'), ('testdata', 'PUT'))
self.assertEqual(do_req(302, 'PUT'), ('testdata', 'PUT'))

# 307 and 308 should not change method
for m in ('POST', 'PUT'):
self.assertEqual(do_req(307, m), ('testdata', m))
self.assertEqual(do_req(308, m), ('testdata', m))

self.assertEqual(do_req(307, 'HEAD'), ('', 'HEAD'))
self.assertEqual(do_req(308, 'HEAD'), ('', 'HEAD'))

# These should not redirect and instead raise an HTTPError
for code in (300, 304, 305, 306):
with self.assertRaises(urllib.error.HTTPError):
do_req(code, 'GET')

def test_content_type(self):
# https://github.com/yt-dlp/yt-dlp/commit/379a4f161d4ad3e40932dcf5aca6e6fb9715ab28
with FakeYDL({'nocheckcertificate': True}) as ydl:
# method should be auto-detected as POST
r = sanitized_Request(f'https://localhost:{self.https_port}/headers', data=urlencode_postdata({'test': 'test'}))

headers = ydl.urlopen(r).read().decode('utf-8')
self.assertIn('Content-Type: application/x-www-form-urlencoded', headers)

# test http
r = sanitized_Request(f'http://localhost:{self.http_port}/headers', data=urlencode_postdata({'test': 'test'}))
headers = ydl.urlopen(r).read().decode('utf-8')
self.assertIn('Content-Type: application/x-www-form-urlencoded', headers)

def test_cookiejar(self):
with FakeYDL() as ydl:
ydl.cookiejar.set_cookie(http.cookiejar.Cookie(
0, 'test', 'ytdlp', None, False, '127.0.0.1', True,
False, '/headers', True, False, None, False, None, None, {}))
data = ydl.urlopen(sanitized_Request(f'http://127.0.0.1:{self.http_port}/headers')).read()
self.assertIn(b'Cookie: test=ytdlp', data)

def test_no_compression_compat_header(self):
with FakeYDL() as ydl:
data = ydl.urlopen(
sanitized_Request(
f'http://127.0.0.1:{self.http_port}/headers',
headers={'Youtubedl-no-compression': True})).read()
self.assertIn(b'Accept-Encoding: identity', data)
self.assertNotIn(b'youtubedl-no-compression', data.lower())

def test_gzip_trailing_garbage(self):
# https://github.com/ytdl-org/youtube-dl/commit/aa3e950764337ef9800c936f4de89b31c00dfcf5
# https://github.com/ytdl-org/youtube-dl/commit/6f2ec15cee79d35dba065677cad9da7491ec6e6f
with FakeYDL() as ydl:
data = ydl.urlopen(sanitized_Request(f'http://localhost:{self.http_port}/trailing_garbage')).read().decode('utf-8')
self.assertEqual(data, '<html><video src="/vid.mp4" /></html>')


class TestClientCert(unittest.TestCase):
Expand All @@ -112,8 +327,8 @@ def _run_test(self, **params):
'nocheckcertificate': True,
**params,
})
r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port)
self.assertEqual(r['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port)
r = ydl.extract_info(f'https://127.0.0.1:{self.port}/video.html')
self.assertEqual(r['url'], f'https://127.0.0.1:{self.port}/vid.mp4')

def test_certificate_combined_nopass(self):
self._run_test(client_certificate=os.path.join(self.certdir, 'clientwithkey.crt'))
Expand Down Expand Up @@ -188,5 +403,22 @@ def test_proxy_with_idn(self):
self.assertEqual(response, 'normal: http://xn--fiq228c.tw/')


class TestFileURL(unittest.TestCase):
# See https://github.com/ytdl-org/youtube-dl/issues/8227
def test_file_urls(self):
tf = tempfile.NamedTemporaryFile(delete=False)
tf.write(b'foobar')
tf.close()
url = pathlib.Path(tf.name).as_uri()
with FakeYDL() as ydl:
self.assertRaisesRegex(
urllib.error.URLError, 'file:// URLs are explicitly disabled in yt-dlp for security reasons', ydl.urlopen, url)
with FakeYDL({'enable_file_urls': True}) as ydl:
res = ydl.urlopen(url)
self.assertEqual(res.read(), b'foobar')
res.close()
os.unlink(tf.name)


if __name__ == '__main__':
unittest.main()
Loading

0 comments on commit 08916a4

Please sign in to comment.