Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[PR #8074/33ccdfb0 backport][3.9] Improve validation in HTTP parser #8078

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGES/8074.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Fixed an unhandled exception in the Python HTTP parser on header lines starting with a colon -- by :user:`pajod`.

Invalid request lines with anything but a dot between the HTTP major and minor version are now rejected. Invalid header field names containing question mark or slash are now rejected. Such requests are incompatible with :rfc:`9110#section-5.6.2` and are not known to be of any legitimate use.

(BACKWARD INCOMPATIBLE)
1 change: 1 addition & 0 deletions CONTRIBUTORS.txt
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ Pankaj Pandey
Parag Jain
Pau Freixes
Paul Colomiets
Paul J. Dorn
Paulius Šileikis
Paulus Schoutsen
Pavel Kamaev
Expand Down
32 changes: 18 additions & 14 deletions aiohttp/http_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,11 @@
# tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." /
# "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA
# token = 1*tchar
METHRE: Final[Pattern[str]] = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+")
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)")
HDRRE: Final[Pattern[bytes]] = re.compile(
rb"[\x00-\x1F\x7F-\xFF()<>@,;:\[\]={} \t\"\\]"
)
HEXDIGIT = re.compile(rb"[0-9a-fA-F]+")
_TCHAR_SPECIALS: Final[str] = re.escape("!#$%&'*+-.^_`|~")
TOKENRE: Final[Pattern[str]] = re.compile(f"[0-9A-Za-z{_TCHAR_SPECIALS}]+")
VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d)\.(\d)", re.ASCII)
DIGITS: Final[Pattern[str]] = re.compile(r"\d+", re.ASCII)
HEXDIGITS: Final[Pattern[bytes]] = re.compile(rb"[0-9a-fA-F]+")


class RawRequestMessage(NamedTuple):
Expand Down Expand Up @@ -136,6 +135,7 @@
self, lines: List[bytes]
) -> Tuple["CIMultiDictProxy[str]", RawHeaders]:
headers: CIMultiDict[str] = CIMultiDict()
# note: "raw" does not mean inclusion of OWS before/after the field value
raw_headers = []

lines_idx = 1
Expand All @@ -149,13 +149,14 @@
except ValueError:
raise InvalidHeader(line) from None

if len(bname) == 0:
raise InvalidHeader(bname)

Check warning on line 153 in aiohttp/http_parser.py

View check run for this annotation

Codecov / codecov/patch

aiohttp/http_parser.py#L153

Added line #L153 was not covered by tests

# https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2
if {bname[0], bname[-1]} & {32, 9}: # {" ", "\t"}
raise InvalidHeader(line)

bvalue = bvalue.lstrip(b" \t")
if HDRRE.search(bname):
raise InvalidHeader(bname)
if len(bname) > self.max_field_size:
raise LineTooLong(
"request header name {}".format(
Expand All @@ -164,6 +165,9 @@
str(self.max_field_size),
str(len(bname)),
)
name = bname.decode("utf-8", "surrogateescape")
if not TOKENRE.fullmatch(name):
raise InvalidHeader(bname)

header_length = len(bvalue)

Expand Down Expand Up @@ -210,7 +214,6 @@
)

bvalue = bvalue.strip(b" \t")
name = bname.decode("utf-8", "surrogateescape")
value = bvalue.decode("utf-8", "surrogateescape")

# https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
Expand Down Expand Up @@ -338,7 +341,8 @@

# Shouldn't allow +/- or other number formats.
# https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
if not length_hdr.strip(" \t").isdecimal():
# msg.headers is already stripped of leading/trailing wsp
if not DIGITS.fullmatch(length_hdr):
raise InvalidHeader(CONTENT_LENGTH)

return int(length_hdr)
Expand Down Expand Up @@ -566,7 +570,7 @@
)

# method
if not METHRE.fullmatch(method):
if not TOKENRE.fullmatch(method):
raise BadStatusLine(method)

# version
Expand Down Expand Up @@ -683,8 +687,8 @@
raise BadStatusLine(line)
version_o = HttpVersion(int(match.group(1)), int(match.group(2)))

# The status code is a three-digit number
if len(status) != 3 or not status.isdecimal():
# The status code is a three-digit ASCII number, no padding
if len(status) != 3 or not DIGITS.fullmatch(status):
raise BadStatusLine(line)
status_i = int(status)

Expand Down Expand Up @@ -826,7 +830,7 @@
if self._lax: # Allow whitespace in lax mode.
size_b = size_b.strip()

if not re.fullmatch(HEXDIGIT, size_b):
if not re.fullmatch(HEXDIGITS, size_b):
exc = TransferEncodingError(
chunk[:pos].decode("ascii", "surrogateescape")
)
Expand Down
139 changes: 136 additions & 3 deletions tests/test_http_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import asyncio
import re
from typing import Any, List
from contextlib import nullcontext
from typing import Any, Dict, List
from unittest import mock
from urllib.parse import quote

Expand Down Expand Up @@ -169,11 +170,27 @@
parser.feed_data(text)


@pytest.mark.parametrize(
"rfc9110_5_6_2_token_delim",
r'"(),/:;<=>?@[\]{}',
)
def test_bad_header_name(parser: Any, rfc9110_5_6_2_token_delim: str) -> None:
text = f"POST / HTTP/1.1\r\nhead{rfc9110_5_6_2_token_delim}er: val\r\n\r\n".encode()
expectation = pytest.raises(http_exceptions.BadHttpMessage)
if rfc9110_5_6_2_token_delim == ":":
# Inserting colon into header just splits name/value earlier.
expectation = nullcontext()
with expectation:
parser.feed_data(text)


@pytest.mark.parametrize(
"hdr",
(
"Content-Length: -5", # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length
"Content-Length: +256",
"Content-Length: \N{superscript one}",
"Content-Length: \N{mathematical double-struck digit one}",
"Foo: abc\rdef", # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
"Bar: abc\ndef",
"Baz: abc\x00def",
Expand Down Expand Up @@ -266,6 +283,20 @@
parser.feed_data(text)


def test_parse_unusual_request_line(parser) -> None:
if not isinstance(response, HttpResponseParserPy):
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
text = b"#smol //a HTTP/1.3\r\n\r\n"
messages, upgrade, tail = parser.feed_data(text)
assert len(messages) == 1
msg, _ = messages[0]
assert msg.compression is None
assert not msg.upgrade
assert msg.method == "#smol"
assert msg.path == "//a"
assert msg.version == (1, 3)

Check warning on line 297 in tests/test_http_parser.py

View check run for this annotation

Codecov / codecov/patch

tests/test_http_parser.py#L289-L297

Added lines #L289 - L297 were not covered by tests


def test_parse(parser) -> None:
text = b"GET /test HTTP/1.1\r\n\r\n"
messages, upgrade, tail = parser.feed_data(text)
Expand Down Expand Up @@ -568,6 +599,43 @@
parser.feed_data(text)


_pad: Dict[bytes, str] = {
b"": "empty",
# not a typo. Python likes triple zero
b"\000": "NUL",
b" ": "SP",
b" ": "SPSP",
# not a typo: both 0xa0 and 0x0a in case of 8-bit fun
b"\n": "LF",
b"\xa0": "NBSP",
b"\t ": "TABSP",
}


@pytest.mark.parametrize("hdr", [b"", b"foo"], ids=["name-empty", "with-name"])
@pytest.mark.parametrize("pad2", _pad.keys(), ids=["post-" + n for n in _pad.values()])
@pytest.mark.parametrize("pad1", _pad.keys(), ids=["pre-" + n for n in _pad.values()])
def test_invalid_header_spacing(parser, pad1: bytes, pad2: bytes, hdr: bytes) -> None:
text = b"GET /test HTTP/1.1\r\n" b"%s%s%s: value\r\n\r\n" % (pad1, hdr, pad2)
expectation = pytest.raises(http_exceptions.BadHttpMessage)
if pad1 == pad2 == b"" and hdr != b"":
# one entry in param matrix is correct: non-empty name, not padded
expectation = nullcontext()
if pad1 == pad2 == hdr == b"":
if not isinstance(response, HttpResponseParserPy):
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
with expectation:
parser.feed_data(text)


def test_empty_header_name(parser) -> None:
if not isinstance(response, HttpResponseParserPy):
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
text = b"GET /test HTTP/1.1\r\n" b":test\r\n\r\n"

Check warning on line 634 in tests/test_http_parser.py

View check run for this annotation

Codecov / codecov/patch

tests/test_http_parser.py#L634

Added line #L634 was not covered by tests
with pytest.raises(http_exceptions.BadHttpMessage):
parser.feed_data(text)

Check warning on line 636 in tests/test_http_parser.py

View check run for this annotation

Codecov / codecov/patch

tests/test_http_parser.py#L636

Added line #L636 was not covered by tests


def test_invalid_header(parser) -> None:
text = b"GET /test HTTP/1.1\r\n" b"test line\r\n\r\n"
with pytest.raises(http_exceptions.BadHttpMessage):
Expand Down Expand Up @@ -690,6 +758,34 @@
assert r"\n" not in exc_info.value.message


_num: Dict[bytes, str] = {
# dangerous: accepted by Python int()
# unicodedata.category("\U0001D7D9") == 'Nd'
"\N{mathematical double-struck digit one}".encode(): "utf8digit",
# only added for interop tests, refused by Python int()
# unicodedata.category("\U000000B9") == 'No'
"\N{superscript one}".encode(): "utf8number",
"\N{superscript one}".encode("latin-1"): "latin1number",
}


@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
def test_http_request_bad_status_line_number(
parser: Any, nonascii_digit: bytes
) -> None:
text = b"GET /digit HTTP/1." + nonascii_digit + b"\r\n\r\n"
with pytest.raises(http_exceptions.BadStatusLine):
parser.feed_data(text)


def test_http_request_bad_status_line_separator(parser: Any) -> None:
# single code point, old, multibyte NFKC, multibyte NFKD
utf8sep = "\N{arabic ligature sallallahou alayhe wasallam}".encode()
text = b"GET /ligature HTTP/1" + utf8sep + b"1\r\n\r\n"
with pytest.raises(http_exceptions.BadStatusLine):
parser.feed_data(text)


def test_http_request_bad_status_line_whitespace(parser: Any) -> None:
text = b"GET\n/path\fHTTP/1.1\r\n\r\n"
with pytest.raises(http_exceptions.BadStatusLine):
Expand All @@ -711,6 +807,31 @@
assert tail == b"some raw data"


def test_http_request_parser_utf8_request_line(parser) -> None:
if not isinstance(response, HttpResponseParserPy):
pytest.xfail("Regression test for Py parser. May match C behaviour later.")
messages, upgrade, tail = parser.feed_data(

Check warning on line 813 in tests/test_http_parser.py

View check run for this annotation

Codecov / codecov/patch

tests/test_http_parser.py#L813

Added line #L813 was not covered by tests
# note the truncated unicode sequence
b"GET /P\xc3\xbcnktchen\xa0\xef\xb7 HTTP/1.1\r\n" +
# for easier grep: ASCII 0xA0 more commonly known as non-breaking space
# note the leading and trailing spaces
"sTeP: \N{latin small letter sharp s}nek\t\N{no-break space} "
"\r\n\r\n".encode()
)
msg = messages[0][0]

Check warning on line 821 in tests/test_http_parser.py

View check run for this annotation

Codecov / codecov/patch

tests/test_http_parser.py#L821

Added line #L821 was not covered by tests

assert msg.method == "GET"
assert msg.path == "/Pünktchen\udca0\udcef\udcb7"
assert msg.version == (1, 1)
assert msg.headers == CIMultiDict([("STEP", "ßnek\t\xa0")])
assert msg.raw_headers == ((b"sTeP", "ßnek\t\xa0".encode()),)
assert not msg.should_close
assert msg.compression is None
assert not msg.upgrade
assert not msg.chunked
assert msg.url.path == URL("/P%C3%BCnktchen\udca0\udcef\udcb7").path

Check warning on line 832 in tests/test_http_parser.py

View check run for this annotation

Codecov / codecov/patch

tests/test_http_parser.py#L823-L832

Added lines #L823 - L832 were not covered by tests


def test_http_request_parser_utf8(parser) -> None:
text = "GET /path HTTP/1.1\r\nx-test:тест\r\n\r\n".encode()
messages, upgrade, tail = parser.feed_data(text)
Expand Down Expand Up @@ -760,9 +881,15 @@
assert not msg.chunked


def test_http_request_parser_bad_method(parser) -> None:
@pytest.mark.parametrize(
"rfc9110_5_6_2_token_delim",
[bytes([i]) for i in rb'"(),/:;<=>?@[\]{}'],
)
def test_http_request_parser_bad_method(
parser, rfc9110_5_6_2_token_delim: bytes
) -> None:
with pytest.raises(http_exceptions.BadStatusLine):
parser.feed_data(b'G=":<>(e),[T];?" /get HTTP/1.1\r\n\r\n')
parser.feed_data(rfc9110_5_6_2_token_delim + b'ET" /get HTTP/1.1\r\n\r\n')


def test_http_request_parser_bad_version(parser) -> None:
Expand Down Expand Up @@ -974,6 +1101,12 @@
response.feed_data(b"HTTP/1.1 ttt test\r\n\r\n")


@pytest.mark.parametrize("nonascii_digit", _num.keys(), ids=_num.values())
def test_http_response_parser_code_not_ascii(response, nonascii_digit: bytes) -> None:
with pytest.raises(http_exceptions.BadStatusLine):
response.feed_data(b"HTTP/1.1 20" + nonascii_digit + b" test\r\n\r\n")


def test_http_request_chunked_payload(parser) -> None:
text = b"GET /test HTTP/1.1\r\n" b"transfer-encoding: chunked\r\n\r\n"
msg, payload = parser.feed_data(text)[0][0]
Expand Down
Loading