Skip to content

Commit

Permalink
fix: HTTP 308 Permanent Redirect status code handling (#2389)
Browse files Browse the repository at this point in the history
Change the handling of HTTP status code 308 to behave more like
`urllib.request.HTTPRedirectHandler`, most critically, the new 308 handling will
create a new `urllib.request.Request` object with the new URL, which will
prevent state from being carried over from the original request.

One case where this is important is when the domain name changes, for example,
when the original URL is `http://www.w3.org/ns/adms.ttl` and the redirect URL is
`https://uri.semic.eu/w3c/ns/adms.ttl`. With the previous behaviour, the redirect
would contain a `Host` header with the value `www.w3.org` instead of
`uri.semic.eu` because the `Host` header is placed in
`Request.unredirected_hdrs` and takes precedence over the `Host` header in
`Request.headers`.

Other changes:
- Only handle HTTP status code 308 on Python versions before 3.11 as Python 3.11
  will handle 308 by default [[ref](https://docs.python.org/3.11/whatsnew/changelog.html#id128)].
- Move code which uses `http://www.w3.org/ns/adms.ttl` and
  `http://www.w3.org/ns/adms.rdf` out of `test_guess_format_for_parse` into a
  separate parameterized test, which instead uses the embedded http server.

  This allows the test to fully control the `Content-Type` header in the
  response instead of relying on the value that the server is sending.

  This is needed because the server is sending `Content-Type: text/plain` for
  the `adms.ttl` file, which is not a valid RDF format, and the test is
  expecting `Content-Type: text/turtle`.

Fixes:
- <#2382>.
  • Loading branch information
aucampia authored May 17, 2023
1 parent 2c8d1e1 commit e0b3152
Show file tree
Hide file tree
Showing 15 changed files with 1,121 additions and 52 deletions.
117 changes: 117 additions & 0 deletions rdflib/_networking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from __future__ import annotations

import string
import sys
from typing import Dict
from urllib.error import HTTPError
from urllib.parse import quote as urlquote
from urllib.parse import urljoin, urlsplit
from urllib.request import HTTPRedirectHandler, Request, urlopen
from urllib.response import addinfourl


def _make_redirect_request(request: Request, http_error: HTTPError) -> Request:
"""
Create a new request object for a redirected request.
The logic is based on `urllib.request.HTTPRedirectHandler` from `this commit <https://github.com/python/cpython/blob/b58bc8c2a9a316891a5ea1a0487aebfc86c2793a/Lib/urllib/request.py#L641-L751>_`.
:param request: The original request that resulted in the redirect.
:param http_error: The response to the original request that indicates a
redirect should occur and contains the new location.
:return: A new request object to the location indicated by the response.
:raises HTTPError: the supplied ``http_error`` if the redirect request
cannot be created.
:raises ValueError: If the response code is `None`.
:raises ValueError: If the response does not contain a ``Location`` header
or the ``Location`` header is not a string.
:raises HTTPError: If the scheme of the new location is not ``http``,
``https``, or ``ftp``.
:raises HTTPError: If there are too many redirects or a redirect loop.
"""
new_url = http_error.headers.get("Location")
if new_url is None:
raise http_error
if not isinstance(new_url, str):
raise ValueError(f"Location header {new_url!r} is not a string")

new_url_parts = urlsplit(new_url)

# For security reasons don't allow redirection to anything other than http,
# https or ftp.
if new_url_parts.scheme not in ("http", "https", "ftp", ""):
raise HTTPError(
new_url,
http_error.code,
f"{http_error.reason} - Redirection to url {new_url!r} is not allowed",
http_error.headers,
http_error.fp,
)

# http.client.parse_headers() decodes as ISO-8859-1. Recover the original
# bytes and percent-encode non-ASCII bytes, and any special characters such
# as the space.
new_url = urlquote(new_url, encoding="iso-8859-1", safe=string.punctuation)
new_url = urljoin(request.full_url, new_url)

# XXX Probably want to forget about the state of the current
# request, although that might interact poorly with other
# handlers that also use handler-specific request attributes
content_headers = ("content-length", "content-type")
newheaders = {
k: v for k, v in request.headers.items() if k.lower() not in content_headers
}
new_request = Request(
new_url,
headers=newheaders,
origin_req_host=request.origin_req_host,
unverifiable=True,
)

visited: Dict[str, int]
if hasattr(request, "redirect_dict"):
visited = request.redirect_dict
if (
visited.get(new_url, 0) >= HTTPRedirectHandler.max_repeats
or len(visited) >= HTTPRedirectHandler.max_redirections
):
raise HTTPError(
request.full_url,
http_error.code,
HTTPRedirectHandler.inf_msg + http_error.reason,
http_error.headers,
http_error.fp,
)
else:
visited = {}
setattr(request, "redirect_dict", visited)

setattr(new_request, "redirect_dict", visited)
visited[new_url] = visited.get(new_url, 0) + 1
return new_request


def _urlopen(request: Request) -> addinfourl:
"""
This is a shim for `urlopen` that handles HTTP redirects with status code
308 (Permanent Redirect).
This function should be removed once all supported versions of Python
handles the 308 HTTP status code.
:param request: The request to open.
:return: The response to the request.
"""
try:
return urlopen(request)
except HTTPError as error:
if error.code == 308 and sys.version_info < (3, 11):
# HTTP response code 308 (Permanent Redirect) is not supported by python
# versions older than 3.11. See <https://bugs.python.org/issue40321> and
# <https://github.com/python/cpython/issues/84501> for more details.
# This custom error handling should be removed once all supported
# versions of Python handles 308.
new_request = _make_redirect_request(request, error)
return _urlopen(new_request)
else:
raise
19 changes: 2 additions & 17 deletions rdflib/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@
Tuple,
Union,
)
from urllib.error import HTTPError
from urllib.parse import urljoin
from urllib.request import Request, url2pathname, urlopen
from urllib.request import Request, url2pathname
from xml.sax import xmlreader

import rdflib.util
from rdflib import __version__
from rdflib._networking import _urlopen
from rdflib.namespace import Namespace
from rdflib.term import URIRef

Expand Down Expand Up @@ -267,21 +267,6 @@ def __init__(self, system_id: Optional[str] = None, format: Optional[str] = None

req = Request(system_id, None, myheaders) # type: ignore[arg-type]

def _urlopen(req: Request) -> Any:
try:
return urlopen(req)
except HTTPError as ex:
# 308 (Permanent Redirect) is not supported by current python version(s)
# See https://bugs.python.org/issue40321
# This custom error handling should be removed once all
# supported versions of python support 308.
if ex.code == 308:
# type error: Incompatible types in assignment (expression has type "Optional[Any]", variable has type "str")
req.full_url = ex.headers.get("Location") # type: ignore[assignment]
return _urlopen(req)
else:
raise

response: addinfourl = _urlopen(req)
self.url = response.geturl() # in case redirections took place
self.links = self.get_links(response)
Expand Down
34 changes: 28 additions & 6 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,22 +44,44 @@ def rdfs_graph() -> Graph:
return Graph().parse(TEST_DATA_DIR / "defined_namespaces/rdfs.ttl", format="turtle")


_ServedBaseHTTPServerMocks = Tuple[ServedBaseHTTPServerMock, ServedBaseHTTPServerMock]


@pytest.fixture(scope="session")
def _session_function_httpmock() -> Generator[ServedBaseHTTPServerMock, None, None]:
def _session_function_httpmocks() -> Generator[_ServedBaseHTTPServerMocks, None, None]:
"""
This fixture is session scoped, but it is reset for each function in
:func:`function_httpmock`. This should not be used directly.
"""
with ServedBaseHTTPServerMock() as httpmock:
yield httpmock
with ServedBaseHTTPServerMock() as httpmock_a, ServedBaseHTTPServerMock() as httpmock_b:
yield httpmock_a, httpmock_b


@pytest.fixture(scope="function")
def function_httpmock(
_session_function_httpmock: ServedBaseHTTPServerMock,
_session_function_httpmocks: _ServedBaseHTTPServerMocks,
) -> Generator[ServedBaseHTTPServerMock, None, None]:
_session_function_httpmock.reset()
yield _session_function_httpmock
"""
HTTP server mock that is reset for each test function.
"""
(mock, _) = _session_function_httpmocks
mock.reset()
yield mock


@pytest.fixture(scope="function")
def function_httpmocks(
_session_function_httpmocks: _ServedBaseHTTPServerMocks,
) -> Generator[Tuple[ServedBaseHTTPServerMock, ServedBaseHTTPServerMock], None, None]:
"""
Alternative HTTP server mock that is reset for each test function.
This exists in case a tests needs to work with two different HTTP servers.
"""
(mock_a, mock_b) = _session_function_httpmocks
mock_a.reset()
mock_b.reset()
yield mock_a, mock_b


@pytest.fixture(scope="session", autouse=True)
Expand Down
18 changes: 18 additions & 0 deletions test/data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from pathlib import Path

from rdflib import URIRef
from rdflib.graph import Graph

TEST_DIR = Path(__file__).parent
TEST_DATA_DIR = TEST_DIR / "data"
Expand All @@ -19,3 +20,20 @@
context0 = URIRef("urn:example:context-0")
context1 = URIRef("urn:example:context-1")
context2 = URIRef("urn:example:context-2")


simple_triple_graph = Graph().add(
(
URIRef("http://example.org/subject"),
URIRef("http://example.org/predicate"),
URIRef("http://example.org/object"),
)
)
"""
A simple graph with a single triple. This is equivalent to the following RDF files:
* ``test/data/variants/simple_triple.nq``
* ``test/data/variants/simple_triple.nt``
* ``test/data/variants/simple_triple.ttl``
* ``test/data/variants/simple_triple.xml``
"""
Loading

0 comments on commit e0b3152

Please sign in to comment.