Fix percent decoding and normalization #23
Changes to the URL implementation:
@@ -17,8 +17,8 @@
 import re
 import string
 
 import socket
+from unicodedata import normalize
 try:
     from socket import inet_pton
 except ImportError:
@@ -52,13 +52,6 @@ def inet_pton(address_family, ip_string):
             raise socket.error('unknown address family')
 
 
-try:
-    from urllib import unquote as urlunquote
-except ImportError:
-    from urllib.parse import unquote_to_bytes as urlunquote
-
-from unicodedata import normalize
-
 unicode = type(u'')
 try:
     unichr
@@ -141,7 +134,6 @@ def __nonzero__(self):
                       for a in string.hexdigits for b in string.hexdigits])
 _ASCII_RE = re.compile('([\x00-\x7f]+)')
 
-
 # RFC 3986 section 2.2, Reserved Characters
 # https://tools.ietf.org/html/rfc3986#section-2.2
 _GEN_DELIMS = frozenset(u':/?#[]@')
@@ -160,6 +152,19 @@ def __nonzero__(self):
 _QUERY_DELIMS = _ALL_DELIMS - _QUERY_SAFE
 
 
+def _make_decode_map(delims, allow_percent=False):
+    ret = dict(_HEX_CHAR_MAP)
+    if not allow_percent:
+        delims = set(delims) | set([u'%'])
+    for delim in delims:
+        _hexord = hex(ord(delim))[2:].zfill(2).encode('ascii').upper()
+        _hexord_lower = _hexord.lower()
+        ret.pop(_hexord)
+        if _hexord != _hexord_lower:
+            ret.pop(_hexord_lower)
+    return ret
+
+
 def _make_quote_map(safe_chars):
     ret = {}
     # v is included in the dict for py3 mostly, because bytestrings
@@ -174,10 +179,14 @@ def _make_quote_map(safe_chars):
 
 
 _USERINFO_PART_QUOTE_MAP = _make_quote_map(_USERINFO_SAFE)
+_USERINFO_DECODE_MAP = _make_decode_map(_USERINFO_DELIMS)
 _PATH_PART_QUOTE_MAP = _make_quote_map(_PATH_SAFE)
 _SCHEMELESS_PATH_PART_QUOTE_MAP = _make_quote_map(_SCHEMELESS_PATH_SAFE)
+_PATH_DECODE_MAP = _make_decode_map(_PATH_DELIMS)
 _QUERY_PART_QUOTE_MAP = _make_quote_map(_QUERY_SAFE)
+_QUERY_DECODE_MAP = _make_decode_map(_QUERY_DELIMS)
 _FRAGMENT_QUOTE_MAP = _make_quote_map(_FRAGMENT_SAFE)
+_FRAGMENT_DECODE_MAP = _make_decode_map(_FRAGMENT_DELIMS)
 
 _ROOT_PATHS = frozenset(((), (u'',)))
 
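For orientation, here is a short illustration (not part of the diff) of what these maps contain, assuming the _HEX_CHAR_MAP and delimiter sets defined earlier in the module:

# Illustration only; keys and values assumed from the definitions above.
# _HEX_CHAR_MAP maps a two-hex-digit key to the byte it encodes:
#     _HEX_CHAR_MAP[b'41'] == b'A'
# _make_decode_map() starts from that table and removes the entries for the
# field's delimiters (plus '%'), so decoding refuses to touch them:
assert b'41' in _PATH_DECODE_MAP       # '%41' still decodes to 'A'
assert b'2F' not in _PATH_DECODE_MAP   # '%2F' (an encoded '/') is left alone
assert b'25' not in _PATH_DECODE_MAP   # '%25' (an encoded '%') is left alone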
@@ -276,6 +285,7 @@ def _encode_userinfo_part(text, maximal=True):
                      else t for t in text])
 
 
+
 # This port list painstakingly curated by hand searching through
 # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
 # and
@@ -413,23 +423,68 @@ def _textcheck(name, value, delims=frozenset(), nullable=False):
     return value
 
 
-def _percent_decode(text):
-    """
-    Replace percent-encoded characters with their UTF-8 equivalents.
+def _decode_userinfo_part(text):
+    return _percent_decode(text, _decode_map=_USERINFO_DECODE_MAP)
+
+
+def _decode_path_part(text):
+    return _percent_decode(text, _decode_map=_PATH_DECODE_MAP)
+
+
+def _decode_query_part(text):
+    return _percent_decode(text, _decode_map=_QUERY_DECODE_MAP)
+
+
+def _decode_fragment_part(text):
+    return _percent_decode(text, _decode_map=_FRAGMENT_DECODE_MAP)
+
+
+def _percent_decode(text, _decode_map=_HEX_CHAR_MAP):
+    """Convert percent-encoded text characters to their normal,
+    human-readable equivalents.
+
+    All characters in the input text must be valid ASCII. All special
+    characters underlying the values in the percent-encoding must be
+    valid UTF-8.
+
+    Only called by field-tailored variants, e.g.,
+    :func:`_decode_path_part`, as every percent-encodable part of the
+    URL has characters which should not be percent decoded.
+
+    >>> _percent_decode(u'abc%20def')
+    u'abc def'
 
     Args:
-        text (unicode): The text with percent-encoded UTF-8 in it.
+        text (unicode): The ASCII text with percent-encoding present.
 
     Returns:
-        unicode: The encoded version of *text*.
+        unicode: The percent-decoded version of *text*, with UTF-8
+            decoding applied.
     """
     try:
-        quotedBytes = text.encode("ascii")
+        quoted_bytes = text.encode("ascii")
     except UnicodeEncodeError:
         return text
-    unquotedBytes = urlunquote(quotedBytes)
+
+    bits = quoted_bytes.split(b'%')
+    if len(bits) == 1:
+        return text
+
+    res = [bits[0]]
+    append = res.append
+
+    for item in bits[1:]:
+        try:
+            append(_decode_map[item[:2]])
+            append(item[2:])
+        except KeyError:
+            append(b'%')
+            append(item)
+
+    unquoted_bytes = b''.join(res)
+
     try:
-        return unquotedBytes.decode("utf-8")
+        return unquoted_bytes.decode("utf-8")
     except UnicodeDecodeError:
         return text
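A small worked example (not from the PR) of how the loop above behaves, assuming the module-level maps shown earlier:

# The text is split on '%'; for each piece, the first two bytes are looked up
# in the decode map, and unknown or excluded escapes fall back to a literal
# '%' plus the original piece.
assert _percent_decode(u'abc%20def') == u'abc def'      # '%20' -> ' '
assert _percent_decode(u'abc%zzdef') == u'abc%zzdef'    # not valid hex: untouched
assert _percent_decode(u'a%2Fb',
                       _decode_map=_PATH_DECODE_MAP) == u'a%2Fb'  # '/' is a path delim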
@@ -819,7 +874,8 @@ def absolute(self):
         return bool(self.scheme and self.host)
 
     def replace(self, scheme=_UNSET, host=_UNSET, path=_UNSET, query=_UNSET,
-                fragment=_UNSET, port=_UNSET, rooted=_UNSET, userinfo=_UNSET):
+                fragment=_UNSET, port=_UNSET, rooted=_UNSET, userinfo=_UNSET,
+                family=_UNSET, uses_netloc=_UNSET):
         """:class:`URL` objects are immutable, which means that attributes
         are designed to be set only once, at construction. Instead of
         modifying an existing URL, one simply creates a copy with the
@@ -861,6 +917,8 @@ def replace(self, scheme=_UNSET, host=_UNSET, path=_UNSET, query=_UNSET,
                               port=_optional(port, self.port),
                               rooted=_optional(rooted, self.rooted),
                               userinfo=_optional(userinfo, self.userinfo),
+                              family=_optional(family, self.family),
+                              uses_netloc=_optional(uses_netloc, self.uses_netloc)
                               )
 
     @classmethod

Review thread on the new family/uses_netloc pass-through:

- "This seems incorrect. You shouldn't be able to specify"
- "Confirming this, deleting these added lines doesn't make any tests fail; if it should be here at all, it shouldn't be in this change."
- "Functionally, Logistically, I can't remember if I intended to include it in this PR or if I just saw the big regression of not passing through"
- "If it's worth working it's worth testing :-). Urgency notwithstanding it seems like an unrelated change."
- "Well, the branch started out as seeking to normalize decoding in general, but it became pretty percent-decoding centric, as idna improvements will come in another PR. I've been brainstorming other ways to go with some of these latter-day arguments, but for now I think I prefer bringing things into sync, so I've added a test."
@@ -1095,7 +1153,7 @@ def to_iri(self):
             URL: A new instance with its path segments, query parameters, and
                 hostname decoded for display purposes.
         """
-        new_userinfo = u':'.join([_percent_decode(p) for p in
+        new_userinfo = u':'.join([_decode_userinfo_part(p) for p in
                                   self.userinfo.split(':', 1)])
         try:
             asciiHost = self.host.encode("ascii")
@@ -1109,13 +1167,13 @@ def to_iri(self):
             textHost = self.host
         return self.replace(userinfo=new_userinfo,
                             host=textHost,
-                            path=[_percent_decode(segment)
+                            path=[_decode_path_part(segment)
                                   for segment in self.path],
-                            query=[tuple(_percent_decode(x)
+                            query=[tuple(_decode_query_part(x)
                                          if x is not None else None
                                          for x in (k, v))
                                    for k, v in self.query],
-                            fragment=_percent_decode(self.fragment))
+                            fragment=_decode_fragment_part(self.fragment))
 
     def to_text(self, with_password=False):
         """Render this URL to its textual representation.
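The net effect on the public API, illustrated with one of the new test URLs (illustration only; the expected values are inferred from the decode maps above and the round-trip tests added below):

u = URL.from_text(u'https://example.com/%3F/')   # '%3F' is an encoded '?'
# '?' is a path delimiter, so to_iri() now leaves it encoded rather than
# producing the ambiguous 'https://example.com/?/'.
assert u.to_iri().to_text() == u'https://example.com/%3F/'
# Ordinary escapes are still decoded for display:
assert URL.from_text(u'https://example.com/a%20b').to_iri().path == (u'a b',)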
Changes to the test suite:
@@ -115,6 +115,29 @@
     '20Linux%20precise-5.7.1.iso&tr=udp://tracker.openbittorrent.com:80&'
     'tr=udp://tracker.publicbt.com:80&tr=udp://tracker.istole.it:6969&'
     'tr=udp://tracker.ccc.de:80&tr=udp://open.demonii.com:1337'),
 
+    # percent-encoded delimiters in percent-encodable fields
+
+    'https://%3A@example.com/',   # colon in username
+    'https://%40@example.com/',   # at sign in username
+    'https://%2f@example.com/',   # slash in username
+    'https://a:%3a@example.com/',  # colon in password
+    'https://a:%40@example.com/',  # at sign in password
+    'https://a:%2f@example.com/',  # slash in password
+    'https://a:%3f@example.com/',  # question mark in password
+    'https://example.com/%2F/',   # slash in path
+    'https://example.com/%3F/',   # question mark in path
+    'https://example.com/%23/',   # hash in path
+    'https://example.com/?%23=b',  # hash in query param name
+    'https://example.com/?%3D=b',  # equals in query param name
+    'https://example.com/?%26=b',  # ampersand in query param name
+    'https://example.com/?a=%23',  # hash in query param value
+    'https://example.com/?a=%26',  # ampersand in query param value
+    'https://example.com/?a=%3D',  # equals in query param value
+
+    # double-encoded percent sign in all percent-encodable positions:
+    "http://(%2525):(%2525)@example.com/(%2525)/?(%2525)=(%2525)#(%2525)",
+
     # colon in first part of schemeless relative url
     'first_seg_rel_path__colon%3Anotok/second_seg__colon%3Aok',
 )

Review comment on the added cases: 👍
@@ -231,9 +254,21 @@ def test_roundtrip(self):
         L{URL.to_text} should invert L{URL.from_text}.
         """
         for test in ROUNDTRIP_TESTS:
-            result = URL.from_text(test).to_text()
+            result = URL.from_text(test).to_text(with_password=True)
             self.assertEqual(test, result)
 
+    def test_roundtrip_double_iri(self):
+        for test in ROUNDTRIP_TESTS:
+            url = URL.from_text(test)
+            iri = url.to_iri()
+            double_iri = iri.to_iri()
+            assert iri == double_iri
+
+            iri_text = iri.to_text(with_password=True)
+            double_iri_text = double_iri.to_text(with_password=True)
+            assert iri_text == double_iri_text
+        return
+
     def test_equality(self):
         """
         Two URLs decoded using L{URL.from_text} will be equal (C{==}) if they

Review thread on test_roundtrip_double_iri:

- "This looks like a place where it might be interesting to drop in hypothesis (since it was in fact hypothesis that caught this bug, in txacme's test suite)"
- "yeah, i'm down for some hypothesis in the future. i played with it a bit on this PR, but haven't delved deep enough yet."
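A rough sketch of what such a hypothesis-based check might look like as a follow-up (not part of this PR; it assumes hypothesis is installed and that URL and TestCase are imported as in the existing test module, and the class name is hypothetical):

import string

from hypothesis import given, strategies as st

# Unreserved characters plus a literal '%', so generated paths contain a mix
# of well-formed and malformed percent-escapes.
_segment_alphabet = string.ascii_letters + string.digits + u'-._~%'


class ToIRIIdempotenceTests(TestCase):

    @given(st.text(alphabet=_segment_alphabet, max_size=24))
    def test_to_iri_idempotent(self, segment):
        iri = URL.from_text(u'http://example.com/' + segment).to_iri()
        # Decoding for display a second time should change nothing.
        self.assertEqual(iri, iri.to_iri())
        self.assertEqual(iri.to_text(with_password=True),
                         iri.to_iri().to_text(with_password=True))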
@@ -245,7 +280,7 @@ def test_equality(self):
         self.assertNotEqual(
             urlpath,
             URL.from_text('ftp://www.anotherinvaliddomain.com/'
-                          'foo/bar/baz/?zot=21&zut')
+                          'foo/bar/baz/?zot=21&zut')
         )
 
     def test_fragmentEquality(self):
@@ -906,6 +941,13 @@ def test_netloc_slashes(self):
         url2 = url.replace(scheme='mailto')
         self.assertEquals(url2.to_text(), 'mailto:/path/to/heck')
 
+        url_text = 'unregisteredscheme:///a/b/c'
+        url = URL.from_text(url_text)
+        no_netloc_url = url.replace(uses_netloc=False)
+        self.assertEquals(no_netloc_url.to_text(), 'unregisteredscheme:/a/b/c')
+        netloc_url = url.replace(uses_netloc=True)
+        self.assertEquals(netloc_url.to_text(), url_text)
+
         return
 
     def test_wrong_constructor(self):
Review thread on _make_decode_map:

- "minor: I think you could simplify this by using the %x format tokens: (I don't know if there's a performance difference, or if this code is particularly perf-critical)"
- "This code is executed 4x at startup. Each time the loop happens <10 times. ;)"
- "It's shorter and easier to read, so the performance issue may be secondary :)"
- "In any case a follow-up PR to do this would be fine."
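A sketch of what that simplification could look like (a reviewer suggestion, not code from this PR; behavior is intended to match the _make_decode_map in the diff above):

def _make_decode_map(delims, allow_percent=False):
    ret = dict(_HEX_CHAR_MAP)
    if not allow_percent:
        delims = set(delims) | set([u'%'])
    for delim in delims:
        # '%02X' renders the code point as two uppercase hex digits,
        # replacing the hex()/zfill()/upper() chain.
        _hexord = ('%02X' % ord(delim)).encode('ascii')
        ret.pop(_hexord)
        # drop the lowercase spelling too, when it differs
        ret.pop(_hexord.lower(), None)
    return ret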