Skip to content

Fix percent decoding and normalization #23

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jul 3, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 80 additions & 22 deletions hyperlink/_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@

import re
import string

import socket
from unicodedata import normalize
try:
from socket import inet_pton
except ImportError:
Expand Down Expand Up @@ -52,13 +52,6 @@ def inet_pton(address_family, ip_string):
raise socket.error('unknown address family')


try:
from urllib import unquote as urlunquote
except ImportError:
from urllib.parse import unquote_to_bytes as urlunquote

from unicodedata import normalize

unicode = type(u'')
try:
unichr
Expand Down Expand Up @@ -141,7 +134,6 @@ def __nonzero__(self):
for a in string.hexdigits for b in string.hexdigits])
_ASCII_RE = re.compile('([\x00-\x7f]+)')


# RFC 3986 section 2.2, Reserved Characters
# https://tools.ietf.org/html/rfc3986#section-2.2
_GEN_DELIMS = frozenset(u':/?#[]@')
Expand All @@ -160,6 +152,19 @@ def __nonzero__(self):
_QUERY_DELIMS = _ALL_DELIMS - _QUERY_SAFE


def _make_decode_map(delims, allow_percent=False):
    """Build a percent-decoding map for one URL field: a copy of
    ``_HEX_CHAR_MAP`` with the hex escapes of *delims* removed, so that
    those delimiters remain percent-encoded after decoding.

    Args:
        delims: Iterable of delimiter characters that must NOT be
            decoded for this field (e.g. ``_PATH_DELIMS``).
        allow_percent (bool): When false (the default), ``%`` itself is
            also kept encoded, so decoding can never manufacture new
            escape sequences.

    Returns:
        dict: Mapping of two-character hex byte pairs to decoded bytes,
        with both upper- and lowercase escapes of *delims* removed.
    """
    ret = dict(_HEX_CHAR_MAP)
    if not allow_percent:
        delims = set(delims) | set([u'%'])
    for delim in delims:
        # format() yields the zero-padded uppercase hex pair directly;
        # equivalent to hex(ord(delim))[2:].zfill(2).upper() but simpler.
        _hexord = format(ord(delim), '02X').encode('ascii')
        _hexord_lower = _hexord.lower()
        ret.pop(_hexord)
        if _hexord != _hexord_lower:
            ret.pop(_hexord_lower)
    return ret


def _make_quote_map(safe_chars):
ret = {}
# v is included in the dict for py3 mostly, because bytestrings
Expand All @@ -174,10 +179,14 @@ def _make_quote_map(safe_chars):


_USERINFO_PART_QUOTE_MAP = _make_quote_map(_USERINFO_SAFE)
_USERINFO_DECODE_MAP = _make_decode_map(_USERINFO_DELIMS)
_PATH_PART_QUOTE_MAP = _make_quote_map(_PATH_SAFE)
_SCHEMELESS_PATH_PART_QUOTE_MAP = _make_quote_map(_SCHEMELESS_PATH_SAFE)
_PATH_DECODE_MAP = _make_decode_map(_PATH_DELIMS)
_QUERY_PART_QUOTE_MAP = _make_quote_map(_QUERY_SAFE)
_QUERY_DECODE_MAP = _make_decode_map(_QUERY_DELIMS)
_FRAGMENT_QUOTE_MAP = _make_quote_map(_FRAGMENT_SAFE)
_FRAGMENT_DECODE_MAP = _make_decode_map(_FRAGMENT_DELIMS)

_ROOT_PATHS = frozenset(((), (u'',)))

Expand Down Expand Up @@ -276,6 +285,7 @@ def _encode_userinfo_part(text, maximal=True):
else t for t in text])



# This port list painstakingly curated by hand searching through
# https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
# and
Expand Down Expand Up @@ -413,23 +423,68 @@ def _textcheck(name, value, delims=frozenset(), nullable=False):
return value


def _percent_decode(text):
"""
Replace percent-encoded characters with their UTF-8 equivalents.
def _decode_userinfo_part(text):
    """Percent-decode *text* as a userinfo (user:password) field,
    keeping escaped userinfo delimiters (and ``%``) percent-encoded."""
    return _percent_decode(text, _decode_map=_USERINFO_DECODE_MAP)


def _decode_path_part(text):
    """Percent-decode *text* as a path segment, keeping escaped path
    delimiters (and ``%``) percent-encoded."""
    return _percent_decode(text, _decode_map=_PATH_DECODE_MAP)


def _decode_query_part(text):
    """Percent-decode *text* as a query key or value, keeping escaped
    query delimiters (and ``%``) percent-encoded."""
    return _percent_decode(text, _decode_map=_QUERY_DECODE_MAP)


def _decode_fragment_part(text):
    """Percent-decode *text* as a fragment, keeping escaped fragment
    delimiters (and ``%``) percent-encoded."""
    return _percent_decode(text, _decode_map=_FRAGMENT_DECODE_MAP)


def _percent_decode(text, _decode_map=_HEX_CHAR_MAP):
    """Convert percent-encoded text characters to their normal,
    human-readable equivalents.

    All characters in the input text must be valid ASCII. All special
    characters underlying the values in the percent-encoding must be
    valid UTF-8.

    Only called by field-tailored variants, e.g.,
    :func:`_decode_path_part`, as every percent-encodable part of the
    URL has characters which should not be percent decoded.

    >>> _percent_decode(u'abc%20def')
    u'abc def'

    Args:
        text (unicode): The ASCII text with percent-encoding present.
        _decode_map (dict): Mapping of two-character hex pairs to
            decoded bytes; escapes absent from the map are left intact.

    Returns:
        unicode: The percent-decoded version of *text*, with UTF-8
            decoding applied.
    """
    try:
        quoted_bytes = text.encode("ascii")
    except UnicodeEncodeError:
        # Non-ASCII input cannot carry valid percent-encoding; pass through.
        return text

    bits = quoted_bytes.split(b'%')
    if len(bits) == 1:
        # No percent sign at all -- nothing to decode.
        return text

    res = [bits[0]]
    append = res.append

    for item in bits[1:]:
        try:
            append(_decode_map[item[:2]])
            append(item[2:])
        except KeyError:
            # Unknown, reserved, or truncated escape: keep the '%' and
            # the following characters literally, undecoded.
            append(b'%')
            append(item)

    unquoted_bytes = b''.join(res)

    try:
        return unquoted_bytes.decode("utf-8")
    except UnicodeDecodeError:
        # Decoded bytes were not valid UTF-8; return the original text.
        return text

Expand Down Expand Up @@ -819,7 +874,8 @@ def absolute(self):
return bool(self.scheme and self.host)

def replace(self, scheme=_UNSET, host=_UNSET, path=_UNSET, query=_UNSET,
fragment=_UNSET, port=_UNSET, rooted=_UNSET, userinfo=_UNSET):
fragment=_UNSET, port=_UNSET, rooted=_UNSET, userinfo=_UNSET,
family=_UNSET, uses_netloc=_UNSET):
""":class:`URL` objects are immutable, which means that attributes
are designed to be set only once, at construction. Instead of
modifying an existing URL, one simply creates a copy with the
Expand Down Expand Up @@ -861,6 +917,8 @@ def replace(self, scheme=_UNSET, host=_UNSET, path=_UNSET, query=_UNSET,
port=_optional(port, self.port),
rooted=_optional(rooted, self.rooted),
userinfo=_optional(userinfo, self.userinfo),
family=_optional(family, self.family),
uses_netloc=_optional(uses_netloc, self.uses_netloc)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems incorrect. You shouldn't be able to specify uses_netloc to the constructor at all, it's entirely a function of the scheme; adding it to replace seems to be propagating the error.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Confirming this, deleting these added lines doesn't make any tests fail; if it should be here at all, it shouldn't be in this change.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Functionally, uses_netloc is there because for unrecognized schemes parsed using from_text, URL will persist whether or not the original URL used the netloc slashes.

Logistically, I can't remember if I intended to include it in this PR or if I just saw the big regression of not passing through family and uses_netloc -- both in the constructor and the docstring of replace, but missing in the function signature itself -- and fixed it post haste.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's worth working, it's worth testing :-). Urgency notwithstanding, it seems like an unrelated change.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, the branch started out as seeking to normalize decoding in general, but it became pretty percent-decoding centric, as idna improvements will come in another PR.

I've been brainstorming other ways to go with some of these latter-day arguments, but for now I think I prefer bringing things into sync, so I've added a test.

)

@classmethod
Expand Down Expand Up @@ -1095,7 +1153,7 @@ def to_iri(self):
URL: A new instance with its path segments, query parameters, and
hostname decoded for display purposes.
"""
new_userinfo = u':'.join([_percent_decode(p) for p in
new_userinfo = u':'.join([_decode_userinfo_part(p) for p in
self.userinfo.split(':', 1)])
try:
asciiHost = self.host.encode("ascii")
Expand All @@ -1109,13 +1167,13 @@ def to_iri(self):
textHost = self.host
return self.replace(userinfo=new_userinfo,
host=textHost,
path=[_percent_decode(segment)
path=[_decode_path_part(segment)
for segment in self.path],
query=[tuple(_percent_decode(x)
query=[tuple(_decode_query_part(x)
if x is not None else None
for x in (k, v))
for k, v in self.query],
fragment=_percent_decode(self.fragment))
fragment=_decode_fragment_part(self.fragment))

def to_text(self, with_password=False):
"""Render this URL to its textual representation.
Expand Down
46 changes: 44 additions & 2 deletions hyperlink/test/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,29 @@
'20Linux%20precise-5.7.1.iso&tr=udp://tracker.openbittorrent.com:80&'
'tr=udp://tracker.publicbt.com:80&tr=udp://tracker.istole.it:6969&'
'tr=udp://tracker.ccc.de:80&tr=udp://open.demonii.com:1337'),

# percent-encoded delimiters in percent-encodable fields

'https://%3A@example.com/', # colon in username
'https://%40@example.com/', # at sign in username
'https://%2f@example.com/', # slash in username
'https://a:%3a@example.com/', # colon in password
'https://a:%40@example.com/', # at sign in password
'https://a:%2f@example.com/', # slash in password
'https://a:%3f@example.com/', # question mark in password
'https://example.com/%2F/', # slash in path
'https://example.com/%3F/', # question mark in path
'https://example.com/%23/', # hash in path
'https://example.com/?%23=b', # hash in query param name
'https://example.com/?%3D=b', # equals in query param name
'https://example.com/?%26=b', # ampersand in query param name
'https://example.com/?a=%23', # hash in query param value
'https://example.com/?a=%26', # ampersand in query param value
'https://example.com/?a=%3D', # equals in query param value
# double-encoded percent sign in all percent-encodable positions:
"http://(%2525):(%2525)@example.com/(%2525)/?(%2525)=(%2525)#(%2525)",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

# colon in first part of schemeless relative url
'first_seg_rel_path__colon%3Anotok/second_seg__colon%3Aok',
)


Expand Down Expand Up @@ -231,9 +254,21 @@ def test_roundtrip(self):
L{URL.to_text} should invert L{URL.from_text}.
"""
for test in ROUNDTRIP_TESTS:
result = URL.from_text(test).to_text()
result = URL.from_text(test).to_text(with_password=True)
self.assertEqual(test, result)

def test_roundtrip_double_iri(self):
    """
    L{URL.to_iri} should be idempotent: converting an already-converted
    IRI again must yield an equal URL and identical text.
    """
    for test in ROUNDTRIP_TESTS:
        url = URL.from_text(test)
        iri = url.to_iri()
        double_iri = iri.to_iri()
        assert iri == double_iri

        # Compare rendered text as well, including the password part.
        iri_text = iri.to_text(with_password=True)
        double_iri_text = double_iri.to_text(with_password=True)
        assert iri_text == double_iri_text
    return

def test_equality(self):
"""
Two URLs decoded using L{URL.from_text} will be equal (C{==}) if they
Expand All @@ -245,7 +280,7 @@ def test_equality(self):
self.assertNotEqual(
urlpath,
URL.from_text('ftp://www.anotherinvaliddomain.com/'
'foo/bar/baz/?zot=21&zut')
'foo/bar/baz/?zot=21&zut')
)

def test_fragmentEquality(self):
Expand Down Expand Up @@ -906,6 +941,13 @@ def test_netloc_slashes(self):
url2 = url.replace(scheme='mailto')
self.assertEquals(url2.to_text(), 'mailto:/path/to/heck')

url_text = 'unregisteredscheme:///a/b/c'
url = URL.from_text(url_text)
no_netloc_url = url.replace(uses_netloc=False)
self.assertEquals(no_netloc_url.to_text(), 'unregisteredscheme:/a/b/c')
netloc_url = url.replace(uses_netloc=True)
self.assertEquals(netloc_url.to_text(), url_text)

return

def test_wrong_constructor(self):
Expand Down