Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix incorrect quoting Link.url #7596

Merged
merged 5 commits into from
Feb 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions news/6446.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Correctly handle ``%2F`` in URL parameters to avoid accidentally unescape them
into ``/``.
81 changes: 61 additions & 20 deletions src/pip/_internal/index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import logging
import mimetypes
import os
import re
from collections import OrderedDict

from pip._vendor import html5lib, requests
Expand All @@ -17,7 +18,7 @@

from pip._internal.models.link import Link
from pip._internal.utils.filetypes import ARCHIVE_EXTENSIONS
from pip._internal.utils.misc import redact_auth_from_url
from pip._internal.utils.misc import pairwise, redact_auth_from_url
from pip._internal.utils.typing import MYPY_CHECK_RUNNING
from pip._internal.utils.urls import path_to_url, url_to_path
from pip._internal.vcs import is_url, vcs
Expand Down Expand Up @@ -186,29 +187,69 @@ def _determine_base_url(document, page_url):
return page_url


def _clean_url_path_part(part):
# type: (str) -> str
"""
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
return urllib_parse.quote(urllib_parse.unquote(part))


def _clean_file_url_path(part):
# type: (str) -> str
"""
Clean the first part of a URL path that corresponds to a local
filesystem path (i.e. the first part after splitting on "@" characters).
"""
# We unquote prior to quoting to make sure nothing is double quoted.
# Also, on Windows the path part might contain a drive letter which
# should not be quoted. On Linux where drive letters do not
# exist, the colon should be quoted. We rely on urllib.request
# to do the right thing here.
return urllib_request.pathname2url(urllib_request.url2pathname(part))


# percent-encoded: /
_reserved_chars_re = re.compile('(@|%2F)', re.IGNORECASE)


def _clean_url_path(path, is_local_path):
# type: (str, bool) -> str
"""
Clean the path portion of a URL.
"""
if is_local_path:
clean_func = _clean_file_url_path
else:
clean_func = _clean_url_path_part

# Split on the reserved characters prior to cleaning so that
# revision strings in VCS URLs are properly preserved.
parts = _reserved_chars_re.split(path)

cleaned_parts = []
for to_clean, reserved in pairwise(itertools.chain(parts, [''])):
cleaned_parts.append(clean_func(to_clean))
# Normalize %xx escapes (e.g. %2f -> %2F)
cleaned_parts.append(reserved.upper())

return ''.join(cleaned_parts)


def _clean_link(url):
# type: (str) -> str
"""Makes sure a link is fully encoded. That is, if a ' ' shows up in
the link, it will be rewritten to %20 (while not over-quoting
% or other characters)."""
"""
Make sure a link is fully quoted.
For example, if ' ' occurs in the URL, it will be replaced with "%20",
and without double-quoting other characters.
"""
# Split the URL into parts according to the general structure
# `scheme://netloc/path;parameters?query#fragment`. Note that the
# `netloc` can be empty and the URI will then refer to a local
# filesystem path.
# `scheme://netloc/path;parameters?query#fragment`.
result = urllib_parse.urlparse(url)
# In both cases below we unquote prior to quoting to make sure
# nothing is double quoted.
if result.netloc == "":
# On Windows the path part might contain a drive letter which
# should not be quoted. On Linux where drive letters do not
# exist, the colon should be quoted. We rely on urllib.request
# to do the right thing here.
path = urllib_request.pathname2url(
urllib_request.url2pathname(result.path))
else:
# In addition to the `/` character we protect `@` so that
# revision strings in VCS URLs are properly parsed.
path = urllib_parse.quote(urllib_parse.unquote(result.path), safe="/@")
# If the netloc is empty, then the URL refers to a local filesystem path.
is_local_path = not result.netloc
path = _clean_url_path(result.path, is_local_path=is_local_path)
return urllib_parse.urlunparse(result._replace(path=path))


Expand Down
16 changes: 14 additions & 2 deletions src/pip/_internal/utils/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
# why we ignore the type on this import.
from pip._vendor.retrying import retry # type: ignore
from pip._vendor.six import PY2, text_type
from pip._vendor.six.moves import input
from pip._vendor.six.moves import input, zip_longest
from pip._vendor.six.moves.urllib import parse as urllib_parse
from pip._vendor.six.moves.urllib.parse import unquote as urllib_unquote

Expand Down Expand Up @@ -52,7 +52,7 @@

if MYPY_CHECK_RUNNING:
from typing import (
Any, AnyStr, Container, Iterable, List, Optional, Text,
Any, AnyStr, Container, Iterable, Iterator, List, Optional, Text,
Tuple, Union,
)
from pip._vendor.pkg_resources import Distribution
Expand Down Expand Up @@ -884,3 +884,15 @@ def is_wheel_installed():
return False

return True


def pairwise(iterable):
# type: (Iterable[Any]) -> Iterator[Tuple[Any, Any]]
"""
Return paired elements.

For example:
s -> (s0, s1), (s2, s3), (s4, s5), ...
"""
iterable = iter(iterable)
return zip_longest(iterable, iterable)
88 changes: 87 additions & 1 deletion tests/unit/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from pip._internal.index.collector import (
HTMLPage,
_clean_link,
_clean_url_path,
_determine_base_url,
_get_html_page,
_get_html_response,
Expand Down Expand Up @@ -191,6 +192,69 @@ def test_determine_base_url(html, url, expected):
assert _determine_base_url(document, url) == expected


@pytest.mark.parametrize(
('path', 'expected'),
[
# Test a character that needs quoting.
('a b', 'a%20b'),
# Test an unquoted "@".
('a @ b', 'a%20@%20b'),
# Test multiple unquoted "@".
('a @ @ b', 'a%20@%20@%20b'),
# Test a quoted "@".
('a %40 b', 'a%20%40%20b'),
# Test a quoted "@" before an unquoted "@".
('a %40b@ c', 'a%20%40b@%20c'),
# Test a quoted "@" after an unquoted "@".
('a @b%40 c', 'a%20@b%40%20c'),
# Test alternating quoted and unquoted "@".
('a %40@b %40@c %40', 'a%20%40@b%20%40@c%20%40'),
# Test an unquoted "/".
('a / b', 'a%20/%20b'),
# Test multiple unquoted "/".
('a / / b', 'a%20/%20/%20b'),
# Test a quoted "/".
('a %2F b', 'a%20%2F%20b'),
# Test a quoted "/" before an unquoted "/".
('a %2Fb/ c', 'a%20%2Fb/%20c'),
# Test a quoted "/" after an unquoted "/".
('a /b%2F c', 'a%20/b%2F%20c'),
# Test alternating quoted and unquoted "/".
('a %2F/b %2F/c %2F', 'a%20%2F/b%20%2F/c%20%2F'),
# Test normalizing non-reserved quoted characters "[" and "]"
('a %5b %5d b', 'a%20%5B%20%5D%20b'),
# Test normalizing a reserved quoted "/"
('a %2f b', 'a%20%2F%20b'),
]
)
@pytest.mark.parametrize('is_local_path', [True, False])
def test_clean_url_path(path, expected, is_local_path):
assert _clean_url_path(path, is_local_path=is_local_path) == expected


@pytest.mark.parametrize(
('path', 'expected'),
[
# Test a VCS path with a Windows drive letter and revision.
pytest.param(
'/T:/with space/repo.git@1.0',
'///T:/with%20space/repo.git@1.0',
marks=pytest.mark.skipif("sys.platform != 'win32'"),
),
# Test a VCS path with a Windows drive letter and revision,
# running on non-windows platform.
pytest.param(
'/T:/with space/repo.git@1.0',
'/T%3A/with%20space/repo.git@1.0',
marks=pytest.mark.skipif("sys.platform == 'win32'"),
),
]
)
def test_clean_url_path_with_local_path(path, expected):
actual = _clean_url_path(path, is_local_path=True)
assert actual == expected


@pytest.mark.parametrize(
("url", "clean_url"),
[
Expand Down Expand Up @@ -218,9 +282,18 @@ def test_determine_base_url(html, url, expected):
# not. The `:` should be quoted.
("https://localhost.localdomain/T:/path/",
"https://localhost.localdomain/T%3A/path/"),
# URL with a quoted "/" in the path portion.
("https://example.com/access%2Ftoken/path/",
"https://example.com/access%2Ftoken/path/"),
# VCS URL containing revision string.
("git+ssh://example.com/path to/repo.git@1.0#egg=my-package-1.0",
"git+ssh://example.com/path%20to/repo.git@1.0#egg=my-package-1.0"),
# VCS URL with a quoted "#" in the revision string.
("git+https://example.com/repo.git@hash%23symbol#egg=my-package-1.0",
"git+https://example.com/repo.git@hash%23symbol#egg=my-package-1.0"),
# VCS URL with a quoted "@" in the revision string.
("git+https://example.com/repo.git@at%40 space#egg=my-package-1.0",
"git+https://example.com/repo.git@at%40%20space#egg=my-package-1.0"),
uranusjr marked this conversation as resolved.
Show resolved Hide resolved
# URL with Windows drive letter. The `:` after the drive
# letter should not be quoted. The trailing `/` should be
# removed.
Expand All @@ -236,10 +309,23 @@ def test_determine_base_url(html, url, expected):
"file:///T%3A/path/with%20spaces/",
marks=pytest.mark.skipif("sys.platform == 'win32'"),
),
# Test a VCS URL with a Windows drive letter and revision.
pytest.param(
"git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0",
"git+file:///T:/with%20space/repo.git@1.0#egg=my-package-1.0",
marks=pytest.mark.skipif("sys.platform != 'win32'"),
),
# Test a VCS URL with a Windows drive letter and revision,
# running on non-windows platform.
pytest.param(
"git+file:///T:/with space/repo.git@1.0#egg=my-package-1.0",
"git+file:/T%3A/with%20space/repo.git@1.0#egg=my-package-1.0",
marks=pytest.mark.skipif("sys.platform == 'win32'"),
),
]
)
def test_clean_link(url, clean_url):
assert(_clean_link(url) == clean_url)
assert _clean_link(url) == clean_url


@pytest.mark.parametrize('anchor_html, expected', [
Expand Down