Refactor _get_html_page() to use exceptions for flow control #5838

Merged · 14 commits · Oct 11, 2018
Changes from all commits
news/5838.bugfix: 1 change (1 addition, 0 deletions)
@@ -0,0 +1 @@
Fix content type detection if a directory named like an archive is used as a package source.
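
The failure mode is easiest to see with a concrete repro. A hedged sketch (the directory and package names are invented): previously, a local package source whose directory name ends in an archive extension was probed for its Content-Type with a HEAD request, which cannot succeed for a file:// URL, so the source was silently skipped; with this change the directory is recognised first and its index is served as HTML.

    # Repro sketch -- 'links.tar.gz' is a directory, despite the name.
    import os
    import subprocess

    os.makedirs('links.tar.gz', exist_ok=True)
    # Drop some wheels/sdists into the directory, then point pip at it:
    subprocess.call([
        'pip', 'install', '--no-index',
        '--find-links', 'file://' + os.path.abspath('links.tar.gz'),
        'example-package',  # hypothetical distribution name
    ])
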
src/pip/_internal/index.py: 210 changes (128 additions, 82 deletions)
@@ -59,18 +59,113 @@
 logger = logging.getLogger(__name__)


-def _get_content_type(url, session):
-    """Get the Content-Type of the given url, using a HEAD request"""
+def _match_vcs_scheme(url):
+    """Look for VCS schemes in the URL.
+
+    Returns the matched VCS scheme, or None if there's no match.
+    """
+    from pip._internal.vcs import VcsSupport
+    for scheme in VcsSupport.schemes:
+        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
+            return scheme
+    return None
+
+
+def _is_url_like_archive(url):
+    """Return whether the URL looks like an archive.
+    """
+    filename = Link(url).filename
+    for bad_ext in ARCHIVE_EXTENSIONS:
+        if filename.endswith(bad_ext):
+            return True
+    return False
+
+
+class _NotHTML(Exception):
+    def __init__(self, content_type, request_desc):
+        super(_NotHTML, self).__init__(content_type, request_desc)
+        self.content_type = content_type
+        self.request_desc = request_desc
+
+
+def _ensure_html_header(response):
+    """Check the Content-Type header to ensure the response contains HTML.
+
+    Raises `_NotHTML` if the content type is not text/html.
+    """
+    content_type = response.headers.get("Content-Type", "")
+    if not content_type.lower().startswith("text/html"):
+        raise _NotHTML(content_type, response.request.method)
+
+
+class _NotHTTP(Exception):
+    pass
+
+
+def _ensure_html_response(url, session):
+    """Send a HEAD request to the URL, and ensure the response contains HTML.
+
+    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
+    `_NotHTML` if the content type is not text/html.
+    """
     scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
     if scheme not in {'http', 'https'}:
-        # FIXME: some warning or something?
-        # assertion error?
-        return ''
+        raise _NotHTTP()

     resp = session.head(url, allow_redirects=True)
     resp.raise_for_status()

-    return resp.headers.get("Content-Type", "")
+    _ensure_html_header(resp)
+
+
+def _get_html_response(url, session):
+    """Access an HTML page with GET, and return the response.
+
+    This consists of three parts:
+
+    1. If the URL looks suspiciously like an archive, send a HEAD first to
+       check the Content-Type is HTML, to avoid downloading a large file.
+       Raise `_NotHTTP` if the content type cannot be determined, or
+       `_NotHTML` if it is not HTML.
+    2. Actually perform the request. Raise HTTP exceptions on network failures.
+    3. Check the Content-Type header to make sure we got HTML, and raise
+       `_NotHTML` otherwise.
+    """
+    if _is_url_like_archive(url):
+        _ensure_html_response(url, session=session)
+
+    logger.debug('Getting page %s', url)
+
+    resp = session.get(
+        url,
+        headers={
+            "Accept": "text/html",
+            # We don't want to blindly return cached data for
+            # /simple/, because authors generally expect that
+            # twine upload && pip install will function, but if
+            # they've done a pip install in the last ~10 minutes
+            # it won't. Thus by setting this to zero we will not
+            # blindly use any cached data, however the benefit of
+            # using max-age=0 instead of no-cache, is that we will
+            # still support conditional requests, so we will still
+            # minimize traffic sent in cases where the page hasn't
+            # changed at all, we will just always incur the round
+            # trip for the conditional GET now instead of only
+            # once per 10 minutes.
+            # For more information, please see pypa/pip#5670.
+            "Cache-Control": "max-age=0",
+        },
+    )
+    resp.raise_for_status()
+
+    # The check for archives above only works if the url ends with
+    # something that looks like an archive. However that is not a
+    # requirement of an url. Unless we issue a HEAD request on every
+    # url we cannot know ahead of time for sure if something is HTML
+    # or not. However we can check after we've downloaded it.
+    _ensure_html_header(resp)
+
+    return resp


 def _handle_get_page_fail(link, reason, url, meth=None):
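
The hunk above replaces the old return-an-empty-string sentinel with exceptions, so every caller either gets a usable response or a typed failure. A minimal self-contained sketch of the same HEAD-then-GET pattern (plain requests, invented names; this is not pip's API):

    import requests

    ARCHIVE_EXTENSIONS = ('.tar.gz', '.zip', '.whl')  # abbreviated list

    class NotHTML(Exception):
        pass

    def ensure_html(resp):
        content_type = resp.headers.get('Content-Type', '')
        if not content_type.lower().startswith('text/html'):
            raise NotHTML(content_type)

    def fetch_index(url, session):
        # 1. Archive-looking URL: probe cheaply with HEAD before downloading.
        if url.endswith(ARCHIVE_EXTENSIONS):
            head = session.head(url, allow_redirects=True)
            head.raise_for_status()
            ensure_html(head)  # raises instead of returning a sentinel
        # 2. Perform the real GET.
        resp = session.get(url, headers={'Accept': 'text/html'})
        resp.raise_for_status()
        # 3. Re-check the Content-Type on the actual response.
        ensure_html(resp)
        return resp

    try:
        page = fetch_index('https://pypi.org/simple/pip/', requests.Session())
    except NotHTML as exc:
        print('skipping: got Content-Type %s' % exc)
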
@@ -85,82 +180,36 @@ def _get_html_page(link, session=None):
"_get_html_page() missing 1 required keyword argument: 'session'"
)

url = link.url
url = url.split('#', 1)[0]
url = link.url.split('#', 1)[0]

# Check for VCS schemes that do not support lookup as web pages.
from pip._internal.vcs import VcsSupport
for scheme in VcsSupport.schemes:
if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
logger.debug('Cannot look at %s URL %s', scheme, link)
return None
vcs_scheme = _match_vcs_scheme(url)
if vcs_scheme:
logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
return None

try:
filename = link.filename
for bad_ext in ARCHIVE_EXTENSIONS:
if filename.endswith(bad_ext):
content_type = _get_content_type(url, session=session)
if content_type.lower().startswith('text/html'):
break
else:
logger.debug(
'Skipping page %s because of Content-Type: %s',
link,
content_type,
)
return
# Tack index.html onto file:// URLs that point to directories
scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
# add trailing slash if not present so urljoin doesn't trim
# final segment
if not url.endswith('/'):
url += '/'
url = urllib_parse.urljoin(url, 'index.html')
logger.debug(' file: URL is directory, getting %s', url)

logger.debug('Getting page %s', url)

# Tack index.html onto file:// URLs that point to directories
(scheme, netloc, path, params, query, fragment) = \
urllib_parse.urlparse(url)
if (scheme == 'file' and
os.path.isdir(urllib_request.url2pathname(path))):
# add trailing slash if not present so urljoin doesn't trim
# final segment
if not url.endswith('/'):
url += '/'
url = urllib_parse.urljoin(url, 'index.html')
logger.debug(' file: URL is directory, getting %s', url)

resp = session.get(
url,
headers={
"Accept": "text/html",
# We don't want to blindly returned cached data for
# /simple/, because authors generally expecting that
# twine upload && pip install will function, but if
# they've done a pip install in the last ~10 minutes
# it won't. Thus by setting this to zero we will not
# blindly use any cached data, however the benefit of
# using max-age=0 instead of no-cache, is that we will
# still support conditional requests, so we will still
# minimize traffic sent in cases where the page hasn't
# changed at all, we will just always incur the round
# trip for the conditional GET now instead of only
# once per 10 minutes.
# For more information, please see pypa/pip#5670.
"Cache-Control": "max-age=0",
},
try:
resp = _get_html_response(url, session=session)
except _NotHTTP as exc:
logger.debug(
'Skipping page %s because it looks like an archive, and cannot '
'be checked by HEAD.', link,
)
except _NotHTML as exc:
logger.debug(
'Skipping page %s because the %s request got Content-Type: %s',
link, exc.request_desc, exc.content_type,
)
resp.raise_for_status()

# The check for archives above only works if the url ends with
# something that looks like an archive. However that is not a
# requirement of an url. Unless we issue a HEAD request on every
# url we cannot know ahead of time for sure if something is HTML
# or not. However we can check after we've downloaded it.
content_type = resp.headers.get('Content-Type', 'unknown')
if not content_type.lower().startswith("text/html"):
logger.debug(
'Skipping page %s because of Content-Type: %s',
link,
content_type,
)
return

inst = HTMLPage(resp.content, resp.url, resp.headers)
except requests.HTTPError as exc:
_handle_get_page_fail(link, exc, url)
except SSLError as exc:
Expand All @@ -172,7 +221,7 @@ def _get_html_page(link, session=None):
     except requests.Timeout:
         _handle_get_page_fail(link, "timed out", url)
     else:
-        return inst
+        return HTMLPage(resp.content, resp.url, resp.headers)


 class PackageFinder(object):
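
The Cache-Control: max-age=0 header retained in _get_html_response forces revalidation without giving up conditional requests, which is the trade-off the long comment above describes. A rough illustration (the URL is real; whether an ETag is sent depends on the server):

    import requests

    url = 'https://pypi.org/simple/pip/'
    first = requests.get(url, headers={'Cache-Control': 'max-age=0'})
    etag = first.headers.get('ETag')
    if etag:
        # Conditional GET: a 304 Not Modified reply carries no body, so the
        # revalidation round trip is cheap when the page hasn't changed.
        again = requests.get(url, headers={'If-None-Match': etag})
        print(again.status_code)  # 304 if unchanged
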
@@ -672,7 +721,7 @@ def _get_pages(self, locations, project_name):
                 continue
             seen.add(location)

-            page = self._get_page(location)
+            page = _get_html_page(location, session=self.session)
             if page is None:
                 continue

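The call site above passes session explicitly because _get_html_page requires it as a keyword argument at runtime, a Python 2-compatible stand-in for keyword-only syntax. A sketch of the pattern with an invented name:

    def fetch(link, session=None):
        # Python 2 has no `def fetch(link, *, session)` syntax, so the
        # keyword-only requirement is enforced by hand.
        if session is None:
            raise TypeError(
                "fetch() missing 1 required keyword argument: 'session'"
            )
        return session.get(link)
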
@@ -789,9 +838,6 @@ def _link_package_versions(self, link, search):

         return InstallationCandidate(search.supplied, version, link)

-    def _get_page(self, link):
-        return _get_html_page(link, session=self.session)
-

 def egg_info_matches(
         egg_info, search_name, link,
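
A side benefit of routing the control flow through _NotHTTP/_NotHTML is that the checks become unit-testable without any network traffic. A hypothetical test, not part of this PR (the mock shape is an assumption):

    import pytest
    from mock import Mock  # unittest.mock on Python 3

    from pip._internal.index import _NotHTML, _ensure_html_header

    def test_ensure_html_header_rejects_archives():
        response = Mock()
        response.headers = {'Content-Type': 'application/x-gzip'}
        response.request.method = 'HEAD'
        with pytest.raises(_NotHTML) as ctx:
            _ensure_html_header(response)
        assert ctx.value.content_type == 'application/x-gzip'
        assert ctx.value.request_desc == 'HEAD'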