-
Notifications
You must be signed in to change notification settings - Fork 3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Gracefully fall back to html5lib for parsing non-compliant index pages
This reworks the HTML parsing logic to gracefully fall back to `html5lib` on non-compliant HTML 5 documents. Falling back with a deprecation warning softens the failure mode for users of commercial package index solutions that do not follow the requisite standards and serve malformed HTML documents.
- Loading branch information
Showing
2 changed files
with
43 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,6 +38,7 @@ | |
from pip._internal.models.search_scope import SearchScope | ||
from pip._internal.network.session import PipSession | ||
from pip._internal.network.utils import raise_for_status | ||
from pip._internal.utils.deprecation import deprecated | ||
from pip._internal.utils.filetypes import is_archive_file | ||
from pip._internal.utils.misc import pairwise, redact_auth_from_url | ||
from pip._internal.vcs import vcs | ||
|
@@ -342,12 +343,34 @@ def parse_links(page: "HTMLPage", use_deprecated_html5lib: bool) -> Iterable[Lin | |
""" | ||
Parse an HTML document, and yield its anchor elements as Link objects. | ||
""" | ||
encoding = page.encoding or "utf-8" | ||
|
||
# Check if the page starts with a valid doctype, to decide whether to use | ||
    # html.parser or (deprecated) html5lib for parsing -- unless explicitly | ||
# requested to use html5lib. | ||
if not use_deprecated_html5lib: | ||
expected_doctype = "<!doctype html>".encode(encoding) | ||
actual_start = page.content[: len(expected_doctype)] | ||
if actual_start.decode(encoding).lower() != "<!doctype html>": | ||
This comment has been minimized.
Sorry, something went wrong.
This comment has been minimized.
Sorry, something went wrong.
pfmoore
Member
|
||
deprecated( | ||
reason=( | ||
f"The HTML index page being used ({page.url}) is not a proper " | ||
"HTML 5 document. This is in violation of PEP 503 which requires " | ||
"these pages to be well-formed HTML 5 documents. Please reach out " | ||
"to the owners of this index page, and ask them to update this " | ||
"index page to a valid HTML 5 document." | ||
), | ||
replacement=None, | ||
gone_in="22.2", | ||
issue=10825, | ||
) | ||
use_deprecated_html5lib = True | ||
|
||
if use_deprecated_html5lib: | ||
yield from _parse_links_html5lib(page) | ||
return | ||
|
||
parser = HTMLLinkParser() | ||
encoding = page.encoding or "utf-8" | ||
parser.feed(page.content.decode(encoding)) | ||
|
||
url = page.url | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
These three lines are not a correct check. The HTML5 spec says an HTML5 doc can start with an optional BOM, "Any number of comments and ASCII whitespace," and then the DOCTYPE, which can have "One or more ASCII whitespace," whereas this check is WAY more strict, so it will incorrectly say things are not valid that are valid. Why not just use a `try..except` around the `parser.feed(page.content.decode(encoding))` call to let `html.parser.HTMLParser` get the declaration for you, and only fall back on exception?

BTW, `HTMLLinkParser.handle_decl()` in this file is also way too strict. I think this is the correct code to use:
If you agree, I can create a pull request with those suggested changes.