Skip to content

Commit 73a4359

Browse files
committed
#15114: the strict mode and argument of HTMLParser, HTMLParser.error, and the HTMLParseError exception have been removed.
1 parent ffff144 commit 73a4359

File tree

4 files changed

+23
-197
lines changed

4 files changed

+23
-197
lines changed

Doc/library/html.parser.rst

+3-39
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,16 @@
1616
This module defines a class :class:`HTMLParser` which serves as the basis for
1717
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
1818

19-
.. class:: HTMLParser(strict=False, *, convert_charrefs=False)
19+
.. class:: HTMLParser(*, convert_charrefs=False)
2020

21-
Create a parser instance.
21+
Create a parser instance able to parse invalid markup.
2222

2323
If *convert_charrefs* is ``True`` (default: ``False``), all character
2424
references (except the ones in ``script``/``style`` elements) are
2525
automatically converted to the corresponding Unicode characters.
2626
The use of ``convert_charrefs=True`` is encouraged and will become
2727
the default in Python 3.5.
2828

29-
If *strict* is ``False`` (the default), the parser will accept and parse
30-
invalid markup. If *strict* is ``True`` the parser will raise an
31-
:exc:`~html.parser.HTMLParseError` exception instead [#]_ when it's not
32-
able to parse the markup. The use of ``strict=True`` is discouraged and
33-
the *strict* argument is deprecated.
34-
3529
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
3630
when start tags, end tags, text, comments, and other markup elements are
3731
encountered. The user should subclass :class:`.HTMLParser` and override its
@@ -40,32 +34,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
4034
This parser does not check that end tags match start tags or call the end-tag
4135
handler for elements which are closed implicitly by closing an outer element.
4236

43-
.. versionchanged:: 3.2
44-
*strict* argument added.
45-
46-
.. deprecated-removed:: 3.3 3.5
47-
The *strict* argument and the strict mode have been deprecated.
48-
The parser is now able to accept and parse invalid markup too.
49-
5037
.. versionchanged:: 3.4
5138
*convert_charrefs* keyword argument added.
5239

53-
An exception is defined as well:
54-
55-
56-
.. exception:: HTMLParseError
57-
58-
Exception raised by the :class:`HTMLParser` class when it encounters an error
59-
while parsing and *strict* is ``True``. This exception provides three
60-
attributes: :attr:`msg` is a brief message explaining the error,
61-
:attr:`lineno` is the number of the line on which the broken construct was
62-
detected, and :attr:`offset` is the number of characters into the line at
63-
which the construct starts.
64-
65-
.. deprecated-removed:: 3.3 3.5
66-
This exception has been deprecated because it's never raised by the parser
67-
(when the default non-strict mode is used).
68-
6940

7041
Example HTML Parser Application
7142
-------------------------------
@@ -246,8 +217,7 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
246217

247218
The *data* parameter will be the entire contents of the declaration inside
248219
the ``<![...]>`` markup. It is sometimes useful to override this method in a
249-
derived class. The base class implementation raises an :exc:`HTMLParseError`
250-
when *strict* is ``True``.
220+
derived class. The base class implementation does nothing.
251221

252222

253223
.. _htmlparser-examples:
@@ -358,9 +328,3 @@ Parsing invalid HTML (e.g. unquoted attributes) also works::
358328
Data : tag soup
359329
End tag : p
360330
End tag : a
361-
362-
.. rubric:: Footnotes
363-
364-
.. [#] For backward compatibility reasons *strict* mode does not raise
365-
exceptions for all non-compliant HTML. That is, some invalid HTML
366-
is tolerated even in *strict* mode.

Lib/html/parser.py

+12-94
Original file line numberDiff line numberDiff line change
@@ -29,35 +29,15 @@
2929
piclose = re.compile('>')
3030
commentclose = re.compile(r'--\s*>')
3131
# Note:
32-
# 1) the strict attrfind isn't really strict, but we can't make it
33-
# correctly strict without breaking backward compatibility;
34-
# 2) if you change tagfind/attrfind remember to update locatestarttagend too;
35-
# 3) if you change tagfind/attrfind and/or locatestarttagend the parser will
32+
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
33+
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
3634
# explode, so don't do it.
37-
tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
3835
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
3936
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
4037
tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
41-
attrfind = re.compile(
42-
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
43-
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
4438
attrfind_tolerant = re.compile(
4539
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
4640
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
47-
locatestarttagend = re.compile(r"""
48-
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
49-
(?:\s+ # whitespace before attribute name
50-
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
51-
(?:\s*=\s* # value indicator
52-
(?:'[^']*' # LITA-enclosed value
53-
|\"[^\"]*\" # LIT-enclosed value
54-
|[^'\">\s]+ # bare value
55-
)
56-
)?
57-
)
58-
)*
59-
\s* # trailing whitespace
60-
""", re.VERBOSE)
6141
locatestarttagend_tolerant = re.compile(r"""
6242
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
6343
(?:[\s/]* # optional whitespace before attribute name
@@ -79,24 +59,6 @@
7959
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
8060

8161

82-
class HTMLParseError(Exception):
83-
"""Exception raised for all parse errors."""
84-
85-
def __init__(self, msg, position=(None, None)):
86-
assert msg
87-
self.msg = msg
88-
self.lineno = position[0]
89-
self.offset = position[1]
90-
91-
def __str__(self):
92-
result = self.msg
93-
if self.lineno is not None:
94-
result = result + ", at line %d" % self.lineno
95-
if self.offset is not None:
96-
result = result + ", column %d" % (self.offset + 1)
97-
return result
98-
99-
10062
_default_sentinel = object()
10163

10264
class HTMLParser(_markupbase.ParserBase):
@@ -123,22 +85,12 @@ class HTMLParser(_markupbase.ParserBase):
12385

12486
CDATA_CONTENT_ELEMENTS = ("script", "style")
12587

126-
def __init__(self, strict=_default_sentinel, *,
127-
convert_charrefs=_default_sentinel):
88+
def __init__(self, *, convert_charrefs=_default_sentinel):
12889
"""Initialize and reset this instance.
12990
13091
If convert_charrefs is True (default: False), all character references
13192
are automatically converted to the corresponding Unicode characters.
132-
If strict is set to False (the default) the parser will parse invalid
133-
markup, otherwise it will raise an error. Note that the strict mode
134-
and argument are deprecated.
13593
"""
136-
if strict is not _default_sentinel:
137-
warnings.warn("The strict argument and mode are deprecated.",
138-
DeprecationWarning, stacklevel=2)
139-
else:
140-
strict = False # default
141-
self.strict = strict
14294
if convert_charrefs is _default_sentinel:
14395
convert_charrefs = False # default
14496
warnings.warn("The value of convert_charrefs will become True in "
@@ -168,11 +120,6 @@ def close(self):
168120
"""Handle any buffered data."""
169121
self.goahead(1)
170122

171-
def error(self, message):
172-
warnings.warn("The 'error' method is deprecated.",
173-
DeprecationWarning, stacklevel=2)
174-
raise HTMLParseError(message, self.getpos())
175-
176123
__starttag_text = None
177124

178125
def get_starttag_text(self):
@@ -227,10 +174,7 @@ def goahead(self, end):
227174
elif startswith("<?", i):
228175
k = self.parse_pi(i)
229176
elif startswith("<!", i):
230-
if self.strict:
231-
k = self.parse_declaration(i)
232-
else:
233-
k = self.parse_html_declaration(i)
177+
k = self.parse_html_declaration(i)
234178
elif (i + 1) < n:
235179
self.handle_data("<")
236180
k = i + 1
@@ -239,8 +183,6 @@ def goahead(self, end):
239183
if k < 0:
240184
if not end:
241185
break
242-
if self.strict:
243-
self.error("EOF in middle of construct")
244186
k = rawdata.find('>', i + 1)
245187
if k < 0:
246188
k = rawdata.find('<', i + 1)
@@ -282,13 +224,10 @@ def goahead(self, end):
282224
if match:
283225
# match.group() will contain at least 2 chars
284226
if end and match.group() == rawdata[i:]:
285-
if self.strict:
286-
self.error("EOF in middle of entity or char ref")
287-
else:
288-
k = match.end()
289-
if k <= i:
290-
k = n
291-
i = self.updatepos(i, i + 1)
227+
k = match.end()
228+
if k <= i:
229+
k = n
230+
i = self.updatepos(i, i + 1)
292231
# incomplete
293232
break
294233
elif (i + 1) < n:
@@ -367,18 +306,12 @@ def parse_starttag(self, i):
367306

368307
# Now parse the data between i+1 and j into a tag and attrs
369308
attrs = []
370-
if self.strict:
371-
match = tagfind.match(rawdata, i+1)
372-
else:
373-
match = tagfind_tolerant.match(rawdata, i+1)
309+
match = tagfind_tolerant.match(rawdata, i+1)
374310
assert match, 'unexpected call to parse_starttag()'
375311
k = match.end()
376312
self.lasttag = tag = match.group(1).lower()
377313
while k < endpos:
378-
if self.strict:
379-
m = attrfind.match(rawdata, k)
380-
else:
381-
m = attrfind_tolerant.match(rawdata, k)
314+
m = attrfind_tolerant.match(rawdata, k)
382315
if not m:
383316
break
384317
attrname, rest, attrvalue = m.group(1, 2, 3)
@@ -401,9 +334,6 @@ def parse_starttag(self, i):
401334
- self.__starttag_text.rfind("\n")
402335
else:
403336
offset = offset + len(self.__starttag_text)
404-
if self.strict:
405-
self.error("junk characters in start tag: %r"
406-
% (rawdata[k:endpos][:20],))
407337
self.handle_data(rawdata[i:endpos])
408338
return endpos
409339
if end.endswith('/>'):
@@ -419,10 +349,7 @@ def parse_starttag(self, i):
419349
# or -1 if incomplete.
420350
def check_for_whole_start_tag(self, i):
421351
rawdata = self.rawdata
422-
if self.strict:
423-
m = locatestarttagend.match(rawdata, i)
424-
else:
425-
m = locatestarttagend_tolerant.match(rawdata, i)
352+
m = locatestarttagend_tolerant.match(rawdata, i)
426353
if m:
427354
j = m.end()
428355
next = rawdata[j:j+1]
@@ -435,9 +362,6 @@ def check_for_whole_start_tag(self, i):
435362
# buffer boundary
436363
return -1
437364
# else bogus input
438-
if self.strict:
439-
self.updatepos(i, j + 1)
440-
self.error("malformed empty start tag")
441365
if j > i:
442366
return j
443367
else:
@@ -450,9 +374,6 @@ def check_for_whole_start_tag(self, i):
450374
# end of input in or before attribute value, or we have the
451375
# '/' from a '/>' ending
452376
return -1
453-
if self.strict:
454-
self.updatepos(i, j)
455-
self.error("malformed start tag")
456377
if j > i:
457378
return j
458379
else:
@@ -472,8 +393,6 @@ def parse_endtag(self, i):
472393
if self.cdata_elem is not None:
473394
self.handle_data(rawdata[i:gtpos])
474395
return gtpos
475-
if self.strict:
476-
self.error("bad end tag: %r" % (rawdata[i:gtpos],))
477396
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
478397
namematch = tagfind_tolerant.match(rawdata, i+2)
479398
if not namematch:
@@ -539,8 +458,7 @@ def handle_pi(self, data):
539458
pass
540459

541460
def unknown_decl(self, data):
542-
if self.strict:
543-
self.error("unknown declaration: %r" % (data,))
461+
pass
544462

545463
# Internal -- helper to remove special character quoting
546464
def unescape(self, s):

0 commit comments

Comments
 (0)