Skip to content

Commit 6725dad

Browse files
serhiy-storchaka, ezio-melotti, waylan
authored and committed
[3.12] pythongh-135661: Fix parsing start and end tags in HTMLParser according to the HTML5 standard (pythonGH-135930) (pythonGH-136268)
* Whitespace is no longer accepted between `</` and the tag name. E.g. `</ script>` does not end the script section.
* Vertical tabulation (`\v`) and non-ASCII whitespace characters are no longer recognized as whitespace. The only whitespace characters are `\t\n\r\f `.
* The null character (U+0000) no longer ends the tag name.
* Attributes and slashes after the tag name in end tags are now ignored, instead of terminating after the first `>` in a quoted attribute value. E.g. `</script/foo=">"/>`.
* Multiple slashes and whitespace between the last attribute and the closing `>` are now ignored in both start and end tags. E.g. `<a foo=bar/ //>`.
* Multiple `=` between attribute name and value are no longer collapsed. E.g. `<a foo==bar>` produces attribute "foo" with value "=bar".
* Whitespace between the `=` separator and the attribute name or value is no longer ignored. E.g. `<a foo =bar>` produces two attributes, "foo" and "=bar", both with value None; `<a foo= bar>` produces two attributes: "foo" with value "" and "bar" with value None.
* Fix data loss after an unclosed script or style tag (pythongh-86155).

Also backport test.support.subTests() (pythongh-135120).

---------

(cherry picked from commit 0243f97)
(cherry picked from commit c555f88)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
Co-authored-by: Waylan Limberg <waylan.limberg@icloud.com>
1 parent 8d1b3df commit 6725dad

File tree

5 files changed

+222
-120
lines changed

5 files changed

+222
-120
lines changed

Lib/html/parser.py

Lines changed: 70 additions & 75 deletions
Original file line number | Diff line number | Diff line change
@@ -29,15 +29,43 @@
2929
piclose = re.compile('>')
3030
commentclose = re.compile(r'--\s*>')
3131
# Note:
32-
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
33-
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
32+
# 1) if you change tagfind/attrfind remember to update locatetagend too;
33+
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
3434
# explode, so don't do it.
35-
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
36-
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
37-
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
38-
attrfind_tolerant = re.compile(
39-
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
40-
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
35+
# see the HTML5 specs section "13.2.5.6 Tag open state",
36+
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
37+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
38+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
39+
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
40+
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
41+
attrfind_tolerant = re.compile(r"""
42+
(
43+
(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
44+
)
45+
(= # value indicator
46+
('[^']*' # LITA-enclosed value
47+
|"[^"]*" # LIT-enclosed value
48+
|(?!['"])[^>\t\n\r\f ]* # bare value
49+
)
50+
)?
51+
(?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
52+
""", re.VERBOSE)
53+
locatetagend = re.compile(r"""
54+
[a-zA-Z][^\t\n\r\f />]* # tag name
55+
[\t\n\r\f /]* # optional whitespace before attribute name
56+
(?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
57+
(?:= # value indicator
58+
(?:'[^']*' # LITA-enclosed value
59+
|"[^"]*" # LIT-enclosed value
60+
|(?!['"])[^>\t\n\r\f ]* # bare value
61+
)
62+
)?
63+
[\t\n\r\f /]* # possibly followed by a space
64+
)*
65+
>?
66+
""", re.VERBOSE)
67+
# The following variables are not used, but are temporarily left for
68+
# backward compatibility.
4169
locatestarttagend_tolerant = re.compile(r"""
4270
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
4371
(?:[\s/]* # optional whitespace before attribute name
@@ -54,8 +82,6 @@
5482
\s* # trailing whitespace
5583
""", re.VERBOSE)
5684
endendtag = re.compile('>')
57-
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
58-
# </ and the tag name, so maybe this should be fixed
5985
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
6086

6187

@@ -122,7 +148,8 @@ def get_starttag_text(self):
122148

123149
def set_cdata_mode(self, elem):
124150
self.cdata_elem = elem.lower()
125-
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
151+
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
152+
re.IGNORECASE|re.ASCII)
126153

127154
def clear_cdata_mode(self):
128155
self.interesting = interesting_normal
@@ -147,7 +174,7 @@ def goahead(self, end):
147174
# & near the end and see if it's followed by a space or ;.
148175
amppos = rawdata.rfind('&', max(i, n-34))
149176
if (amppos >= 0 and
150-
not re.compile(r'[\s;]').search(rawdata, amppos)):
177+
not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)):
151178
break # wait till we get all the text
152179
j = n
153180
else:
@@ -260,7 +287,7 @@ def goahead(self, end):
260287
else:
261288
assert 0, "interesting.search() lied"
262289
# end while
263-
if end and i < n and not self.cdata_elem:
290+
if end and i < n:
264291
if self.convert_charrefs and not self.cdata_elem:
265292
self.handle_data(unescape(rawdata[i:n]))
266293
else:
@@ -291,7 +318,7 @@ def parse_html_declaration(self, i):
291318
return self.parse_bogus_comment(i)
292319

293320
# Internal -- parse bogus comment, return length or -1 if not terminated
294-
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
321+
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
295322
def parse_bogus_comment(self, i, report=1):
296323
rawdata = self.rawdata
297324
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
@@ -317,6 +344,8 @@ def parse_pi(self, i):
317344

318345
# Internal -- handle starttag, return end or -1 if not terminated
319346
def parse_starttag(self, i):
347+
# See the HTML5 specs section "13.2.5.8 Tag name state"
348+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
320349
self.__starttag_text = None
321350
endpos = self.check_for_whole_start_tag(i)
322351
if endpos < 0:
@@ -369,76 +398,42 @@ def parse_starttag(self, i):
369398
# or -1 if incomplete.
370399
def check_for_whole_start_tag(self, i):
371400
rawdata = self.rawdata
372-
m = locatestarttagend_tolerant.match(rawdata, i)
373-
if m:
374-
j = m.end()
375-
next = rawdata[j:j+1]
376-
if next == ">":
377-
return j + 1
378-
if next == "/":
379-
if rawdata.startswith("/>", j):
380-
return j + 2
381-
if rawdata.startswith("/", j):
382-
# buffer boundary
383-
return -1
384-
# else bogus input
385-
if j > i:
386-
return j
387-
else:
388-
return i + 1
389-
if next == "":
390-
# end of input
391-
return -1
392-
if next in ("abcdefghijklmnopqrstuvwxyz=/"
393-
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
394-
# end of input in or before attribute value, or we have the
395-
# '/' from a '/>' ending
396-
return -1
397-
if j > i:
398-
return j
399-
else:
400-
return i + 1
401-
raise AssertionError("we should not get here!")
401+
match = locatetagend.match(rawdata, i+1)
402+
assert match
403+
j = match.end()
404+
if rawdata[j-1] != ">":
405+
return -1
406+
return j
402407

403408
# Internal -- parse endtag, return end or -1 if incomplete
404409
def parse_endtag(self, i):
410+
# See the HTML5 specs section "13.2.5.7 End tag open state"
411+
# https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
405412
rawdata = self.rawdata
406413
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
407-
match = endendtag.search(rawdata, i+1) # >
408-
if not match:
414+
if rawdata.find('>', i+2) < 0: # fast check
409415
return -1
410-
gtpos = match.end()
411-
match = endtagfind.match(rawdata, i) # </ + tag + >
412-
if not match:
413-
if self.cdata_elem is not None:
414-
self.handle_data(rawdata[i:gtpos])
415-
return gtpos
416-
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
417-
namematch = tagfind_tolerant.match(rawdata, i+2)
418-
if not namematch:
419-
# w3.org/TR/html5/tokenization.html#end-tag-open-state
420-
if rawdata[i:i+3] == '</>':
421-
return i+3
422-
else:
423-
return self.parse_bogus_comment(i)
424-
tagname = namematch.group(1).lower()
425-
# consume and ignore other stuff between the name and the >
426-
# Note: this is not 100% correct, since we might have things like
427-
# </tag attr=">">, but looking for > after the name should cover
428-
# most of the cases and is much simpler
429-
gtpos = rawdata.find('>', namematch.end())
430-
self.handle_endtag(tagname)
431-
return gtpos+1
416+
if not endtagopen.match(rawdata, i): # </ + letter
417+
if rawdata[i+2:i+3] == '>': # </> is ignored
418+
# "missing-end-tag-name" parser error
419+
return i+3
420+
else:
421+
return self.parse_bogus_comment(i)
432422

433-
elem = match.group(1).lower() # script or style
434-
if self.cdata_elem is not None:
435-
if elem != self.cdata_elem:
436-
self.handle_data(rawdata[i:gtpos])
437-
return gtpos
423+
match = locatetagend.match(rawdata, i+2)
424+
assert match
425+
j = match.end()
426+
if rawdata[j-1] != ">":
427+
return -1
438428

439-
self.handle_endtag(elem)
429+
# find the name: "13.2.5.8 Tag name state"
430+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
431+
match = tagfind_tolerant.match(rawdata, i+2)
432+
assert match
433+
tag = match.group(1).lower()
434+
self.handle_endtag(tag)
440435
self.clear_cdata_mode()
441-
return gtpos
436+
return j
442437

443438
# Overridable -- finish processing of start+end tag: <tag.../>
444439
def handle_startendtag(self, tag, attrs):

Lib/test/support/__init__.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1551,6 +1551,31 @@ def check_sizeof(test, o, size):
15511551
% (type(o), result, size)
15521552
test.assertEqual(result, size, msg)
15531553

1554+
def subTests(arg_names, arg_values, /, *, _do_cleanups=False):
1555+
"""Run multiple subtests with different parameters.
1556+
"""
1557+
single_param = False
1558+
if isinstance(arg_names, str):
1559+
arg_names = arg_names.replace(',',' ').split()
1560+
if len(arg_names) == 1:
1561+
single_param = True
1562+
arg_values = tuple(arg_values)
1563+
def decorator(func):
1564+
if isinstance(func, type):
1565+
raise TypeError('subTests() can only decorate methods, not classes')
1566+
@functools.wraps(func)
1567+
def wrapper(self, /, *args, **kwargs):
1568+
for values in arg_values:
1569+
if single_param:
1570+
values = (values,)
1571+
subtest_kwargs = dict(zip(arg_names, values))
1572+
with self.subTest(**subtest_kwargs):
1573+
func(self, *args, **kwargs, **subtest_kwargs)
1574+
if _do_cleanups:
1575+
self.doCleanups()
1576+
return wrapper
1577+
return decorator
1578+
15541579
#=======================================================================
15551580
# Decorator for running a function in a different locale, correctly resetting
15561581
# it afterwards.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy