Skip to content

Commit 0243f97

Browse files
pythongh-135661: Fix parsing start and end tags in HTMLParser according to the HTML5 standard (pythonGH-135930)
* Whitespace is no longer accepted between `</` and the tag name. E.g. `</ script>` does not end the script section.
* Vertical tabulation (`\v`) and non-ASCII whitespace characters are no longer recognized as whitespace. The only whitespace characters are `\t\n\r\f `.
* The null character (U+0000) no longer ends the tag name.
* Attributes and slashes after the tag name in end tags are now ignored, instead of terminating after the first `>` in a quoted attribute value. E.g. `</script/foo=">"/>`.
* Multiple slashes and whitespace characters between the last attribute and the closing `>` are now ignored in both start and end tags. E.g. `<a foo=bar/ //>`.
* Multiple `=` between an attribute name and value are no longer collapsed. E.g. `<a foo==bar>` produces attribute "foo" with value "=bar".
* Whitespace between the `=` separator and the attribute name or value is no longer ignored. E.g. `<a foo =bar>` produces two attributes "foo" and "=bar", both with value None; `<a foo= bar>` produces two attributes: "foo" with value "" and "bar" with value None.

* Fix Sphinx errors.
* Apply suggestions from code review

Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>

* Address review comments.
* Move to Security.

---------

Co-authored-by: Ezio Melotti <ezio.melotti@gmail.com>
1 parent 938a5d7 commit 0243f97

File tree

3 files changed

+194
-129
lines changed

3 files changed

+194
-129
lines changed

Lib/html/parser.py

Lines changed: 69 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -31,15 +31,43 @@
3131
piclose = re.compile('>')
3232
commentclose = re.compile(r'--\s*>')
3333
# Note:
34-
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
35-
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
34+
# 1) if you change tagfind/attrfind remember to update locatetagend too;
35+
# 2) if you change tagfind/attrfind and/or locatetagend the parser will
3636
# explode, so don't do it.
37-
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
38-
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
39-
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
40-
attrfind_tolerant = re.compile(
41-
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
42-
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
37+
# see the HTML5 specs section "13.2.5.6 Tag open state",
38+
# "13.2.5.8 Tag name state" and "13.2.5.33 Attribute name state".
39+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
40+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
41+
# https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
42+
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />]*)(?:[\t\n\r\f ]|/(?!>))*')
43+
attrfind_tolerant = re.compile(r"""
44+
(
45+
(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
46+
)
47+
(= # value indicator
48+
('[^']*' # LITA-enclosed value
49+
|"[^"]*" # LIT-enclosed value
50+
|(?!['"])[^>\t\n\r\f ]* # bare value
51+
)
52+
)?
53+
(?:[\t\n\r\f ]|/(?!>))* # possibly followed by a space
54+
""", re.VERBOSE)
55+
locatetagend = re.compile(r"""
56+
[a-zA-Z][^\t\n\r\f />]* # tag name
57+
[\t\n\r\f /]* # optional whitespace before attribute name
58+
(?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
59+
(?:= # value indicator
60+
(?:'[^']*' # LITA-enclosed value
61+
|"[^"]*" # LIT-enclosed value
62+
|(?!['"])[^>\t\n\r\f ]* # bare value
63+
)
64+
)?
65+
[\t\n\r\f /]* # possibly followed by a space
66+
)*
67+
>?
68+
""", re.VERBOSE)
69+
# The following variables are not used, but are temporarily left for
70+
# backward compatibility.
4371
locatestarttagend_tolerant = re.compile(r"""
4472
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
4573
(?:[\s/]* # optional whitespace before attribute name
@@ -56,8 +84,6 @@
5684
\s* # trailing whitespace
5785
""", re.VERBOSE)
5886
endendtag = re.compile('>')
59-
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
60-
# </ and the tag name, so maybe this should be fixed
6187
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
6288

6389
# Character reference processing logic specific to attribute values
@@ -141,7 +167,8 @@ def get_starttag_text(self):
141167

142168
def set_cdata_mode(self, elem):
143169
self.cdata_elem = elem.lower()
144-
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
170+
self.interesting = re.compile(r'</%s(?=[\t\n\r\f />])' % self.cdata_elem,
171+
re.IGNORECASE|re.ASCII)
145172

146173
def clear_cdata_mode(self):
147174
self.interesting = interesting_normal
@@ -166,7 +193,7 @@ def goahead(self, end):
166193
# & near the end and see if it's followed by a space or ;.
167194
amppos = rawdata.rfind('&', max(i, n-34))
168195
if (amppos >= 0 and
169-
not re.compile(r'[\s;]').search(rawdata, amppos)):
196+
not re.compile(r'[\t\n\r\f ;]').search(rawdata, amppos)):
170197
break # wait till we get all the text
171198
j = n
172199
else:
@@ -310,7 +337,7 @@ def parse_html_declaration(self, i):
310337
return self.parse_bogus_comment(i)
311338

312339
# Internal -- parse bogus comment, return length or -1 if not terminated
313-
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
340+
# see https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
314341
def parse_bogus_comment(self, i, report=1):
315342
rawdata = self.rawdata
316343
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
@@ -336,6 +363,8 @@ def parse_pi(self, i):
336363

337364
# Internal -- handle starttag, return end or -1 if not terminated
338365
def parse_starttag(self, i):
366+
# See the HTML5 specs section "13.2.5.8 Tag name state"
367+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
339368
self.__starttag_text = None
340369
endpos = self.check_for_whole_start_tag(i)
341370
if endpos < 0:
@@ -381,76 +410,42 @@ def parse_starttag(self, i):
381410
# or -1 if incomplete.
382411
def check_for_whole_start_tag(self, i):
383412
rawdata = self.rawdata
384-
m = locatestarttagend_tolerant.match(rawdata, i)
385-
if m:
386-
j = m.end()
387-
next = rawdata[j:j+1]
388-
if next == ">":
389-
return j + 1
390-
if next == "/":
391-
if rawdata.startswith("/>", j):
392-
return j + 2
393-
if rawdata.startswith("/", j):
394-
# buffer boundary
395-
return -1
396-
# else bogus input
397-
if j > i:
398-
return j
399-
else:
400-
return i + 1
401-
if next == "":
402-
# end of input
403-
return -1
404-
if next in ("abcdefghijklmnopqrstuvwxyz=/"
405-
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
406-
# end of input in or before attribute value, or we have the
407-
# '/' from a '/>' ending
408-
return -1
409-
if j > i:
410-
return j
411-
else:
412-
return i + 1
413-
raise AssertionError("we should not get here!")
413+
match = locatetagend.match(rawdata, i+1)
414+
assert match
415+
j = match.end()
416+
if rawdata[j-1] != ">":
417+
return -1
418+
return j
414419

415420
# Internal -- parse endtag, return end or -1 if incomplete
416421
def parse_endtag(self, i):
422+
# See the HTML5 specs section "13.2.5.7 End tag open state"
423+
# https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
417424
rawdata = self.rawdata
418425
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
419-
match = endendtag.search(rawdata, i+1) # >
420-
if not match:
426+
if rawdata.find('>', i+2) < 0: # fast check
421427
return -1
422-
gtpos = match.end()
423-
match = endtagfind.match(rawdata, i) # </ + tag + >
424-
if not match:
425-
if self.cdata_elem is not None:
426-
self.handle_data(rawdata[i:gtpos])
427-
return gtpos
428-
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
429-
namematch = tagfind_tolerant.match(rawdata, i+2)
430-
if not namematch:
431-
# w3.org/TR/html5/tokenization.html#end-tag-open-state
432-
if rawdata[i:i+3] == '</>':
433-
return i+3
434-
else:
435-
return self.parse_bogus_comment(i)
436-
tagname = namematch.group(1).lower()
437-
# consume and ignore other stuff between the name and the >
438-
# Note: this is not 100% correct, since we might have things like
439-
# </tag attr=">">, but looking for > after the name should cover
440-
# most of the cases and is much simpler
441-
gtpos = rawdata.find('>', namematch.end())
442-
self.handle_endtag(tagname)
443-
return gtpos+1
428+
if not endtagopen.match(rawdata, i): # </ + letter
429+
if rawdata[i+2:i+3] == '>': # </> is ignored
430+
# "missing-end-tag-name" parser error
431+
return i+3
432+
else:
433+
return self.parse_bogus_comment(i)
444434

445-
elem = match.group(1).lower() # script or style
446-
if self.cdata_elem is not None:
447-
if elem != self.cdata_elem:
448-
self.handle_data(rawdata[i:gtpos])
449-
return gtpos
435+
match = locatetagend.match(rawdata, i+2)
436+
assert match
437+
j = match.end()
438+
if rawdata[j-1] != ">":
439+
return -1
450440

451-
self.handle_endtag(elem)
441+
# find the name: "13.2.5.8 Tag name state"
442+
# https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
443+
match = tagfind_tolerant.match(rawdata, i+2)
444+
assert match
445+
tag = match.group(1).lower()
446+
self.handle_endtag(tag)
452447
self.clear_cdata_mode()
453-
return gtpos
448+
return j
454449

455450
# Overridable -- finish processing of start+end tag: <tag.../>
456451
def handle_startendtag(self, tag, attrs):

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy