diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index dd67fc34e856f1..b2fa043625c7e1 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -121,6 +121,17 @@ The output will then be: attributes can be preserved, etc.). +.. method:: HTMLParser.support_cdata(flag) + + Sets how the parser will parse CDATA declarations. + If *flag* is true, then the :meth:`unknown_decl` method will be called + for the CDATA section ``<![CDATA[...]]>``. + If *flag* is false, then the :meth:`handle_comment` method will be called + for ``<![CDATA[...]]>``. + + .. versionadded:: 3.13.6 + + The following methods are called when data or markup elements are encountered and they are meant to be overridden in a subclass. The base class implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`): diff --git a/Lib/html/parser.py b/Lib/html/parser.py index cc15de07b5bae6..88a084dcf1ce7d 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -144,6 +144,7 @@ def reset(self): self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None + self._support_cdata = False super().reset() def feed(self, data): @@ -174,6 +175,9 @@ def clear_cdata_mode(self): self.interesting = interesting_normal self.cdata_elem = None + def support_cdata(self, flag=True): + self._support_cdata = flag + # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. 
@@ -249,7 +253,10 @@ def goahead(self, end): break self.handle_comment(rawdata[i+4:j]) elif startswith("', i+9) + if j < 0: + return -1 + self.unknown_decl(rawdata[i+3: j]) + return j + 3 + else: + return self.parse_bogus_comment(i) elif rawdata[i:i+9].lower() == ' gtpos = rawdata.find('>', i+9) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index d0d2c54217ccaf..65fbf5d7b618fd 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -34,12 +34,16 @@ def get_events(self): def handle_starttag(self, tag, attrs): self.append(("starttag", tag, attrs)) + if tag == 'svg': + self.support_cdata(True) def handle_startendtag(self, tag, attrs): self.append(("startendtag", tag, attrs)) def handle_endtag(self, tag): self.append(("endtag", tag)) + if tag == 'svg': + self.support_cdata(False) # all other markup @@ -643,10 +647,22 @@ def test_eof_in_declarations(self): ('') - expected = [('unknown decl', 'CDATA[just some plain text')] + @support.subTests('content', [ + 'just some plain text', + '', + '¬-an-entity-ref;', + "", + '', + '[[I have many brackets]]', + 'I have a > in the middle', + 'I have a ]] in the middle', + '] ]>', + ']] >', + ('\n' + ' if (a < b && a > b) {\n' + ' printf("[How?]");\n' + ' }\n'), + ]) + def test_cdata_section_content(self, content): + # See "13.2.5.42 Markup declaration open state", + # "13.2.5.69 CDATA section state", and issue bpo-32876. + html = f'{content}' + expected = [ + ('starttag', 'svg', []), + ('starttag', 'text', [('y', '100')]), + ('unknown decl', 'CDATA[' + content), + ('endtag', 'text'), + ('endtag', 'svg'), + ] self._run_check(html, expected) - def test_cdata_declarations_multiline(self): - html = (' b) {' - ' printf("[How?]");' - ' }' - ']]>') + def test_cdata_section(self): + # See "13.2.5.42 Markup declaration open state". 
+ html = ('bar]]>' + 'foo<br>bar' + 'bar]]>') expected = [ - ('starttag', 'code', []), - ('unknown decl', - 'CDATA[ if (a < b && a > b) { ' - 'printf("[How?]"); }'), - ('endtag', 'code') + ('comment', '[CDATA[foo'), + ('starttag', 'svg', []), + ('starttag', 'text', [('y', '100')]), + ('unknown decl', 'CDATA[foo
bar'), + ('endtag', 'text'), + ('endtag', 'svg'), + ('comment', '[CDATA[foo'), ] self._run_check(html, expected) diff --git a/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst new file mode 100644 index 00000000000000..59c76d50f79443 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst @@ -0,0 +1,2 @@ +Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to +the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section. pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy