From f7f9f562f1b31c2130e26269cf4f196f378d80f2 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 18 Jun 2025 13:34:58 +0300 Subject: [PATCH 1/4] gh-135661: Fix CDATA section parsing in HTMLParser "] ]>" and "]] >" no longer end the CDATA section. --- Lib/html/parser.py | 6 ++- Lib/test/test_htmlparser.py | 42 +++++++++---------- ...-06-18-13-34-55.gh-issue-135661.NZlpWf.rst | 2 + 3 files changed, 28 insertions(+), 22 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst diff --git a/Lib/html/parser.py b/Lib/html/parser.py index ba416e7fa6e3fe..99aebc19d4a2e3 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -298,7 +298,11 @@ def parse_html_declaration(self, i): # this case is actually already handled in goahead() return self.parse_comment(i) elif rawdata[i:i+9] == '') + if j < 0: + return -1 + self.unknown_decl(rawdata[i+3: j]) + return j + 3 elif rawdata[i:i+9].lower() == ' gtpos = rawdata.find('>', i+9) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 65a4bee72b9775..b75b4c711ccac5 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -686,27 +686,27 @@ def test_broken_condcoms(self): ] self._run_check(html, expected) - def test_cdata_declarations(self): - # More tests should be added. See also "8.2.4.42. Markup - # declaration open state", "8.2.4.69. 
CDATA section state", - # and issue 32876 - html = ('') - expected = [('unknown decl', 'CDATA[just some plain text')] - self._run_check(html, expected) - - def test_cdata_declarations_multiline(self): - html = (' b) {' - ' printf("[How?]");' - ' }' - ']]>') - expected = [ - ('starttag', 'code', []), - ('unknown decl', - 'CDATA[ if (a < b && a > b) { ' - 'printf("[How?]"); }'), - ('endtag', 'code') - ] + @support.subTests('content', [ + 'just some plain text', + '', + '¬-an-entity-ref;', + "", + '', + '[[I have many brackets]]', + 'I have a > in the middle', + 'I have a ]] in the middle', + '] ]>', + ']] >', + ('\n' + ' if (a < b && a > b) {\n' + ' printf("[How?]");\n' + ' }\n'), + ]) + def test_cdata_section(self, content): + # See "13.2.5.42 Markup declaration open state", + # "13.2.5.69 CDATA section state", and issue bpo-32876. + html = f'' + expected = [('unknown decl', 'CDATA[' + content)] self._run_check(html, expected) def test_convert_charrefs_dropped_text(self): diff --git a/Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst b/Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst new file mode 100644 index 00000000000000..7a07e8535bb497 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst @@ -0,0 +1,2 @@ +Fix CDATA section parsing in :class:`html.parser.HTMLParser`: ``] ]>`` and +``]] >`` no longer end the CDATA section. From cf918e3718227dbdda763720f2553b41711dda0e Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Thu, 3 Jul 2025 18:17:26 +0300 Subject: [PATCH 2/4] Move to Security. 
--- .../2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Misc/NEWS.d/next/{Library => Security}/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst (100%) diff --git a/Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst similarity index 100% rename from Misc/NEWS.d/next/Library/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst rename to Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst From d346c10f25179eaf333cbb38a7b86dd937556da7 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 4 Jul 2025 09:10:35 +0300 Subject: [PATCH 3/4] Update 2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst --- .../Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst index 7a07e8535bb497..59c76d50f79443 100644 --- a/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst +++ b/Misc/NEWS.d/next/Security/2025-06-18-13-34-55.gh-issue-135661.NZlpWf.rst @@ -1,2 +1,2 @@ -Fix CDATA section parsing in :class:`html.parser.HTMLParser`: ``] ]>`` and -``]] >`` no longer end the CDATA section. +Fix CDATA section parsing in :class:`html.parser.HTMLParser` according to +the HTML5 standard: ``] ]>`` and ``]] >`` no longer end the CDATA section. From 524cac599dc5554650e6f1a8c81d808fa8ef54d6 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Sat, 5 Jul 2025 15:54:03 +0300 Subject: [PATCH 4/4] * Make CDATA section parsing context depending. * Add HTMLParser.support_cdata(). 
--- Doc/library/html.parser.rst | 11 ++++++++ Lib/html/parser.py | 22 ++++++++++----- Lib/test/test_htmlparser.py | 54 ++++++++++++++++++++++++++++++++----- 3 files changed, 74 insertions(+), 13 deletions(-) diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index dd67fc34e856f1..b2fa043625c7e1 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -121,6 +121,17 @@ The output will then be: attributes can be preserved, etc.). +.. method:: HTMLParser.support_cdata(flag) + + Sets how the parser will parse CDATA declarations. + If *flag* is true, then the :meth:`unknown_decl` method will be called + for the CDATA section ``<![CDATA[...]]>``. + If *flag* is false, then the :meth:`handle_comment` method will be called + for ``<![CDATA[...>``. + + .. versionadded:: 3.13.6 + + The following methods are called when data or markup elements are encountered and they are meant to be overridden in a subclass. The base class implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`): diff --git a/Lib/html/parser.py b/Lib/html/parser.py index d405d653f45270..88a084dcf1ce7d 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -144,6 +144,7 @@ def reset(self): self.lasttag = '???' self.interesting = interesting_normal self.cdata_elem = None + self._support_cdata = False super().reset() def feed(self, data): @@ -174,6 +175,9 @@ def clear_cdata_mode(self): self.interesting = interesting_normal self.cdata_elem = None + def support_cdata(self, flag=True): + self._support_cdata = flag + # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. 
@@ -249,7 +253,10 @@ def goahead(self, end): break self.handle_comment(rawdata[i+4:j]) elif startswith("') - if j < 0: - return -1 - self.unknown_decl(rawdata[i+3: j]) - return j + 3 + if self._support_cdata: + j = rawdata.find(']]>', i+9) + if j < 0: + return -1 + self.unknown_decl(rawdata[i+3: j]) + return j + 3 + else: + return self.parse_bogus_comment(i) elif rawdata[i:i+9].lower() == ' gtpos = rawdata.find('>', i+9) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index f913732c0b13d1..65fbf5d7b618fd 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -34,12 +34,16 @@ def get_events(self): def handle_starttag(self, tag, attrs): self.append(("starttag", tag, attrs)) + if tag == 'svg': + self.support_cdata(True) def handle_startendtag(self, tag, attrs): self.append(("startendtag", tag, attrs)) def handle_endtag(self, tag): self.append(("endtag", tag)) + if tag == 'svg': + self.support_cdata(False) # all other markup @@ -643,10 +647,22 @@ def test_eof_in_declarations(self): ('How?]");\n' ' }\n'), ]) - def test_cdata_section(self, content): + def test_cdata_section_content(self, content): # See "13.2.5.42 Markup declaration open state", # "13.2.5.69 CDATA section state", and issue bpo-32876. - html = f'' - expected = [('unknown decl', 'CDATA[' + content)] + html = f'{content}' + expected = [ + ('starttag', 'svg', []), + ('starttag', 'text', [('y', '100')]), + ('unknown decl', 'CDATA[' + content), + ('endtag', 'text'), + ('endtag', 'svg'), + ] + self._run_check(html, expected) + + def test_cdata_section(self): + # See "13.2.5.42 Markup declaration open state". + html = ('bar]]>' + 'foo<br>bar' + 'bar]]>') + expected = [ + ('comment', '[CDATA[foo'), + ('starttag', 'svg', []), + ('starttag', 'text', [('y', '100')]), + ('unknown decl', 'CDATA[foo
bar'), + ('endtag', 'text'), + ('endtag', 'svg'), + ('comment', '[CDATA[foo'), + ] self._run_check(html, expected) def test_convert_charrefs_dropped_text(self): pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy