From 060847b7685a1c1780521f3e5420ef772b5443ba Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Fri, 4 Jul 2025 10:00:23 +0300 Subject: [PATCH] gh-102555: Fix comment parsing in HTMLParser according to the HTML5 standard (GH-135664) * "--!>" now ends the comment. * "-- >" no longer ends the comment. * Support abnormally ended empty comments "<-->" and "<--->". --------- (cherry picked from commit 8ac7613dc8b8f82253d7c0e2b6ef6ed703a0a1ee) Co-authored-by: Serhiy Storchaka Co-author: Kerim Kabirov Co-authored-by: Ezio Melotti --- Lib/html/parser.py | 18 ++++++++++- Lib/test/test_htmlparser.py | 32 +++++++++++++++++-- ...-06-18-13-28-08.gh-issue-102555.nADrzJ.rst | 3 ++ 3 files changed, 50 insertions(+), 3 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2025-06-18-13-28-08.gh-issue-102555.nADrzJ.rst diff --git a/Lib/html/parser.py b/Lib/html/parser.py index ecd5e0f019ac96..47433bdf2d62a5 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -27,7 +27,8 @@ starttagopen = re.compile('<[a-zA-Z]') endtagopen = re.compile('') -commentclose = re.compile(r'--\s*>') +commentclose = re.compile(r'--!?>') +commentabruptclose = re.compile(r'-?>') # Note: # 1) if you change tagfind/attrfind remember to update locatestarttagend too; # 2) if you change tagfind/attrfind and/or locatestarttagend the parser will @@ -291,6 +292,21 @@ def parse_html_declaration(self, i): else: return self.parse_bogus_comment(i) + # Internal -- parse comment, return length or -1 if not terminated + # see https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state + def parse_comment(self, i, report=True): + rawdata = self.rawdata + assert rawdata.startswith('" '' '' + '' '' + # abrupt-closing-of-empty-comment + '' + '' '' '' - '') + '' + '' + '' + '' + '' + '' + '' + # nested-comment + ' -->' + '' + '' + ) expected = [('comment', " I'm a valid comment "), ('comment', 'me too!'), ('comment', '--'), + ('comment', '-'), + ('comment', ''), + ('comment', ''), ('comment', ''), 
('comment', '--I have many hyphens--'), ('comment', ' I have a > in the middle '), - ('comment', ' and I have -- in the middle! ')] + ('comment', ' and I have -- in the middle! '), + ('comment', 'incorrectly-closed-comment'), + ('comment', ''), + ('comment', '--!'), + ('comment', '-- >'), + ('comment', '-!>'), + ('comment', '!>'), + ('comment', ' '), + ('comment', '`` now ends the comment. ``-- >`` no longer ends the +comment. Support abnormally ended empty comments ``<-->`` and ``<--->``. pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy