From 8222c02bf983412a9f4a37307e0e7ee6b44800e6 Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka
Date: Mon, 21 Jul 2025 13:07:15 +0300
Subject: [PATCH] gh-135661: Fix parsing attributes with whitespaces around the "=" separator in HTMLParser (GH-136908)

This fixes a regression introduced in GH-135930.
(cherry picked from commit dee650189497735edbc08a54edabb5b06ef1bd09)

Co-authored-by: Serhiy Storchaka
---
 Lib/html/parser.py | 4 +--
 Lib/test/test_htmlparser.py | 28 +++++++++++--------
 ...-06-25-14-13-39.gh-issue-135661.idjQ0B.rst | 5 ----
 3 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index 9b4f09599134bd..7eea885cfe63c5 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -45,7 +45,7 @@
   (
     (?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
   )
-  (= # value indicator
+  ([\t\n\r\f ]*=[\t\n\r\f ]* # value indicator
     ('[^']*' # LITA-enclosed value
       |"[^"]*" # LIT-enclosed value
       |(?!['"])[^>\t\n\r\f ]* # bare value
@@ -57,7 +57,7 @@
   [a-zA-Z][^\t\n\r\f />]* # tag name
   [\t\n\r\f /]* # optional whitespace before attribute name
   (?:(?<=['"\t\n\r\f /])[^\t\n\r\f />][^\t\n\r\f /=>]* # attribute name
-    (?:= # value indicator
+    (?:[\t\n\r\f ]*=[\t\n\r\f ]* # value indicator
       (?:'[^']*' # LITA-enclosed value
         |"[^"]*" # LIT-enclosed value
         |(?!['"])[^>\t\n\r\f ]* # bare value
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index 15cad061889a79..47c0752fb517b9 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -623,7 +623,7 @@ def test_correct_detection_of_start_tags(self):
         html = 'The rain'
         expected = [
-            ('starttag', 'div', [('style', ''), (',', None), ('foo', None), ('=', None), ('"bar"', None)]),
+            ('starttag', 'div', [('style', ''), (',', None), ('foo', 'bar')]),
             ('starttag', 'b', []),
             ('data', 'The '),
             ('starttag', 'a', [('href', 'some_url')]),
@@ -813,12 +813,12 @@ def test_attr_syntax(self):
         ]
         self._run_check("""""", output)
         self._run_check("", [('starttag', 'a', [('foo', '=bar')])])
-        self._run_check("", [('starttag', 'a', [('foo', None), ('=bar', None)])])
-        self._run_check("", [('starttag', 'a', [('foo', None), ('=bar', None)])])
+        self._run_check("", [('starttag', 'a', [('foo', 'bar')])])
+        self._run_check("", [('starttag', 'a', [('foo', 'bar')])])
         self._run_check("", [('starttag', 'a', [('foo\v', 'bar')])])
         self._run_check("", [('starttag', 'a', [('foo\xa0', 'bar')])])
-        self._run_check("", [('starttag', 'a', [('foo', ''), ('bar', None)])])
-        self._run_check("", [('starttag', 'a', [('foo', ''), ('bar', None)])])
+        self._run_check("", [('starttag', 'a', [('foo', 'bar')])])
+        self._run_check("", [('starttag', 'a', [('foo', 'bar')])])
         self._run_check("", [('starttag', 'a', [('foo', '\vbar')])])
         self._run_check("", [('starttag', 'a', [('foo', '\xa0bar')])])
@@ -829,8 +829,8 @@ def test_attr_values(self):
                                             ("d", "\txyz\n")])])
         self._run_check("""""",
                         [("starttag", "a", [("b", ""), ("c", "")])])
-        self._run_check("",
-                        [("starttag", "a", [("b", ""), ("c", "")])])
+        self._run_check("",
+                        [('starttag', 'a', [('b', 'x'), ('c', 'y')])])
         self._run_check("",
                         [("starttag", "a", [("b", "\v"), ("c", "\xa0")])])
         # Regression test for SF patch #669683.
@@ -899,13 +899,17 @@ def test_malformed_attributes(self):
         )
         expected = [
             ('starttag', 'a', [('href', "test'style='color:red;bad1'")]),
-            ('data', 'test - bad1'), ('endtag', 'a'),
+            ('data', 'test - bad1'),
+            ('endtag', 'a'),
             ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]),
-            ('data', 'test - bad2'), ('endtag', 'a'),
+            ('data', 'test - bad2'),
+            ('endtag', 'a'),
             ('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]),
-            ('data', 'test - bad3'), ('endtag', 'a'),
-            ('starttag', 'a', [('href', None), ('=', None), ("test' style", 'color:red;bad4')]),
-            ('data', 'test - bad4'), ('endtag', 'a')
+            ('data', 'test - bad3'),
+            ('endtag', 'a'),
+            ('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]),
+            ('data', 'test - bad4'),
+            ('endtag', 'a'),
         ]
         self._run_check(html, expected)
diff --git a/Misc/NEWS.d/next/Security/2025-06-25-14-13-39.gh-issue-135661.idjQ0B.rst b/Misc/NEWS.d/next/Security/2025-06-25-14-13-39.gh-issue-135661.idjQ0B.rst
index b6f9e104e44047..27e886abdb58e5 100644
--- a/Misc/NEWS.d/next/Security/2025-06-25-14-13-39.gh-issue-135661.idjQ0B.rst
+++ b/Misc/NEWS.d/next/Security/2025-06-25-14-13-39.gh-issue-135661.idjQ0B.rst
@@ -18,8 +18,3 @@ according to the HTML5 standard.
 * Multiple ``=`` between attribute name and value are no longer collapsed.
   E.g. ```` produces attribute "foo" with value "=bar".
-
-* Whitespaces between the ``=`` separator and attribute name or value are no
-  longer ignored. E.g. ```` produces two attributes "foo" and
-  "=bar", both with value None; ```` produces two attributes:
-  "foo" with value "" and "bar" with value None.

pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy