Content-Length: 15256 | pFad | http://github.com/python/cpython/pull/132574.patch

thub.com From f0d20b254e2444632cdaea2552078ef5399ed0ad Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Tue, 15 Apr 2025 20:56:59 +1200 Subject: [PATCH 1/7] gh-127971: do not read past the end of a string Fix off-by-one read beyond the end of a string. --- Objects/stringlib/fastsearch.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 05e700b06258f0..520688c71b8d8b 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -595,7 +595,7 @@ STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, continue; } /* miss: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { + if (i < w && !STRINGLIB_BLOOM(mask, ss[i+1])) { i = i + m; } else { @@ -604,7 +604,7 @@ STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, } else { /* skip: check if next character is part of pattern */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { + if (i < w && !STRINGLIB_BLOOM(mask, ss[i+1])) { i = i + m; } } From 462dddf4f89b611242fa5eafaa9098383c1ff074 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Wed, 16 Apr 2025 12:11:50 +1200 Subject: [PATCH 2/7] Add test case --- Lib/test/string_tests.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 4b82d51b4508ac..afa75411f982fd 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -767,6 +767,13 @@ def test_replace(self): self.checkraises(TypeError, 'hello', 'replace', 42, 'h') self.checkraises(TypeError, 'hello', 'replace', 'h', 42) + # gh-127971 + any_3_nonblank_codepoints = '!!!' + seven_codepoints = any_3_nonblank_codepoints + ' ' + any_3_nonblank_codepoints + a = (' ' * 243) + seven_codepoints + (' ' * 7) + b = ' ' * 6 + chr(256) + a.replace(seven_codepoints, b) + def test_replace_uses_two_way_maxcount(self): # Test that maxcount works in _two_way_count in fastsearch.h A, B = "A"*1000, "B"*1000 From 1d0210a8e252cdb5102ecd326003301c30da5f1d Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Wed, 16 Apr 2025 12:01:16 +1200 Subject: [PATCH 3/7] Add blurb --- .../2025-04-16-12-01-13.gh-issue-127971.pMDOQ0.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-04-16-12-01-13.gh-issue-127971.pMDOQ0.rst diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-04-16-12-01-13.gh-issue-127971.pMDOQ0.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-04-16-12-01-13.gh-issue-127971.pMDOQ0.rst new file mode 100644 index 00000000000000..a47268671e60ac --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-04-16-12-01-13.gh-issue-127971.pMDOQ0.rst @@ -0,0 +1 @@ +Fix off-by-one read beyond the end of a string in string search From 212452d7efca29140bd2066b7d522aaf26f43b7f Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Sat, 21 Jun 2025 12:23:36 +1200 Subject: [PATCH 4/7] Add test case for adaptive_find and comments better explaining the reasons for and limitations of the tests. --- Lib/test/string_tests.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index afa75411f982fd..6e6dcacceac1c4 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -767,13 +767,40 @@ def test_replace(self): self.checkraises(TypeError, 'hello', 'replace', 42, 'h') self.checkraises(TypeError, 'hello', 'replace', 'h', 42) - # gh-127971 + def test_replacement_on_buffer_boundary(self): + + # gh-127971: Check we don't read past the end of the buffer when a + # potential match misses on the last character. Note this will likely + # not cause a failure unless ASAN is enabled, and even that may be + # dependent on implementation details subject to change. any_3_nonblank_codepoints = '!!!' seven_codepoints = any_3_nonblank_codepoints + ' ' + any_3_nonblank_codepoints a = (' ' * 243) + seven_codepoints + (' ' * 7) b = ' ' * 6 + chr(256) a.replace(seven_codepoints, b) + def test_adaptive_find_on_buffer_boundary(self): + + # gh-127971: This exercises the adaptive search algorithm to trigger a + # corner-case where it might examine the character *after* the last + # position that could be the start of the pattern. + # + # Unfortunately there is nothing to *test* to confirm whether the + # character is read or not, nor in fact does it matter for correctness + # with the implementation at time of writing: the adaptive algorithm is + # only triggered if the input is over a certain size and with a pattern + # with more than one character, so with the current implementation even + # though the final character read is not necessary or significant, it + # won't cause a fault. + # + # This test at least intentionally exercises this path, and might + # possibly catch a regression if the implementation changes and breaks + # those assumptions. + prefix = ' ' * (1024 * 4) + haystack = prefix + 'x' + needle = prefix + 'y' + self.assertEqual(haystack.find(needle), -1) + def test_replace_uses_two_way_maxcount(self): # Test that maxcount works in _two_way_count in fastsearch.h A, B = "A"*1000, "B"*1000 From c07c23eb921bdb737cbf9fccd60bbdde32426f7d Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Sun, 22 Jun 2025 11:49:30 +1200 Subject: [PATCH 5/7] Tweak conditional phrasing to match loop terminating criteria, add comment explaining why a guard is not necessary in adaptive_find. --- Objects/stringlib/fastsearch.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 520688c71b8d8b..369034d52e8619 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -595,7 +595,7 @@ STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, continue; } /* miss: check if next character is part of pattern */ - if (i < w && !STRINGLIB_BLOOM(mask, ss[i+1])) { + if (i + 1 <= w && !STRINGLIB_BLOOM(mask, ss[i+1])) { i = i + m; } else { @@ -604,7 +604,7 @@ STRINGLIB(default_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, } else { /* skip: check if next character is part of pattern */ - if (i < w && !STRINGLIB_BLOOM(mask, ss[i+1])) { + if (i + 1 <= w && !STRINGLIB_BLOOM(mask, ss[i+1])) { i = i + m; } } @@ -667,7 +667,16 @@ STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, return res + count; } } - /* miss: check if next character is part of pattern */ + + /* Miss: check if next character is part of pattern. + Note that in contrast to default_find and default_rfind we do + *not* need to prevent the algorithm from reading one character + beyond the last character in the input that the pattern could + start in. I.e. if i == w it is safe to read ss[i + 1] since the + input and pattern length requirements on when this variant + algorithm will be called ensure it will always be a valid part + of the input. In that case it doesn't matter what the character + read is since the loop will terminate regardless. */ if (!STRINGLIB_BLOOM(mask, ss[i+1])) { i = i + m; } @@ -676,7 +685,9 @@ STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, } } else { - /* skip: check if next character is part of pattern */ + /* Skip: check if next character is part of pattern. + See comment above re safety of accessing ss[i+1] when i == w. + */ if (!STRINGLIB_BLOOM(mask, ss[i+1])) { i = i + m; } From 338f32d9908e080084a5c0e99dbb99f3c4e1ace6 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Sun, 13 Jul 2025 11:33:04 +1200 Subject: [PATCH 6/7] Address more reviewer feedback: add conditional to adaptive find, tweak comments, drop extremely marginal "test" --- Lib/test/string_tests.py | 29 ++----------------- ...-04-16-12-01-13.gh-issue-127971.pMDOQ0.rst | 2 +- Objects/stringlib/fastsearch.h | 19 +++--------- 3 files changed, 8 insertions(+), 42 deletions(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 6e6dcacceac1c4..5cd3d7a62274bf 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -4,6 +4,7 @@ import unittest, string, sys, struct from test import support +from test.support import check_sanitizer from test.support import import_helper from collections import UserList import random @@ -767,40 +768,16 @@ def test_replace(self): self.checkraises(TypeError, 'hello', 'replace', 42, 'h') self.checkraises(TypeError, 'hello', 'replace', 'h', 42) + @unittest.skipUnless(check_sanitizer(address=True), "AddressSanitizer required") def test_replacement_on_buffer_boundary(self): - # gh-127971: Check we don't read past the end of the buffer when a - # potential match misses on the last character. Note this will likely - # not cause a failure unless ASAN is enabled, and even that may be - # dependent on implementation details subject to change. + # potential match misses on the last character. any_3_nonblank_codepoints = '!!!' seven_codepoints = any_3_nonblank_codepoints + ' ' + any_3_nonblank_codepoints a = (' ' * 243) + seven_codepoints + (' ' * 7) b = ' ' * 6 + chr(256) a.replace(seven_codepoints, b) - def test_adaptive_find_on_buffer_boundary(self): - - # gh-127971: This exercises the adaptive search algorithm to trigger a - # corner-case where it might examine the character *after* the last - # position that could be the start of the pattern. - # - # Unfortunately there is nothing to *test* to confirm whether the - # character is read or not, nor in fact does it matter for correctness - # with the implementation at time of writing: the adaptive algorithm is - # only triggered if the input is over a certain size and with a pattern - # with more than one character, so with the current implementation even - # though the final character read is not necessary or significant, it - # won't cause a fault. - # - # This test at least intentionally exercises this path, and might - # possibly catch a regression if the implementation changes and breaks - # those assumptions. - prefix = ' ' * (1024 * 4) - haystack = prefix + 'x' - needle = prefix + 'y' - self.assertEqual(haystack.find(needle), -1) - def test_replace_uses_two_way_maxcount(self): # Test that maxcount works in _two_way_count in fastsearch.h A, B = "A"*1000, "B"*1000 diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-04-16-12-01-13.gh-issue-127971.pMDOQ0.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-04-16-12-01-13.gh-issue-127971.pMDOQ0.rst index a47268671e60ac..ced7a9c9fd3e63 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2025-04-16-12-01-13.gh-issue-127971.pMDOQ0.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-04-16-12-01-13.gh-issue-127971.pMDOQ0.rst @@ -1 +1 @@ -Fix off-by-one read beyond the end of a string in string search +Fix off-by-one read beyond the end of a string in string search. diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h index 369034d52e8619..b447865c986bef 100644 --- a/Objects/stringlib/fastsearch.h +++ b/Objects/stringlib/fastsearch.h @@ -667,17 +667,8 @@ STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, return res + count; } } - - /* Miss: check if next character is part of pattern. - Note that in contrast to default_find and default_rfind we do - *not* need to prevent the algorithm from reading one character - beyond the last character in the input that the pattern could - start in. I.e. if i == w it is safe to read ss[i + 1] since the - input and pattern length requirements on when this variant - algorithm will be called ensure it will always be a valid part - of the input. In that case it doesn't matter what the character - read is since the loop will terminate regardless. */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { + /* miss: check if next character is part of pattern */ + if (i + 1 <= w && !STRINGLIB_BLOOM(mask, ss[i+1])) { i = i + m; } else { @@ -685,10 +676,8 @@ STRINGLIB(adaptive_find)(const STRINGLIB_CHAR* s, Py_ssize_t n, } } else { - /* Skip: check if next character is part of pattern. - See comment above re safety of accessing ss[i+1] when i == w. - */ - if (!STRINGLIB_BLOOM(mask, ss[i+1])) { + /* skip: check if next character is part of pattern */ + if (i + 1 <= w && !STRINGLIB_BLOOM(mask, ss[i+1])) { i = i + m; } } From 9884ec1af43684b1127e416b2980f3074214b7d6 Mon Sep 17 00:00:00 2001 From: Duane Griffin Date: Mon, 14 Jul 2025 00:17:53 +1200 Subject: [PATCH 7/7] Always run test, not just under ASAN --- Lib/test/string_tests.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 5cd3d7a62274bf..1814a55b74ea0c 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -4,7 +4,6 @@ import unittest, string, sys, struct from test import support -from test.support import check_sanitizer from test.support import import_helper from collections import UserList import random @@ -768,7 +767,6 @@ def test_replace(self): self.checkraises(TypeError, 'hello', 'replace', 42, 'h') self.checkraises(TypeError, 'hello', 'replace', 'h', 42) - @unittest.skipUnless(check_sanitizer(address=True), "AddressSanitizer required") def test_replacement_on_buffer_boundary(self): # gh-127971: Check we don't read past the end of the buffer when a # potential match misses on the last character.

pFad - (p)hone/(F)rame/(a)nonymizer/(d)eclutterfier! Saves Data!