Skip to content

Commit c0ece3d

Browse files
authored
GH-102613: Improve performance of pathlib.Path.rglob() (GH-104244)
Stop de-duplicating results in `_RecursiveWildcardSelector`. A new `_DoubleRecursiveWildcardSelector` class is introduced which performs de-duplication, but this is used _only_ for patterns with multiple non-adjacent `**` segments, such as `path.glob('**/foo/**')`. By avoiding the use of a set, `PurePath.__hash__()` is not called, and so paths do not need to be stringified and case-normalised. Also merge adjacent `**` segments in patterns.
1 parent 8d95012 commit c0ece3d

File tree

3 files changed

+45
-18
lines changed

3 files changed

+45
-18
lines changed

Lib/pathlib.py

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -64,17 +64,25 @@ def _is_case_sensitive(flavour):
6464
@functools.lru_cache()
6565
def _make_selector(pattern_parts, flavour, case_sensitive):
6666
pat = pattern_parts[0]
67-
child_parts = pattern_parts[1:]
6867
if not pat:
6968
return _TerminatingSelector()
7069
if pat == '**':
71-
cls = _RecursiveWildcardSelector
72-
elif pat == '..':
73-
cls = _ParentSelector
74-
elif '**' in pat:
75-
raise ValueError("Invalid pattern: '**' can only be an entire path component")
70+
child_parts_idx = 1
71+
while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**':
72+
child_parts_idx += 1
73+
child_parts = pattern_parts[child_parts_idx:]
74+
if '**' in child_parts:
75+
cls = _DoubleRecursiveWildcardSelector
76+
else:
77+
cls = _RecursiveWildcardSelector
7678
else:
77-
cls = _WildcardSelector
79+
child_parts = pattern_parts[1:]
80+
if pat == '..':
81+
cls = _ParentSelector
82+
elif '**' in pat:
83+
raise ValueError("Invalid pattern: '**' can only be an entire path component")
84+
else:
85+
cls = _WildcardSelector
7886
return cls(pat, child_parts, flavour, case_sensitive)
7987

8088

@@ -183,20 +191,32 @@ def _iterate_directories(self, parent_path, scandir):
183191

184192
def _select_from(self, parent_path, scandir):
185193
try:
186-
yielded = set()
187-
try:
188-
successor_select = self.successor._select_from
189-
for starting_point in self._iterate_directories(parent_path, scandir):
190-
for p in successor_select(starting_point, scandir):
191-
if p not in yielded:
192-
yield p
193-
yielded.add(p)
194-
finally:
195-
yielded.clear()
194+
successor_select = self.successor._select_from
195+
for starting_point in self._iterate_directories(parent_path, scandir):
196+
for p in successor_select(starting_point, scandir):
197+
yield p
196198
except PermissionError:
197199
return
198200

199201

202+
class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector):
203+
"""
204+
Like _RecursiveWildcardSelector, but also de-duplicates results from
205+
successive selectors. This is necessary if the pattern contains
206+
multiple non-adjacent '**' segments.
207+
"""
208+
209+
def _select_from(self, parent_path, scandir):
210+
yielded = set()
211+
try:
212+
for p in super()._select_from(parent_path, scandir):
213+
if p not in yielded:
214+
yield p
215+
yielded.add(p)
216+
finally:
217+
yielded.clear()
218+
219+
200220
#
201221
# Public API
202222
#

Lib/test/test_pathlib.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1853,13 +1853,14 @@ def _check(path, pattern, case_sensitive, expected):
18531853

18541854
def test_rglob_common(self):
18551855
def _check(glob, expected):
1856-
self.assertEqual(set(glob), { P(BASE, q) for q in expected })
1856+
self.assertEqual(sorted(glob), sorted(P(BASE, q) for q in expected))
18571857
P = self.cls
18581858
p = P(BASE)
18591859
it = p.rglob("fileA")
18601860
self.assertIsInstance(it, collections.abc.Iterator)
18611861
_check(it, ["fileA"])
18621862
_check(p.rglob("fileB"), ["dirB/fileB"])
1863+
_check(p.rglob("**/fileB"), ["dirB/fileB"])
18631864
_check(p.rglob("*/fileA"), [])
18641865
if not os_helper.can_symlink():
18651866
_check(p.rglob("*/fileB"), ["dirB/fileB"])
@@ -1883,9 +1884,12 @@ def _check(glob, expected):
18831884
_check(p.rglob("*"), ["dirC/fileC", "dirC/novel.txt",
18841885
"dirC/dirD", "dirC/dirD/fileD"])
18851886
_check(p.rglob("file*"), ["dirC/fileC", "dirC/dirD/fileD"])
1887+
_check(p.rglob("**/file*"), ["dirC/fileC", "dirC/dirD/fileD"])
1888+
_check(p.rglob("dir*/**"), ["dirC/dirD"])
18861889
_check(p.rglob("*/*"), ["dirC/dirD/fileD"])
18871890
_check(p.rglob("*/"), ["dirC/dirD"])
18881891
_check(p.rglob(""), ["dirC", "dirC/dirD"])
1892+
_check(p.rglob("**"), ["dirC", "dirC/dirD"])
18891893
# gh-91616, a re module regression
18901894
_check(p.rglob("*.txt"), ["dirC/novel.txt"])
18911895
_check(p.rglob("*.*"), ["dirC/novel.txt"])
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Improve performance of :meth:`pathlib.Path.glob` when expanding recursive
2+
wildcards ("``**``") by merging adjacent wildcards and de-duplicating
3+
results only when necessary.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy