-
-
Notifications
You must be signed in to change notification settings - Fork 32.4k
GH-115512: Optimize peak memory usage and runtime for large emails #132709
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
da36214
c2eb551
d6233e9
530f6d4
13ebb39
2f6002e
4fa6755
4f36227
d8fa697
cf56ff2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -112,10 +112,10 @@ def _flush_partial(self): | |
if not line: | ||
pass | ||
elif self._dump_destination is None: | ||
# We're not dumping data. Just flush the partial to lines, as normal | ||
# We're not dumping data. Just flush the partial to lines | ||
self._lines.append(line) | ||
elif self._check_eofstack(line): | ||
# We were dumping, but we've now reached the end of the dump. Push our line and stop dumping. | ||
# We were dumping, but we've now reached the end of the dump. | ||
self._dump_destination = None | ||
self._lines.append(line) | ||
else: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For my personal style, I'm not like to use
So I prefer to create an enum here to represent the four different statuses and use a match-case statement here. I think this will make the status semantics explicit and make the code more readable |
||
|
@@ -130,7 +130,6 @@ def push(self, data): | |
if not data: | ||
return | ||
|
||
# If we're dumping, and we don't have anything that will ever tell us to terminate, simply dump everything | ||
if self._can_dump_data(data): | ||
self._dump_destination.append(data) | ||
return | ||
|
@@ -139,12 +138,11 @@ def push(self, data): | |
|
||
def _can_dump_data(self, data): | ||
if self._dump_destination is None: | ||
# We're not dumping data | ||
return False | ||
|
||
# We're dumping; check for easy optimizations | ||
JAJames marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if not self._eofstack: | ||
# There's nothing that will ever tell us to stop dumping. Go ahead and dump the entire `data` object. | ||
# There's nothing that will ever tell us to stop dumping. | ||
# This does absolute wonders for large non-multipart emails. | ||
assert not self._lines | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure we need to use the assert here. Normally the assert will raise an exception here. So in other words, I'm not sure we need to raise an exception here. If we don't need to raise an exception here, I think we can simplify the code here. If we need to raise an exception here, I think it would be better to customize a new Exception and raise it with some meaningful message, not just a simple assert error There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The assertion here is to protect against future changes, as it would be indicative of a programmatic error in the email library specifically. If The assertion itself could be removed entirely, as it should never fail, but I fear that could introduce risk of programmatic errors later. Edit: forgot to mention the |
||
assert not self._dangling_partial | ||
|
@@ -155,56 +153,44 @@ def _can_dump_data(self, data): | |
if self._partial: | ||
return False | ||
|
||
all_boundary_matches = True | ||
for pred in self._eofstack: | ||
if not hasattr(pred, 'is_boundary_match'): | ||
all_boundary_matches = False | ||
break | ||
|
||
if all_boundary_matches and '-' not in data: | ||
# We eventually need to stop, but we only care about boundary matches, and there's no boundaries | ||
# here. Dump the entire `data` object. This does wonders for multipart emails with large parts. | ||
assert not self._lines | ||
return True | ||
|
||
# We're still dumping, but there's a potential boundary marker or EOF or similar issue. Force a proper parse. | ||
return False | ||
# We can't blindly dump entire chunks, if we're interested in | ||
# more than just boundaries | ||
return False | ||
|
||
# We only care about boundaries; we can dump as long as there's no | ||
# potential boundaries. | ||
return '-' not in data | ||
|
||
def _can_dump_partial(self, line, start=0, end=sys.maxsize): | ||
# Very similar to _can_dump_data above, except we can make some additional assumptions for partials/lines. | ||
# This should only ever be checked when we have a new partial line, in which case we have no partial, | ||
# or when checking the partial itself, in which case it'll always be the first part | ||
# Very similar to _can_dump_data above, except we can make some | ||
# additional assumptions for partials/lines. | ||
assert not self._partial or line is self._partial[0] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The same with previous comment |
||
|
||
if self._dump_destination is None: | ||
# We're not dumping data | ||
return False | ||
|
||
# We're dumping. There should be absolutely no other pending lines, because those should've been dumped. | ||
# We're dumping. There should be absolutely no other pending lines, | ||
# because those should've been dumped. | ||
assert not self._lines | ||
if not self._eofstack: | ||
# There's nothing that will ever tell us to stop dumping. Dump away. | ||
# There's nothing that will ever tell us to stop dumping. Dump away | ||
return True | ||
|
||
all_boundary_matches = True | ||
JAJames marked this conversation as resolved.
Show resolved
Hide resolved
|
||
for pred in self._eofstack: | ||
if not hasattr(pred, 'is_boundary_match'): | ||
all_boundary_matches = False | ||
break | ||
|
||
if all_boundary_matches and not line.startswith("-", start, end): | ||
# We eventually need to stop, but we only care about boundary matches, and there's no boundaries | ||
# here. Dump the entire `data` object. This does wonders for multipart emails with large parts. | ||
return True | ||
return False | ||
|
||
# We're still dumping, but there's a potential boundary marker or EOF or similar issue. Force a proper parse. | ||
return False | ||
# We only care about boundaries; we can dump as long as there's no | ||
# potential boundaries. | ||
return not line.startswith("-", start, end) | ||
|
||
def _is_dump_midline(self): | ||
if not self._dump_destination: | ||
return False | ||
|
||
assert self._dump_destination[-1] # Never push empty strings to _dump_destination | ||
return self._dump_destination[-1][-1] not in ('\n', '\r') | ||
|
||
def _push_data(self, data): | ||
|
@@ -214,8 +200,9 @@ def _push_data(self, data): | |
# No new complete lines, wait for more. | ||
# Check to see if we had a previous dangling partial newline | ||
if self._dangling_partial: | ||
# We previously pushed a dangling line expecting a \n to follow, however we received other data instead. | ||
# Therefore, that \r does actually terminate a line. Go ahead and push it. | ||
# We previously pushed a dangling line expecting \n to follow, | ||
# however we received other data instead. Therefore, that \r | ||
# does actually terminate a line. Go ahead and push it. | ||
self._flush_partial() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One way to reduce indentation, and make it easier to read perhaps, is to actually handle |
||
|
||
# No lines in data to push; wait for more data | ||
|
@@ -230,24 +217,23 @@ def _push_data(self, data): | |
|
||
JAJames marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# Complete our previous/partial line | ||
JAJames marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if self._partial: | ||
# Check to see if we had any dangling newlines in our partial, and handle if appropriate | ||
if self._dangling_partial: | ||
# We had a previously dangling line; this is either a \n (completion), or some other char (termination) | ||
if data[0] != NL: | ||
# "\r<whatever>" -- push what we had, as it has been terminated; data_start_index = 0 | ||
# "\r<whatever>" -- push what we had, it's been terminated | ||
self._flush_partial() | ||
else: | ||
# "\r\n" -- append \n and push it; data_start_index = 1 | ||
# "\r\n" -- append \n to complete it and push | ||
self._partial.append(NL) | ||
self._flush_partial() | ||
data_start_index = 1 | ||
|
||
# Find the next newline | ||
JAJames marked this conversation as resolved.
Show resolved
Hide resolved
|
||
unl_start_index = BufferedSubFile._find_unl(data, data_start_index) | ||
# Fall through | ||
unl_start_index = BufferedSubFile._find_unl( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can invoke static methods with |
||
data, data_start_index) | ||
else: | ||
# Our partial has no dangling newline; complete our partial with the new line and push it | ||
unl_end_index = BufferedSubFile._find_unl_end(data, unl_start_index) | ||
# Complete our partial with the new line and push it | ||
unl_end_index = BufferedSubFile._find_unl_end( | ||
data, unl_start_index) | ||
if unl_end_index < 0: | ||
# The newline is incomplete; append data and return | ||
self._partial.append(data) | ||
|
@@ -260,36 +246,37 @@ def _push_data(self, data): | |
data_start_index = unl_end_index | ||
|
||
# Find the next newline | ||
unl_start_index = BufferedSubFile._find_unl(data, data_start_index) | ||
# Fall through | ||
unl_start_index = BufferedSubFile._find_unl( | ||
data, data_start_index) | ||
|
||
# _partial is now guaranteed to point to be empty | ||
# data_start_index is an index which points to the start of the next line | ||
# unl_start_index is an index which points to the start of the next newline character, if there is one | ||
# data_start_index is an index which points to the start of next line | ||
# unl_start_index is the start of the next newline character, or -1 | ||
self._push_data_no_partial(data, data_start_index, unl_start_index) | ||
|
||
def _push_data_no_partial(self, data, data_start_index, unl_start_index): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function is very hard to understand: I suggest having dedicated functions for each of the cases as well. |
||
# _partial is now guaranteed to point to be empty | ||
# data_start_index is an index which points to the start of the next line | ||
# unl_start_index is an index which points to the start of the next newline character, if there is one | ||
|
||
# Process any remaining whole lines in data | ||
if unl_start_index < 0: | ||
# Push right to the partial if there's no lines | ||
if data_start_index < len(data): | ||
assert data_start_index >= 0 | ||
partial_line = data[data_start_index:] | ||
if self._is_dump_midline() or self._can_dump_partial(partial_line): | ||
if self._is_dump_midline() \ | ||
or self._can_dump_partial(partial_line): | ||
self._dump_destination.append(partial_line) | ||
else: | ||
self._partial = [partial_line] | ||
if data[-1] == '\r': | ||
self._dangling_partial = True | ||
elif self._dump_destination is None and unl_start_index < len(data) // 2: | ||
# If it looks like we're going to be doing a lot of splits/joins, just go ahead and use StringIO, for speed | ||
# If we had some sort of "StringViewIO" to avoid the copy, this would be significantly more efficient | ||
# This code block, and the "else" code block below, functionally do the exact same thing, except this path | ||
# makes no attempt to handle dumping data | ||
elif self._dump_destination is None \ | ||
and unl_start_index < len(data) // 2: | ||
# If it looks like we're going to be doing a lot of splits/joins, | ||
# just go ahead and use StringIO, for speed | ||
# If we had some sort of "StringViewIO" to avoid the copy, this | ||
# would be significantly more efficient | ||
# This code block, and the "else" code block below, functionally do | ||
# the exact same thing, except this path makes no attempt to handle | ||
# dumping data | ||
sio = StringIO(data, '') | ||
sio.seek(data_start_index) | ||
lines = sio.readlines() | ||
|
@@ -301,26 +288,28 @@ def _push_data_no_partial(self, data, data_start_index, unl_start_index): | |
|
||
JAJames marked this conversation as resolved.
Show resolved
Hide resolved
|
||
self.pushlines(lines) | ||
else: | ||
# If we're not, let's keep it in Python | ||
dump_data_start = None if self._dump_destination is None else data_start_index | ||
dump_data_start = None if self._dump_destination is None \ | ||
else data_start_index | ||
while unl_start_index >= 0: | ||
unl_end_index = BufferedSubFile._find_unl_end(data, unl_start_index) | ||
unl_end_index = BufferedSubFile._find_unl_end( | ||
data, unl_start_index) | ||
if unl_end_index < 0: | ||
# Incomplete line ending; break to update our partial and return | ||
# Incomplete line ending; break to just update our partial | ||
self._dangling_partial = True | ||
break | ||
|
||
# We have an easy line; push it | ||
if self._dump_destination is not None: | ||
# We have a window into a line. Make sure it's not EOF, and continue as long as it's not | ||
if self._check_eofstack(data, data_start_index, unl_end_index): | ||
# This line is "EOF". This is the end of our dump data! Push the dump data. | ||
self._dump_destination.append(data[dump_data_start:data_start_index]) | ||
# We have a window into a line. Make sure it's not EOF | ||
if self._check_eofstack( | ||
data, data_start_index, unl_end_index): | ||
# This line is "EOF". This is the end of our dump data | ||
self._dump_destination.append( | ||
data[dump_data_start:data_start_index]) | ||
|
||
# Also push our line, since we already have it | ||
self._lines.append(data[data_start_index:unl_end_index]) | ||
|
||
# Mark dump complete | ||
self._lines.append( | ||
data[data_start_index:unl_end_index]) | ||
self._dump_destination = None | ||
#else: # This line didn't mark the end. Keep going. | ||
else: | ||
|
@@ -329,21 +318,23 @@ def _push_data_no_partial(self, data, data_start_index, unl_start_index): | |
|
||
# Update our iterators | ||
data_start_index = unl_end_index | ||
unl_start_index = BufferedSubFile._find_unl(data, data_start_index) | ||
unl_start_index = BufferedSubFile._find_unl( | ||
data, data_start_index) | ||
|
||
# If we're still dumping, push everything that isn't going into the partial to the dump | ||
if self._dump_destination is not None: | ||
# If we're able to safely flush the partial, go ahead and do that too | ||
# We don't care about self._is_dump_midline() here, because data_start_index always represents the | ||
# start of a new line, always | ||
# Push everything that isn't going into the partial to the dump | ||
# If we're able to safely flush the partial, do that too | ||
# We don't care about self._is_dump_midline() here, because | ||
# data_start_index always represents the start of a new line | ||
if self._can_dump_partial(data, data_start_index): | ||
self._dump_destination.append(data[dump_data_start:]) | ||
|
||
# We've consumed the partial; flush any partial-related state we may have set | ||
# Flush any partial-related state we may have set | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We're not really flushing the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We may have set |
||
self._dangling_partial = False | ||
return # skip the _partial.append below, because it's already been consumed | ||
return # skip the _partial.append below | ||
else: | ||
self._dump_destination.append(data[dump_data_start:data_start_index]) | ||
self._dump_destination.append( | ||
data[dump_data_start:data_start_index]) | ||
|
||
# If we have any partial data leftover, go ahead and set it | ||
if data_start_index < len(data): | ||
|
@@ -381,7 +372,6 @@ def _get_dump(self, start_value:str|None = None): | |
if needs_more_data: | ||
# Flush our partial, if we can | ||
if self._partial and self._can_dump_partial(self._partial[0]): | ||
assert self._partial[0] # We shouldn't ever push empty strings to _partial | ||
_dump_destination.extend(self._partial) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should it be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we're only checking the partial, we only need to check the very first character of the partial (to see if it's a The extend() here just ensures we're definitely grabbing everything, since the partial could be many parts if it's a very long partial line. |
||
self._partial.clear() | ||
self._dangling_partial = False | ||
|
@@ -402,8 +392,9 @@ def _pop_dump(self): | |
@staticmethod | ||
def _find_unl(data, start=0): | ||
# Like str.find(), but for universal newlines | ||
# Originally, this iterated over the string, however just calling find() twice is drastically faster | ||
# This could be sped up by replacing with a similar function in C, so we don't pass over the string twice. | ||
# Originally, this iterated over the string, however this is faster | ||
# This could be sped up by replacing with a similar function in C, | ||
# so we don't pass over the string twice. | ||
cr_index = data.find('\r', start) | ||
if cr_index < 0: | ||
return data.find(NL, start) | ||
|
@@ -413,9 +404,8 @@ def _find_unl(data, start=0): | |
|
||
@staticmethod | ||
def _find_unl_end(data, start): | ||
# A helper function which returns the 1-past-the-end index of a universal newline | ||
# Returns the 1-past-the-end index of a universal newline | ||
# This could be sped up by replacing with a similar function in C. | ||
#assert data[start] in '\r\n' | ||
|
||
# \n is always end of line | ||
if data.startswith(NL, start): | ||
|
@@ -425,7 +415,7 @@ def _find_unl_end(data, start): | |
if data.startswith(NL, start + 1): | ||
return start + 2 | ||
|
||
# End of string; we can't know if a \n follows, so no universal line end | ||
# End of data; we can't know if a \n follows, so no universal line end | ||
if start + 1 >= len(data): | ||
return -1 | ||
|
||
|
@@ -461,7 +451,7 @@ def __init__(self, _factory=None, *, policy=compat32): | |
self._old_style_factory = True | ||
self._input = BufferedSubFile() | ||
self._msgstack = [] | ||
self._parse = self._parsegen().__next__ # Interesting trick which replaces yield values with return values | ||
self._parse = self._parsegen().__next__ | ||
self._cur = None | ||
self._last = None | ||
self._headersonly = False | ||
|
@@ -477,7 +467,7 @@ def feed(self, data): | |
|
||
def _call_parse(self): | ||
try: | ||
self._parse() # Return value is always NeedMoreData or None, but discarded here in either case | ||
self._parse() | ||
except StopIteration: | ||
pass | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.