diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py
index 9d80a5822af48d..c872381d962eab 100644
--- a/Lib/email/feedparser.py
+++ b/Lib/email/feedparser.py
@@ -22,6 +22,7 @@
 __all__ = ['FeedParser', 'BytesFeedParser']
 
 import re
+import sys
 
 from email import errors
 from email._policybase import compat32
@@ -52,15 +53,16 @@ class BufferedSubFile(object):
     simple abstraction -- it parses until EOF closes the current message.
     """
     def __init__(self):
-        # Text stream of the last partial line pushed into this object.
-        # See issue 22233 for why this is a text stream and not a list.
-        self._partial = StringIO(newline='')
+        self._partial = []
+        self._dangling_partial = False
         # A deque of full, pushed lines
         self._lines = deque()
         # The stack of false-EOF checking predicates.
        self._eofstack = []
         # A flag indicating whether the file has been closed or not.
         self._closed = False
+        self._dump_destination = None
+        self._dump_result = None
 
     def push_eof_matcher(self, pred):
         self._eofstack.append(pred)
@@ -70,10 +72,8 @@ def pop_eof_matcher(self):
 
     def close(self):
         # Don't forget any trailing partial line.
-        self._partial.seek(0)
-        self.pushlines(self._partial.readlines())
-        self._partial.seek(0)
-        self._partial.truncate()
+        if self._partial:
+            self._flush_partial()
         self._closed = True
 
     def readline(self):
@@ -87,40 +87,253 @@ def readline(self):
         # RFC 2046, section 5.1.2 requires us to recognize outer level
         # boundaries at any level of inner nesting.  Do this, but be sure it's
         # in the order of most to least nested.
-        for ateof in reversed(self._eofstack):
-            if ateof(line):
-                # We're at the false EOF.  But push the last line back first.
-                self._lines.appendleft(line)
-                return ''
+        if self._check_eofstack(line):
+            # We're at the false EOF.  But push the last line back first.
+            self._lines.appendleft(line)
+            return ''
         return line
 
+    def _check_eofstack(self, data, start=0, end=sys.maxsize):
+        # check if we can find a dummy EOF
+        return any(
+            ateof(data, start, end)
+            for ateof in reversed(self._eofstack)
+        )
+
     def unreadline(self, line):
         # Let the consumer push a line back into the buffer.
         assert line is not NeedMoreData
         self._lines.appendleft(line)
 
+    def _flush_partial(self):
+        line = EMPTYSTRING.join(self._partial)
+        if not line:
+            pass
+        elif self._dump_destination is None:
+            # We're not dumping data.  Just flush the partial to lines.
+            self._lines.append(line)
+        elif self._check_eofstack(line):
+            # We were dumping, but we've now reached the end of the dump.
+            self._dump_destination = None
+            self._lines.append(line)
+        else:
+            # We're still dumping; push to dump
+            self._dump_destination.append(line)
+
+        self._partial.clear()
+        self._dangling_partial = False
+
     def push(self, data):
         """Push some new data into this object."""
-        self._partial.write(data)
-        if '\n' not in data and '\r' not in data:
-            # No new complete lines, wait for more.
+        if not data:
+            pass
+        elif self._can_dump_data(data):
+            self._dump_destination.append(data)
+        else:
+            self._push_data(data)
+
+    def _can_dump_data(self, data):
+        if self._dump_destination is None:
+            return False
+
+        # We're dumping; check for easy optimizations.
+        if not self._eofstack:
+            # There's nothing that will ever tell us to stop dumping.
+            # This does absolute wonders for large non-multipart emails.
+            assert not self._lines
+            assert not self._dangling_partial
+            assert not self._partial
+            return True
+
+        # We can't dump this blob if we have pending partial data
+        if self._partial:
+            return False
+
+        for pred in self._eofstack:
+            if not hasattr(pred, 'is_boundary_match'):
+                # We can't blindly dump entire chunks, if we're interested in
+                # more than just boundaries
+                return False
+
+        # We only care about boundaries; we can dump as long as there's no
+        # potential boundaries.
+        return '-' not in data
+
+    def _can_dump_partial(self, line, start=0, end=sys.maxsize):
+        # Very similar to _can_dump_data above, except we can make some
+        # additional assumptions for partials/lines.
+        assert not self._partial or line is self._partial[0]
+
+        if self._dump_destination is None:
+            return False
+
+        # We're dumping.  There should be absolutely no other pending lines,
+        # because those should've been dumped.
+        assert not self._lines
+        if not self._eofstack:
+            # There's nothing that will ever tell us to stop dumping.  Dump away
+            return True
+
+        for pred in self._eofstack:
+            if not hasattr(pred, 'is_boundary_match'):
+                return False
+
+        # We only care about boundaries; we can dump as long as there's no
+        # potential boundaries.
+        return not line.startswith("-", start, end)
+
+    def _is_dump_midline(self):
+        if not self._dump_destination:
+            return False
+        return self._dump_destination[-1][-1] not in ('\n', '\r')
+
+    def _push_data(self, data):
+        # Find first newline character in the data
+        unl_start_index = BufferedSubFile._find_unl(data)
+        if unl_start_index < 0:  # No new complete lines, wait for more.
+            # Check to see if we had a previous dangling partial newline
+            if self._dangling_partial:
+                # We previously pushed a dangling line expecting \n to follow,
+                # however we received other data instead.  Therefore, that \r
+                # does actually terminate a line.  Go ahead and push it.
+                self._flush_partial()
+
+            # No lines in data to push; wait for more data
+            if self._is_dump_midline():
+                assert not self._partial
+                self._dump_destination.append(data)
+            else:
+                self._partial.append(data)
             return
 
-        # Crack into lines, preserving the linesep characters.
-        self._partial.seek(0)
-        parts = self._partial.readlines()
-        self._partial.seek(0)
-        self._partial.truncate()
+        data_start_index = 0
+        # Complete our previous/partial line.
+        if self._partial:
+            if self._dangling_partial:
+                if data[0] != NL:
+                    # "\r" -- push what we had, it's been terminated
+                    self._flush_partial()
+                else:
+                    # "\r\n" -- append \n to complete it and push
+                    self._partial.append(NL)
+                    self._flush_partial()
+                    data_start_index = 1
 
-        # If the last element of the list does not end in a newline, then treat
-        # it as a partial line.  We only check for '\n' here because a line
-        # ending with '\r' might be a line that was split in the middle of a
-        # '\r\n' sequence (see bugs 1555570 and 1721862).
-        if not parts[-1].endswith('\n'):
-            self._partial.write(parts.pop())
-        self.pushlines(parts)
+                unl_start_index = BufferedSubFile._find_unl(
+                    data, data_start_index)
+            else:
+                # Complete our partial with the new line and push it
+                unl_end_index = BufferedSubFile._find_unl_end(
+                    data, unl_start_index)
+                if unl_end_index < 0:
+                    # The newline is incomplete; append data and return
+                    self._partial.append(data)
+                    self._dangling_partial = True
+                    return
+
+                # We have a complete line; append it and flush _partial
+                self._partial.append(data[data_start_index:unl_end_index])
+                self._flush_partial()
+                data_start_index = unl_end_index
+
+                # Find the next newline
+                unl_start_index = BufferedSubFile._find_unl(
+                    data, data_start_index)
+
+        # _partial is now guaranteed to be empty
+        # data_start_index is an index which points to the start of next line
+        # unl_start_index is the start of the next newline character, or -1
+        self._push_data_no_partial(data, data_start_index, unl_start_index)
+
+    def _push_data_no_partial(self, data, data_start_index, unl_start_index):
+        # Process any remaining whole lines in data
+        if unl_start_index < 0:
+            # Push right to the partial if there's no lines
+            if data_start_index < len(data):
+                assert data_start_index >= 0
+                partial_line = data[data_start_index:]
+                if self._is_dump_midline() \
+                        or self._can_dump_partial(partial_line):
+                    self._dump_destination.append(partial_line)
+                else:
+                    self._partial = [partial_line]
+                    if data[-1] == '\r':
+                        self._dangling_partial = True
+        elif self._dump_destination is None \
+                and unl_start_index < len(data) // 2:
+            # If it looks like we're going to be doing a lot of splits/joins,
+            # just go ahead and use StringIO, for speed
+            # If we had some sort of "StringViewIO" to avoid the copy, this
+            # would be significantly more efficient
+            # This code block, and the "else" code block below, functionally do
+            # the exact same thing, except this path makes no attempt to handle
+            # dumping data
+            sio = StringIO(data, '')
+            sio.seek(data_start_index)
+            lines = sio.readlines()
+            if lines:
+                if data[-1] != '\n':
+                    self._partial.append(lines.pop())
+                    if data[-1] == '\r':
+                        self._dangling_partial = True
+                self.pushlines(lines)
+        else:
+            dump_data_start = None if self._dump_destination is None \
+                else data_start_index
+            while unl_start_index >= 0:
+                unl_end_index = BufferedSubFile._find_unl_end(
+                    data, unl_start_index)
+                if unl_end_index < 0:
+                    # Incomplete line ending; break to just update our partial
+                    self._dangling_partial = True
+                    break
+
+                # We have an easy line; push it
+                if self._dump_destination is not None:
+                    # We have a window into a line.  Make sure it's not EOF
+                    if self._check_eofstack(
+                            data, data_start_index, unl_end_index):
+                        # This line is "EOF".  This is the end of our dump data
+                        self._dump_destination.append(
+                            data[dump_data_start:data_start_index])
+
+                        # Also push our line, since we already have it
+                        self._lines.append(
+                            data[data_start_index:unl_end_index])
+                        self._dump_destination = None
+                    #else:  # This line didn't mark the end.  Keep going.
+                else:
+                    # We're not dumping.  Just go ahead and push the line
+                    self._lines.append(data[data_start_index:unl_end_index])
+
+                # Update our iterators
+                data_start_index = unl_end_index
+                unl_start_index = BufferedSubFile._find_unl(
+                    data, data_start_index)
+
+            if self._dump_destination is not None:
+                # Push everything that isn't going into the partial to the dump
+                # If we're able to safely flush the partial, do that too
+                # We don't care about self._is_dump_midline() here, because
+                # data_start_index always represents the start of a new line
+                if self._can_dump_partial(data, data_start_index):
+                    self._dump_destination.append(data[dump_data_start:])
+
+                    # Flush any partial-related state we may have set
+                    self._dangling_partial = False
+                    return  # skip the _partial.append below
+                else:
+                    self._dump_destination.append(
+                        data[dump_data_start:data_start_index])
+
+            # If we have any partial data leftover, go ahead and set it
+            if data_start_index < len(data):
+                self._partial.append(data[data_start_index:])
 
     def pushlines(self, lines):
+        # This method is not documented on docs.python.org
         self._lines.extend(lines)
 
     def __iter__(self):
@@ -132,6 +345,71 @@ def __next__(self):
             raise StopIteration
         return line
 
+    def _get_dump(self, start_value=None):
+        _dump_destination = deque()
+        self._dump_destination = _dump_destination
+
+        if start_value:
+            _dump_destination.append(start_value)
+
+        # Flush our current _lines to _dump_destination
+        needs_more_data = False
+        for line in self:
+            if line is NeedMoreData:
+                needs_more_data = True
+                break
+            _dump_destination.append(line)
+
+        # Pull in more data, if we need more
+        if needs_more_data:
+            # Flush our partial, if we can
+            if self._partial and self._can_dump_partial(self._partial[0]):
+                _dump_destination.extend(self._partial)
+                self._partial.clear()
+                self._dangling_partial = False
+
+            # Pull in more data until we're told to stop
+            while not self._closed and self._dump_destination is not None:
+                yield NeedMoreData
+
+        # Flush our final dump string to _dump_result
+        self._dump_destination = None
+        self._dump_result = EMPTYSTRING.join(_dump_destination)
+
+    def _pop_dump(self):
+        result = self._dump_result
+        self._dump_result = None
+        return result
+
+    @staticmethod
+    def _find_unl(data, start=0):
+        # Like str.find(), but for universal newlines
+        # Originally, this iterated over the string, however this is faster
+        # This could be sped up by replacing with a similar function in C,
+        # so we don't pass over the string twice.
+        cr_index = data.find('\r', start)
+        if cr_index < 0:
+            return data.find(NL, start)
+        nl_index = data.find(NL, start, cr_index)
+        return nl_index if nl_index >= 0 else cr_index
+
+    @staticmethod
+    def _find_unl_end(data, start):
+        # Returns the 1-past-the-end index of a universal newline
+        # This could be sped up by replacing with a similar function in C.
+
+        # \n is always end of line
+        if data.startswith(NL, start):
+            return start + 1
+        # \r\n is always end of line
+        if data.startswith(NL, start + 1):
+            return start + 2
+        # End of data; we can't know if a \n follows, so no universal line end
+        if start + 1 >= len(data):
+            return -1
+        # This is a \r followed by some other non-newline character
+        return start + 1
+
 
 class FeedParser:
     """A feed-style parser of email."""
@@ -242,16 +520,8 @@ def _parsegen(self):
         # necessary in the older parser, which could raise errors.  All
         # remaining lines in the input are thrown into the message body.
         if self._headersonly:
-            lines = []
-            while True:
-                line = self._input.readline()
-                if line is NeedMoreData:
-                    yield NeedMoreData
-                    continue
-                if line == '':
-                    break
-                lines.append(line)
-            self._cur.set_payload(EMPTYSTRING.join(lines))
+            yield from self._input._get_dump()
+            self._cur.set_payload(self._input._pop_dump())
             return
         if self._cur.get_content_type() == 'message/delivery-status':
             # message/delivery-status contains blocks of headers separated by
@@ -311,13 +581,8 @@ def _parsegen(self):
                 # defective.
                 defect = errors.NoBoundaryInMultipartDefect()
                 self.policy.handle_defect(self._cur, defect)
-                lines = []
-                for line in self._input:
-                    if line is NeedMoreData:
-                        yield NeedMoreData
-                        continue
-                    lines.append(line)
-                self._cur.set_payload(EMPTYSTRING.join(lines))
+                yield from self._input._get_dump()
+                self._cur.set_payload(self._input._pop_dump())
                 return
             # Make sure a valid content type was specified per RFC 2045:6.4.
             if (str(self._cur.get('content-transfer-encoding', '8bit')).lower()
@@ -329,10 +594,11 @@ def _parsegen(self):
             # this onto the input stream until we've scanned past the
             # preamble.
             separator = '--' + boundary
-            def boundarymatch(line):
-                if not line.startswith(separator):
+            def boundarymatch(line, pos = 0, endpos = sys.maxsize):
+                if not line.startswith(separator, pos, endpos):
                     return None
-                return boundaryendRE.match(line, len(separator))
+                return boundaryendRE.match(line, pos + len(separator), endpos)
+            boundarymatch.is_boundary_match = True
             capturing_preamble = True
             preamble = []
             linesep = False
@@ -424,12 +690,11 @@ def boundarymatch(line):
                 defect = errors.StartBoundaryNotFoundDefect()
                 self.policy.handle_defect(self._cur, defect)
                 self._cur.set_payload(EMPTYSTRING.join(preamble))
-                epilogue = []
                 for line in self._input:
                     if line is NeedMoreData:
                         yield NeedMoreData
                         continue
-                self._cur.epilogue = EMPTYSTRING.join(epilogue)
+                self._cur.epilogue = ''
                 return
             # If we're not processing the preamble, then we might have seen
             # EOF without seeing that end boundary...that is also a defect.
@@ -440,34 +705,27 @@ def boundarymatch(line):
             # Everything from here to the EOF is epilogue.  If the end boundary
             # ended in a newline, we'll need to make sure the epilogue isn't
             # None
-            if linesep:
-                epilogue = ['']
-            else:
-                epilogue = []
-            for line in self._input:
-                if line is NeedMoreData:
-                    yield NeedMoreData
-                    continue
-                epilogue.append(line)
-            # Any CRLF at the front of the epilogue is not technically part of
-            # the epilogue.  Also, watch out for an empty string epilogue,
-            # which means a single newline.
-            if epilogue:
-                firstline = epilogue[0]
-                bolmo = NLCRE_bol.match(firstline)
-                if bolmo:
-                    epilogue[0] = firstline[len(bolmo.group(0)):]
-            self._cur.epilogue = EMPTYSTRING.join(epilogue)
+            first_line = ''
+            if not linesep:
+                for line in self._input:
+                    if line is NeedMoreData:
+                        yield NeedMoreData
+                        continue
+
+                    first_line = line
+                    if first_line:
+                        bolmo = NLCRE_bol.match(first_line)
+                        if bolmo:
+                            first_line = first_line[len(bolmo.group(0)):]
+                    break
+
+            yield from self._input._get_dump(first_line)
+            self._cur.epilogue = self._input._pop_dump()
             return
         # Otherwise, it's some non-multipart type, so the entire rest of the
         # file contents becomes the payload.
-        lines = []
-        for line in self._input:
-            if line is NeedMoreData:
-                yield NeedMoreData
-                continue
-            lines.append(line)
-        self._cur.set_payload(EMPTYSTRING.join(lines))
+        yield from self._input._get_dump()
+        self._cur.set_payload(self._input._pop_dump())
 
     def _parse_headers(self, lines):
         # Passed a list of lines that make up the headers for the current msg
diff --git a/Lib/email/parser.py b/Lib/email/parser.py
index 039f03cba74fa0..2914fac09894be 100644
--- a/Lib/email/parser.py
+++ b/Lib/email/parser.py
@@ -12,6 +12,8 @@
 from email.feedparser import FeedParser, BytesFeedParser
 from email._policybase import compat32
 
+_FEED_CHUNK_SIZE = 8192
+
 
 class Parser:
     def __init__(self, _class=None, *, policy=compat32):
@@ -38,6 +40,18 @@ def __init__(self, _class=None, *, policy=compat32):
         self._class = _class
         self.policy = policy
 
+    def _parse_chunks(self, chunk_generator, headersonly=False):
+        """Internal method / implementation detail
+
+        Parses chunks from a chunk generator into a FeedParser
+        """
+        feedparser = FeedParser(self._class, policy=self.policy)
+        if headersonly:
+            feedparser._set_headersonly()
+        for data in chunk_generator:
+            feedparser.feed(data)
+        return feedparser.close()
+
     def parse(self, fp, headersonly=False):
         """Create a message structure from the data in a file.
 
@@ -46,12 +60,12 @@ def parse(self, fp, headersonly=False):
         parsing after reading the headers or not.  The default is False,
         meaning it parses the entire contents of the file.
         """
-        feedparser = FeedParser(self._class, policy=self.policy)
-        if headersonly:
-            feedparser._set_headersonly()
-        while data := fp.read(8192):
-            feedparser.feed(data)
-        return feedparser.close()
+        def _fp_get_chunks():
+            while data := fp.read(_FEED_CHUNK_SIZE):
+                yield data
+        _chunk_generator = _fp_get_chunks()
+
+        return self._parse_chunks(_chunk_generator, headersonly)
 
     def parsestr(self, text, headersonly=False):
         """Create a message structure from a string.
@@ -61,7 +75,12 @@ def parsestr(self, text, headersonly=False):
         not.  The default is False, meaning it parses the entire
         contents of the file.
         """
-        return self.parse(StringIO(text), headersonly=headersonly)
+        _chunk_generator = (
+            text[offset:offset + _FEED_CHUNK_SIZE]
+            for offset in range(0, len(text), _FEED_CHUNK_SIZE)
+        )
+
+        return self._parse_chunks(_chunk_generator, headersonly)
 
 
 class HeaderParser(Parser):
@@ -115,8 +134,13 @@ def parsebytes(self, text, headersonly=False):
         not.  The default is False, meaning it parses the entire
         contents of the file.
""" - text = text.decode('ASCII', errors='surrogateescape') - return self.parser.parsestr(text, headersonly) + _chunk_generator = ( + text[offset:offset + _FEED_CHUNK_SIZE].decode( + 'ASCII', errors='surrogateescape') + for offset in range(0, len(text), _FEED_CHUNK_SIZE) + ) + + return self.parser._parse_chunks(_chunk_generator, headersonly) class BytesHeaderParser(BytesParser): diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index b8116d073a2670..109079b37da14e 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -4630,6 +4630,72 @@ def _idempotent(self, msg, data, unixfrom=False): g.flatten(msg, unixfrom=unixfrom, linesep=self.linesep) self.assertEqual(data, b.getvalue()) +class TestFeedParserTrickle(TestEmailBase): + @staticmethod + def _msgobj_trickle(filename, trickle_size=2, force_linetype="\r\n"): + # Trickle data into the feed parser, one character at a time + with openfile(filename, encoding="utf-8") as fp: + file_str = fp.read() + file_str = file_str.replace("\r\n", "\n").replace("\r", "\n") \ + .replace("\n", force_linetype) + + feedparser = FeedParser() + for index in range(0, len(file_str), trickle_size): + feedparser.feed(file_str[index:index + trickle_size]) + return feedparser.close() + + def _validate_msg10_msgobj(self, msg, line_end): + if isinstance(line_end, str): + line_end = line_end.encode() + eq = self.assertEqual + eq(msg.get_payload(decode=True), None) + eq(msg.get_payload(0).get_payload(decode=True), + b'This is a 7bit encoded message.' + line_end) + eq(msg.get_payload(1).get_payload(decode=True), + b'\xa1This is a Quoted Printable encoded message!' + line_end) + eq(msg.get_payload(2).get_payload(decode=True), + b'This is a Base64 encoded message.') + eq(msg.get_payload(3).get_payload(decode=True), + b'This is a Base64 encoded message.\n') + eq(msg.get_payload(4).get_payload(decode=True), + b'This has no Content-Transfer-Encoding: header.' + line_end) + + def test_trickle_1chr_crlf(self): + msg = self._msgobj_trickle('msg_10.txt', 1, '\r\n') + self._validate_msg10_msgobj(msg, '\r\n') + + def test_trickle_1chr_cr(self): + msg = self._msgobj_trickle('msg_10.txt', 1, '\r') + self._validate_msg10_msgobj(msg, '\r') + + def test_trickle_1chr_lf(self): + msg = self._msgobj_trickle('msg_10.txt', 1, '\n') + self._validate_msg10_msgobj(msg, '\n') + + def test_trickle_2chr_crlf(self): + msg = self._msgobj_trickle('msg_10.txt', 2, '\r\n') + self._validate_msg10_msgobj(msg, '\r\n') + + def test_trickle_2chr_cr(self): + msg = self._msgobj_trickle('msg_10.txt', 2, '\r') + self._validate_msg10_msgobj(msg, '\r') + + def test_trickle_2chr_lf(self): + msg = self._msgobj_trickle('msg_10.txt', 2, '\n') + self._validate_msg10_msgobj(msg, '\n') + + def test_trickle_3chr_crlf(self): + msg = self._msgobj_trickle('msg_10.txt', 3, '\r\n') + self._validate_msg10_msgobj(msg, '\r\n') + + def test_trickle_3chr_cr(self): + msg = self._msgobj_trickle('msg_10.txt', 3, '\r') + self._validate_msg10_msgobj(msg, '\r') + + def test_trickle_3chr_lf(self): + msg = self._msgobj_trickle('msg_10.txt', 3, '\n') + self._validate_msg10_msgobj(msg, '\n') + class TestBytesGeneratorIdempotentNL(BaseTestBytesGeneratorIdempotent, TestIdempotent): diff --git a/Misc/ACKS b/Misc/ACKS index 35826bd713c0f6..717dd8bfbb88ea 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -879,6 +879,7 @@ Jeffrey C. Jacobs Kevin Jacobs Kjetil Jacobsen Shantanu Jain +Jessica A. 
James Bertrand Janin Geert Jansen Jack Jansen diff --git a/Misc/NEWS.d/next/Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst b/Misc/NEWS.d/next/Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst new file mode 100644 index 00000000000000..80fd2f47385cc3 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst @@ -0,0 +1,5 @@ +Substantially improved memory usage and performance when parsing email text +in :mod:`email`. Primarily reduces memory usage in +:func:`email.message_from_bytes`, :func:`email.message_from_string`, +:class:`email.parser.Parser`, :class:`email.parser.BytesParser`, +:class:`email.parser.FeedParser`, :class:`email.parser.BytesFeedParser` pFad - Phonifier reborn
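A minimal sketch of the chunked feeding this patch optimizes, using only the
public FeedParser API; the message text and the one-character chunk size are
illustrative assumptions, not taken from the patch or its test data:

    from email.feedparser import FeedParser

    # An illustrative non-multipart message.  Large non-multipart bodies
    # exercise the new "dump" fast path: no boundary predicate is ever
    # pushed, so whole chunks can be accumulated without line splitting.
    raw = (
        "From: alice@example.com\r\n"
        "Subject: streaming test\r\n"
        "\r\n"
        "body line one\r\n"
        "body line two\r\n"
    )

    parser = FeedParser()
    # Feed one character at a time, as the TestFeedParserTrickle cases do.
    # Some feeds end between '\r' and '\n'; the parser must treat the split
    # pair as a single line ending rather than emitting a spurious blank
    # line -- the case the _dangling_partial bookkeeping guards against.
    for ch in raw:
        parser.feed(ch)
    msg = parser.close()

    assert msg['subject'] == 'streaming test'
    assert msg.get_payload() == 'body line one\r\nbody line two\r\n'

The trickle tests above assert the same property against msg_10.txt for chunk
sizes 1 through 3 under all three line-ending conventions.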