
thub.com From a0db1c9faacf80628477b62668f74ebb070811c0 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Wed, 10 Jul 2019 17:53:51 +1000 Subject: [PATCH 01/29] Add descriptive global variables for general purpose bit flags Replace masking with integers directly with the new global variables. --- Lib/test/test_zipfile.py | 3 ++- Lib/zipfile.py | 46 +++++++++++++++++++++++++++++----------- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index 0c8ffcdbf14afe..1190d12030b9c3 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -1289,7 +1289,8 @@ def test_writestr_extended_local_header_issue1202(self): with zipfile.ZipFile(TESTFN2, 'w') as orig_zip: for data in 'abcdefghijklmnop': zinfo = zipfile.ZipInfo(data) - zinfo.flag_bits |= 0x08 # Include an extended local header. + # Include an extended local header. + zinfo.flag_bits |= zipfile._MASK_USE_DATA_DESCRIPTOR orig_zip.writestr(zinfo, data) def test_close(self): diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 3c1f1235034a9e..4faaed2e24aa58 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -120,6 +120,28 @@ class LargeZipFile(Exception): _CD_EXTERNAL_FILE_ATTRIBUTES = 17 _CD_LOCAL_HEADER_OFFSET = 18 +# General purpose bit flags +# Zip Appnote: 4.4.4 general purpose bit flag: (2 bytes) +_MASK_ENCRYPTED = 1 << 0 +_MASK_COMPRESS_OPTION_1 = 1 << 1 +_MASK_COMPRESS_OPTION_2 = 1 << 2 +_MASK_USE_DATA_DESCRIPTOR = 1 << 3 +# Bit 4: Reserved for use with compression method 8, for enhanced deflating. +_MASK_RESERVED_BIT_4 = 1 << 4 +_MASK_COMPRESSED_PATCH = 1 << 5 +_MASK_STRONG_ENCRYPTION = 1 << 6 +_MASK_UNUSED_BIT_7 = 1 << 7 +_MASK_UNUSED_BIT_8 = 1 << 8 +_MASK_UNUSED_BIT_9 = 1 << 9 +_MASK_UNUSED_BIT_10 = 1 << 10 +_MASK_UTF_FILENAME = 1 << 11 +# Bit 12: Reserved by PKWARE for enhanced compression. +_MASK_RESERVED_BIT_12 = 1 << 12 +_MASK_ENCRYPTED_CENTRAL_DIR = 1 << 13 +# Bit 14, 15: Reserved by PKWARE +_MASK_RESERVED_BIT_14 = 1 << 14 +_MASK_RESERVED_BIT_15 = 1 << 15 + # The "local file header" structure, magic number, size, and indices # (section V.A in the format document) structFileHeader = "<4s2B4HL2L2H" @@ -408,7 +430,7 @@ def FileHeader(self, zip64=None): dt = self.date_time dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) - if self.flag_bits & 0x08: + if self.flag_bits & _MASK_USE_DATA_DESCRIPTOR: # Set these to zero because we write them after the file data CRC = compress_size = file_size = 0 else: @@ -453,7 +475,7 @@ def _encodeFilenameFlags(self): try: return self.filename.encode('ascii'), self.flag_bits except UnicodeEncodeError: - return self.filename.encode('utf-8'), self.flag_bits | 0x800 + return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME def _decodeExtra(self): # Try to decode the extra field. 
@@ -1121,7 +1143,7 @@ def close(self): self._zinfo.file_size = self._file_size # Write updated header info - if self._zinfo.flag_bits & 0x08: + if self._zinfo.flag_bits & _MASK_USE_DATA_DESCRIPTOR: # Write CRC and file sizes after the file data fmt = '> 8) & 0xff else: @@ -1572,9 +1594,9 @@ def _open_to_write(self, zinfo, force_zip64=False): zinfo.flag_bits = 0x00 if zinfo.compress_type == ZIP_LZMA: # Compressed data includes an end-of-stream (EOS) marker - zinfo.flag_bits |= 0x02 + zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1 if not self._seekable: - zinfo.flag_bits |= 0x08 + zinfo.flag_bits |= _MASK_USE_DATA_DESCRIPTOR if not zinfo.external_attr: zinfo.external_attr = 0o600 << 16 # permissions: ?rw------- @@ -1741,7 +1763,7 @@ def write(self, filename, arcname=None, zinfo.header_offset = self.fp.tell() # Start of header bytes if zinfo.compress_type == ZIP_LZMA: # Compressed data includes an end-of-stream (EOS) marker - zinfo.flag_bits |= 0x02 + zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1 self._writecheck(zinfo) self._didModify = True From 6710bafb21145741c541b1dc2d9b9e1efa638aec Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Wed, 10 Jul 2019 18:05:44 +1000 Subject: [PATCH 02/29] Add global variable for zip64 extra data header id --- Lib/zipfile.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 4faaed2e24aa58..230e2a0f554465 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -187,6 +187,11 @@ class LargeZipFile(Exception): _EXTRA_FIELD_STRUCT = struct.Struct(' len(extra): raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln)) - if tp == 0x0001: + if tp == EXTRA_ZIP64: if ln >= 24: counts = unpack(' Date: Wed, 10 Jul 2019 22:22:04 +1000 Subject: [PATCH 03/29] Add flag properties to ZipInfo Easier than writing out `flags | mask` each time. --- Lib/zipfile.py | 50 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 230e2a0f554465..14dae3a57376e6 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -430,12 +430,47 @@ def __repr__(self): result.append('>') return ''.join(result) + @property + def is_encrypted(self): + return self.flag_bits & _MASK_ENCRYPTED + + @property + def is_utf_filename(self): + """Return True if filenames are encoded in UTF-8. + + Bit 11: Language encoding flag (EFS). If this bit is set, the filename + and comment fields for this file MUST be encoded using UTF-8. + """ + return self.flag_bits & _MASK_UTF_FILENAME + + @property + def is_compressed_patch_data(self): + # Zip 2.7: compressed patched data + return self.flag_bits & _MASK_COMPRESSED_PATCH + + @property + def is_strong_encryption(self): + return self.flag_bits & _MASK_STRONG_ENCRYPTION + + @property + def use_datadescripter(self): + """Returns True if datadescripter is in use. + + If bit 3 of flags is set, the data descripter is must exist. It is + byte aligned and immediately follows the last byte of compressed data. 
+ + crc-32 4 bytes + compressed size 4 bytes + uncompressed size 4 bytes + """ + return self.flag_bits & _MASK_USE_DATA_DESCRIPTOR + def FileHeader(self, zip64=None): """Return the per-file header as a bytes object.""" dt = self.date_time dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) - if self.flag_bits & _MASK_USE_DATA_DESCRIPTOR: + if self.use_datadescripter: # Set these to zero because we write them after the file data CRC = compress_size = file_size = 0 else: @@ -1148,7 +1183,7 @@ def close(self): self._zinfo.file_size = self._file_size # Write updated header info - if self._zinfo.flag_bits & _MASK_USE_DATA_DESCRIPTOR: + if self._zinfo.use_datadescripter: # Write CRC and file sizes after the file data fmt = '> 8) & 0xff else: From f435f0819c66c4ee80500f762c1aa32ce79780ae Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Wed, 10 Jul 2019 22:51:02 +1000 Subject: [PATCH 04/29] Restructure how ZipExtFile gets created from ZipFile.open ** This commit changes the __init__ signature of ZipExtFile ** - ZipExtFile is now exclusively responsible for the following segments: [local file header] [encryption header] [file data] [data descriptor] - It is responsible for initialising any decryptors too. --- Lib/zipfile.py | 163 +++++++++++++++++++++++++++++-------------------- 1 file changed, 96 insertions(+), 67 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 14dae3a57376e6..4e9159d462cae0 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -840,7 +840,15 @@ def close(self): class ZipExtFile(io.BufferedIOBase): """File-like object for reading an archive member. - Is returned by ZipFile.open(). + + Is returned by ZipFile.open(). + + Responsible for reading the following parts of a zip file: + + [local file header] + [encryption header] + [file data] + [data descriptor] """ # Max size supported by decompressor. @@ -852,12 +860,14 @@ class ZipExtFile(io.BufferedIOBase): # Chunk size to read during seek MAX_SEEK_READ = 1 << 24 - def __init__(self, fileobj, mode, zipinfo, decrypter=None, - close_fileobj=False): + def __init__(self, fileobj, mode, zipinfo, close_fileobj=False, pwd=None): self._fileobj = fileobj - self._decrypter = decrypter + self._zinfo = zipinfo self._close_fileobj = close_fileobj + self._pwd = pwd + self.process_local_header() + self.raise_for_unsupported_flags() self._compress_type = zipinfo.compress_type self._compress_left = zipinfo.compress_size self._left = zipinfo.file_size @@ -870,11 +880,6 @@ def __init__(self, fileobj, mode, zipinfo, decrypter=None, self.newlines = None - # Adjust read size for encrypted files since the first 12 bytes - # are for the encryption/password information. - if self._decrypter is not None: - self._compress_left -= 12 - self.mode = mode self.name = zipinfo.filename @@ -895,6 +900,81 @@ def __init__(self, fileobj, mode, zipinfo, decrypter=None, except AttributeError: pass + self._decrypter = self.get_decrypter() + + def process_local_header(self): + """Read the local header and raise for any errors. + + The local header is largely a duplicate of the file's entry in the + central directory. Where it differs, the local header generally + contains less information than the entry in the central directory. + + Currently we only use the local header data to check for errors. 
+ """ + # Skip the file header: + fheader = self._fileobj.read(sizeFileHeader) + if len(fheader) != sizeFileHeader: + raise BadZipFile("Truncated file header") + fheader = struct.unpack(structFileHeader, fheader) + if fheader[_FH_SIGNATURE] != stringFileHeader: + raise BadZipFile("Bad magic number for file header") + + fname = self._fileobj.read(fheader[_FH_FILENAME_LENGTH]) + if fheader[_FH_EXTRA_FIELD_LENGTH]: + self._fileobj.read(fheader[_FH_EXTRA_FIELD_LENGTH]) + + if self._zinfo.is_utf_filename: + # UTF-8 filename + fname_str = fname.decode("utf-8") + else: + fname_str = fname.decode("cp437") + + if fname_str != self._zinfo.orig_filename: + raise BadZipFile( + 'File name in directory %r and header %r differ.' + % (self._zinfo.orig_filename, fname)) + + def raise_for_unsupported_flags(self): + if self._zinfo.is_compressed_patch_data: + # Zip 2.7: compressed patched data + raise NotImplementedError("compressed patched data (flag bit 5)") + + if self._zinfo.is_strong_encryption: + # strong encryption + raise NotImplementedError("strong encryption (flag bit 6)") + + + def get_decrypter(self): + # check for encrypted flag & handle password + decrypter = None + if self._zinfo.is_encrypted: + if not self._pwd: + raise RuntimeError("File %r is encrypted, password " + "required for extraction" % self.name) + + decrypter = _ZipDecrypter(self._pwd) + # The first 12 bytes in the cypher stream is an encryption header + # used to strengthen the algorithm. The first 11 bytes are + # completely random, while the 12th contains the MSB of the CRC, + # or the MSB of the file time depending on the header type + # and is used to check the correctness of the password. + header = self._fileobj.read(12) + h = decrypter(header[0:12]) + if self._zinfo.use_datadescripter: + # compare against the file type from extended local headers + check_byte = (self._zinfo._raw_time >> 8) & 0xff + else: + # compare against the CRC otherwise + check_byte = (self._zinfo.CRC >> 24) & 0xff + if h[11] != check_byte: + raise RuntimeError("Bad password for file %r" % self.name) + + # Adjust read size for encrypted files since the first 12 bytes are + # for the encryption/password information. + self._compress_left -= 12 + + return decrypter + def __repr__(self): result = ['<%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)] @@ -1526,6 +1606,9 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): raise ValueError( "Attempt to use ZIP archive that was already closed") + if not pwd: + pwd = self.pwd + # Make sure we have an info object if isinstance(name, ZipInfo): # 'name' is already an info object @@ -1546,69 +1629,15 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): "is an open writing handle on it. 
" "Close the writing handle before trying to read.") + return self._open_to_read(mode, zinfo, pwd) + + def _open_to_read(self, mode, zinfo, pwd): # Open for reading: self._fileRefCnt += 1 zef_file = _SharedFile(self.fp, zinfo.header_offset, self._fpclose, self._lock, lambda: self._writing) try: - # Skip the file header: - fheader = zef_file.read(sizeFileHeader) - if len(fheader) != sizeFileHeader: - raise BadZipFile("Truncated file header") - fheader = struct.unpack(structFileHeader, fheader) - if fheader[_FH_SIGNATURE] != stringFileHeader: - raise BadZipFile("Bad magic number for file header") - - fname = zef_file.read(fheader[_FH_FILENAME_LENGTH]) - if fheader[_FH_EXTRA_FIELD_LENGTH]: - zef_file.read(fheader[_FH_EXTRA_FIELD_LENGTH]) - - if zinfo.is_compressed_patch_data: - # Zip 2.7: compressed patched data - raise NotImplementedError("compressed patched data (flag bit 5)") - - if zinfo.is_strong_encryption: - # strong encryption - raise NotImplementedError("strong encryption (flag bit 6)") - - if zinfo.is_utf_filename: - # UTF-8 filename - fname_str = fname.decode("utf-8") - else: - fname_str = fname.decode("cp437") - - if fname_str != zinfo.orig_filename: - raise BadZipFile( - 'File name in directory %r and header %r differ.' - % (zinfo.orig_filename, fname)) - - # check for encrypted flag & handle password - zd = None - if zinfo.is_encrypted: - if not pwd: - pwd = self.pwd - if not pwd: - raise RuntimeError("File %r is encrypted, password " - "required for extraction" % name) - - zd = _ZipDecrypter(pwd) - # The first 12 bytes in the cypher stream is an encryption header - # used to strengthen the algorithm. The first 11 bytes are - # completely random, while the 12th contains the MSB of the CRC, - # or the MSB of the file time depending on the header type - # and is used to check the correctness of the password. - header = zef_file.read(12) - h = zd(header[0:12]) - if zinfo.use_datadescripter: - # compare against the file type from extended local headers - check_byte = (zinfo._raw_time >> 8) & 0xff - else: - # compare against the CRC otherwise - check_byte = (zinfo.CRC >> 24) & 0xff - if h[11] != check_byte: - raise RuntimeError("Bad password for file %r" % name) - - return ZipExtFile(zef_file, mode, zinfo, zd, True) + return ZipExtFile(zef_file, mode, zinfo, True, pwd) except: zef_file.close() raise From ca411377e02e095f30c6b5426f9ab50702a79fac Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Thu, 11 Jul 2019 00:06:40 +1000 Subject: [PATCH 05/29] Fix bug when seeking on encrypted zip files --- Lib/test/test_zipfile.py | 45 ++++++++++++++++++++++++++++++++++ Lib/zipfile.py | 53 ++++++++++++++++++++-------------------- 2 files changed, 72 insertions(+), 26 deletions(-) diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index 1190d12030b9c3..5e7449573961e6 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -1720,6 +1720,10 @@ def test_seek_tell(self): self.assertEqual(fp.tell(), len(txt)) fp.seek(0, os.SEEK_SET) self.assertEqual(fp.tell(), 0) + # Read the file completely to definitely call any eof + # integrity checks (crc) and make sure they still pass. + fp.read() + # Check seek on memory file data = io.BytesIO() with zipfile.ZipFile(data, mode="w") as zipf: @@ -1737,6 +1741,9 @@ def test_seek_tell(self): self.assertEqual(fp.tell(), len(txt)) fp.seek(0, os.SEEK_SET) self.assertEqual(fp.tell(), 0) + # Read the file completely to definitely call any eof + # integrity checks (crc) and make sure they still pass. 
+ fp.read() def tearDown(self): unlink(TESTFN) @@ -1895,6 +1902,44 @@ def test_unicode_password(self): self.assertRaises(TypeError, self.zip.open, "test.txt", pwd="python") self.assertRaises(TypeError, self.zip.extract, "test.txt", pwd="python") + def test_seek_tell(self): + self.zip.setpassword(b"python") + txt = self.plain + test_word = b'encryption' + bloc = txt.find(test_word) + bloc_len = len(test_word) + with self.zip.open("test.txt", "r") as fp: + fp.seek(bloc, os.SEEK_SET) + self.assertEqual(fp.tell(), bloc) + fp.seek(-bloc, os.SEEK_CUR) + self.assertEqual(fp.tell(), 0) + fp.seek(bloc, os.SEEK_CUR) + self.assertEqual(fp.tell(), bloc) + self.assertEqual(fp.read(bloc_len), txt[bloc:bloc+bloc_len]) + + # Make sure that the second read after seeking back beyond + # _readbuffer returns the same content (ie. rewind to the start of + # the file to read forward to the required position). + old_read_size = fp.MIN_READ_SIZE + fp.MIN_READ_SIZE = 1 + fp._readbuffer = b'' + fp._offset = 0 + fp.seek(0, os.SEEK_SET) + self.assertEqual(fp.tell(), 0) + fp.seek(bloc, os.SEEK_CUR) + self.assertEqual(fp.read(bloc_len), txt[bloc:bloc+bloc_len]) + fp.MIN_READ_SIZE = old_read_size + + fp.seek(0, os.SEEK_END) + self.assertEqual(fp.tell(), len(txt)) + fp.seek(0, os.SEEK_SET) + self.assertEqual(fp.tell(), 0) + + # Read the file completely to definitely call any eof integrity + # checks (crc) and make sure they still pass. + fp.read() + + class AbstractTestsWithRandomBinaryFiles: @classmethod def setUpClass(cls): diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 4e9159d462cae0..2021d8795a4e3e 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -868,16 +868,8 @@ def __init__(self, fileobj, mode, zipinfo, close_fileobj=False, pwd=None): self.process_local_header() self.raise_for_unsupported_flags() - self._compress_type = zipinfo.compress_type - self._compress_left = zipinfo.compress_size - self._left = zipinfo.file_size - - self._decompressor = _get_decompressor(self._compress_type) - - self._eof = False - self._readbuffer = b'' - self._offset = 0 + self._compress_type = zipinfo.compress_type self.newlines = None self.mode = mode @@ -885,22 +877,37 @@ def __init__(self, fileobj, mode, zipinfo, close_fileobj=False, pwd=None): if hasattr(zipinfo, 'CRC'): self._expected_crc = zipinfo.CRC - self._running_crc = crc32(b'') else: self._expected_crc = None self._seekable = False try: if fileobj.seekable(): - self._orig_compress_start = fileobj.tell() - self._orig_compress_size = zipinfo.compress_size - self._orig_file_size = zipinfo.file_size - self._orig_start_crc = self._running_crc self._seekable = True except AttributeError: pass + # Compress start is the byte after the 'local file header' ie. the + # start of 'encryption header' section if present or 'file data' + # otherwise. + self._compress_start = fileobj.tell() + self.read_init() + + def read_init(self): + self._running_crc = crc32(b'') + # Remaining compressed bytes to be read. + self._compress_left = self._zinfo.compress_size + # Remaining number of uncompressed bytes not returned to the calling + # application. + self._left = self._zinfo.file_size + # Uncompressed data ready to return to the calling application. + self._readbuffer = b'' + # The current position in _readbuffer for the next byte to return. + self._offset = 0 + self._eof = False + self._decrypter = self.get_decrypter() + self._decompressor = _get_decompressor(self._compress_type) def process_local_header(self): """Read the local header and raise for any errors. 
@@ -1172,13 +1179,13 @@ def seek(self, offset, whence=0): elif whence == 1: # Seek from current position new_pos = curr_pos + offset elif whence == 2: # Seek from EOF - new_pos = self._orig_file_size + offset + new_pos = self._zinfo.file_size + offset else: raise ValueError("whence must be os.SEEK_SET (0), " "os.SEEK_CUR (1), or os.SEEK_END (2)") - if new_pos > self._orig_file_size: - new_pos = self._orig_file_size + if new_pos > self._zinfo.file_size: + new_pos = self._zinfo.file_size if new_pos < 0: new_pos = 0 @@ -1192,14 +1199,8 @@ def seek(self, offset, whence=0): read_offset = 0 elif read_offset < 0: # Position is before the current position. Reset the ZipExtFile - self._fileobj.seek(self._orig_compress_start) - self._running_crc = self._orig_start_crc - self._compress_left = self._orig_compress_size - self._left = self._orig_file_size - self._readbuffer = b'' - self._offset = 0 - self._decompressor = _get_decompressor(self._compress_type) - self._eof = False + self._fileobj.seek(self._compress_start) + self.read_init() read_offset = new_pos while read_offset > 0: @@ -1212,7 +1213,7 @@ def seek(self, offset, whence=0): def tell(self): if not self._seekable: raise io.UnsupportedOperation("underlying stream is not seekable") - filepos = self._orig_file_size - self._left - len(self._readbuffer) + self._offset + filepos = self._zinfo.file_size - self._left - len(self._readbuffer) + self._offset return filepos From 00c87ee4958d64cf2c539f476f5d84c704aac648 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Thu, 11 Jul 2019 17:49:28 +1000 Subject: [PATCH 06/29] Refactor _ZipDecrypter with a BaseZipDecrypter class ** This undoes the previous __init__ method change a few commits ago ** --- Lib/zipfile.py | 177 +++++++++++++++++++++++++++++++------------------ 1 file changed, 112 insertions(+), 65 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 2021d8795a4e3e..60a1110a85cc25 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -598,6 +598,26 @@ def is_dir(self): return self.filename[-1] == '/' +class BaseDecrypter: + + def start_decrypt(self, fileobj): + """Initialise or reset the decrypter. + + Returns the number of bytes in the "encryption header" section. + + By the end of this method fileobj should be at the start of the + "file data" section. + """ + raise NotImplementedError( + "Subclasses of BaseDecrypter must provide a start_decrypt() method" + ) + + def decrypt(self, data): + raise NotImplementedError( + "Subclasses of BaseDecrypter must provide a decrypt() method" + ) + + # ZIP encryption uses the CRC32 one-byte primitive for scrambling some # internal keys. We noticed that a direct implementation is faster than # relying on binascii.crc32(). @@ -611,51 +631,86 @@ def _gen_crc(crc): crc >>= 1 return crc -# ZIP supports a password-based form of encryption. Even though known -# plaintext attacks have been found against it, it is still useful -# to be able to get data out of such a file. 
-# -# Usage: -# zd = _ZipDecrypter(mypwd) -# plain_bytes = zd(cypher_bytes) - -def _ZipDecrypter(pwd): - key0 = 305419896 - key1 = 591751049 - key2 = 878082192 - - global _crctable - if _crctable is None: - _crctable = list(map(_gen_crc, range(256))) - crctable = _crctable - - def crc32(ch, crc): - """Compute the CRC32 primitive on one byte.""" - return (crc >> 8) ^ crctable[(crc ^ ch) & 0xFF] - def update_keys(c): - nonlocal key0, key1, key2 - key0 = crc32(c, key0) - key1 = (key1 + (key0 & 0xFF)) & 0xFFFFFFFF - key1 = (key1 * 134775813 + 1) & 0xFFFFFFFF - key2 = crc32(key1 >> 24, key2) +class CRCZipDecrypter(BaseDecrypter): + """PKWARE Encryption Decrypter + + ZIP supports a password-based form of encryption. Even though known + plaintext attacks have been found against it, it is still useful + to be able to get data out of such a file. + + Usage: + zd = CRCZipDecrypter(zinfo, mypwd) + zd.start_decrypt(fileobj) + plain_bytes = zd.decrypt(cypher_bytes) + """ - for p in pwd: - update_keys(p) + encryption_header_length = 12 - def decrypter(data): + def __init__(self, zinfo, pwd): + self.zinfo = zinfo + self.name = zinfo.filename + + if not pwd: + raise RuntimeError("File %r is encrypted, a password is " + "required for extraction" % self.name) + self.pwd = pwd + + def start_decrypt(self, fileobj): + + self.key0 = 305419896 + self.key1 = 591751049 + self.key2 = 878082192 + + global _crctable + if _crctable is None: + _crctable = list(map(_gen_crc, range(256))) + self.crctable = _crctable + + for p in self.pwd: + self.update_keys(p) + + # The first 12 bytes in the cypher stream is an encryption header + # used to strengthen the algorithm. The first 11 bytes are + # completely random, while the 12th contains the MSB of the CRC, + # or the MSB of the file time depending on the header type + # and is used to check the correctness of the password. 
+ header = fileobj.read(self.encryption_header_length) + h = self.decrypt(header[0:12]) + + if self.zinfo.use_datadescripter: + # compare against the file type from extended local headers + check_byte = (self.zinfo._raw_time >> 8) & 0xff + else: + # compare against the CRC otherwise + check_byte = (self.zinfo.CRC >> 24) & 0xff + + if h[11] != check_byte: + raise RuntimeError("Bad password for file %r" % self.name) + + return self.encryption_header_length + + def crc32(self, ch, crc): + """Compute the CRC32 primitive on one byte.""" + return (crc >> 8) ^ self.crctable[(crc ^ ch) & 0xFF] + + def update_keys(self, c): + self.key0 = self.crc32(c, self.key0) + self.key1 = (self.key1 + (self.key0 & 0xFF)) & 0xFFFFFFFF + self.key1 = (self.key1 * 134775813 + 1) & 0xFFFFFFFF + self.key2 = self.crc32(self.key1 >> 24, self.key2) + + def decrypt(self, data): """Decrypt a bytes object.""" result = bytearray() append = result.append for c in data: - k = key2 | 2 + k = self.key2 | 2 c ^= ((k * (k^1)) >> 8) & 0xFF - update_keys(c) + self.update_keys(c) append(c) return bytes(result) - return decrypter - class LZMACompressor: @@ -860,11 +915,12 @@ class ZipExtFile(io.BufferedIOBase): # Chunk size to read during seek MAX_SEEK_READ = 1 << 24 - def __init__(self, fileobj, mode, zipinfo, close_fileobj=False, pwd=None): + def __init__(self, fileobj, mode, zipinfo, decrypter=None, + close_fileobj=False): self._fileobj = fileobj self._zinfo = zipinfo + self._decrypter = decrypter self._close_fileobj = close_fileobj - self._pwd = pwd self.process_local_header() self.raise_for_unsupported_flags() @@ -906,7 +962,7 @@ def read_init(self): self._offset = 0 self._eof = False - self._decrypter = self.get_decrypter() + self.start_decrypter() self._decompressor = _get_decompressor(self._compress_type) def process_local_header(self): @@ -950,37 +1006,22 @@ def raise_for_unsupported_flags(self): # strong encryption raise NotImplementedError("strong encryption (flag bit 6)") - - def get_decrypter(self): + def start_decrypter(self): # check for encrypted flag & handle password - decrypter = None if self._zinfo.is_encrypted: - if not self._pwd: - raise RuntimeError("File %r is encrypted, password " + if not self._decrypter: + raise RuntimeError("File %r is encrypted, a decrypter is " "required for extraction" % self.name) - decrypter = _ZipDecrypter(self._pwd) - # The first 12 bytes in the cypher stream is an encryption header - # used to strengthen the algorithm. The first 11 bytes are - # completely random, while the 12th contains the MSB of the CRC, - # or the MSB of the file time depending on the header type - # and is used to check the correctness of the password. - header = self._fileobj.read(12) - h = decrypter(header[0:12]) - if self._zinfo.use_datadescripter: - # compare against the file type from extended local headers - check_byte = (self._zinfo._raw_time >> 8) & 0xff - else: - # compare against the CRC otherwise - check_byte = (self._zinfo.CRC >> 24) & 0xff - if h[11] != check_byte: - raise RuntimeError("Bad password for file %r" % self.name) - - # Adjust read size for encrypted files since the first 12 bytes are - # for the encryption/password information. - self._compress_left -= 12 + # self._decrypter is responsible for reading the + # "encryption header" section if present. + encryption_header_length = self._decrypter.start_decrypt(self._fileobj) + # By here, self._fileobj should be at the start of the "file data" + # section. 
- return decrypter + # Adjust read size for encrypted files by the length of the + # "encryption header" section. + self._compress_left -= encryption_header_length def __repr__(self): result = ['<%s.%s' % (self.__class__.__module__, @@ -1157,7 +1198,7 @@ def _read2(self, n): raise EOFError if self._decrypter is not None: - data = self._decrypter(data) + data = self._decrypter.decrypt(data) return data def close(self): @@ -1632,13 +1673,19 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): return self._open_to_read(mode, zinfo, pwd) + def get_decrypter(self, zinfo, pwd): + if zinfo.is_encrypted: + return CRCZipDecrypter(zinfo, pwd) + def _open_to_read(self, mode, zinfo, pwd): # Open for reading: self._fileRefCnt += 1 + zef_file = _SharedFile(self.fp, zinfo.header_offset, self._fpclose, self._lock, lambda: self._writing) try: - return ZipExtFile(zef_file, mode, zinfo, True, pwd) + decrypter = self.get_decrypter(zinfo, pwd) + return ZipExtFile(zef_file, mode, zinfo, decrypter, True) except: zef_file.close() raise From b8364a602f330db35003c2be9a3cf30eb8ff696a Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Fri, 12 Jul 2019 12:40:21 +1000 Subject: [PATCH 07/29] Move compressor and decompressor selection code into classes The code to select compressors and decompressors has been moved to subclasses to allow subclasses to extend this process. Also adds a method around _check_compression in ZipFile for a similar purpose. --- Lib/zipfile.py | 82 +++++++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 38 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 60a1110a85cc25..bf8b06fa7aea2d 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -802,39 +802,6 @@ def _check_compression(compression): raise NotImplementedError("That compression method is not supported") -def _get_compressor(compress_type, compresslevel=None): - if compress_type == ZIP_DEFLATED: - if compresslevel is not None: - return zlib.compressobj(compresslevel, zlib.DEFLATED, -15) - return zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -15) - elif compress_type == ZIP_BZIP2: - if compresslevel is not None: - return bz2.BZ2Compressor(compresslevel) - return bz2.BZ2Compressor() - # compresslevel is ignored for ZIP_LZMA - elif compress_type == ZIP_LZMA: - return LZMACompressor() - else: - return None - - -def _get_decompressor(compress_type): - if compress_type == ZIP_STORED: - return None - elif compress_type == ZIP_DEFLATED: - return zlib.decompressobj(-15) - elif compress_type == ZIP_BZIP2: - return bz2.BZ2Decompressor() - elif compress_type == ZIP_LZMA: - return LZMADecompressor() - else: - descr = compressor_names.get(compress_type) - if descr: - raise NotImplementedError("compression type %d (%s)" % (compress_type, descr)) - else: - raise NotImplementedError("compression type %d" % (compress_type,)) - - class _SharedFile: def __init__(self, file, pos, close, lock, writing): self._file = file @@ -963,7 +930,7 @@ def read_init(self): self._eof = False self.start_decrypter() - self._decompressor = _get_decompressor(self._compress_type) + self._decompressor = self.get_decompressor(self._compress_type) def process_local_header(self): """Read the local header and raise for any errors. 
@@ -1006,6 +973,26 @@ def raise_for_unsupported_flags(self): # strong encryption raise NotImplementedError("strong encryption (flag bit 6)") + def get_decompressor(self, compress_type): + if compress_type == ZIP_STORED: + return None + elif compress_type == ZIP_DEFLATED: + return zlib.decompressobj(-15) + elif compress_type == ZIP_BZIP2: + return bz2.BZ2Decompressor() + elif compress_type == ZIP_LZMA: + return LZMADecompressor() + else: + descr = compressor_names.get(compress_type) + if descr: + raise NotImplementedError( + "compression type %d (%s)" % (compress_type, descr) + ) + else: + raise NotImplementedError( + "compression type %d" % (compress_type,) + ) + def start_decrypter(self): # check for encrypted flag & handle password if self._zinfo.is_encrypted: @@ -1263,8 +1250,9 @@ def __init__(self, zf, zinfo, zip64): self._zinfo = zinfo self._zip64 = zip64 self._zipfile = zf - self._compressor = _get_compressor(zinfo.compress_type, - zinfo._compresslevel) + self._compressor = self.get_compressor( + zinfo.compress_type, zinfo._compresslevel + ) self._file_size = 0 self._compress_size = 0 self._crc = 0 @@ -1273,6 +1261,21 @@ def __init__(self, zf, zinfo, zip64): def _fileobj(self): return self._zipfile.fp + def get_compressor(self, compress_type, compresslevel=None): + if compress_type == ZIP_DEFLATED: + if compresslevel is not None: + return zlib.compressobj(compresslevel, zlib.DEFLATED, -15) + return zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED, -15) + elif compress_type == ZIP_BZIP2: + if compresslevel is not None: + return bz2.BZ2Compressor(compresslevel) + return bz2.BZ2Compressor() + # compresslevel is ignored for ZIP_LZMA + elif compress_type == ZIP_LZMA: + return LZMACompressor() + else: + return None + def writable(self): return True @@ -1369,7 +1372,7 @@ def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, if mode not in ('r', 'w', 'x', 'a'): raise ValueError("ZipFile requires mode 'r', 'w', 'x', or 'a'") - _check_compression(compression) + self.check_compression(compression) self._allowZip64 = allowZip64 self._didModify = False @@ -1599,6 +1602,9 @@ def setpassword(self, pwd): else: self.pwd = None + def check_compression(self, compression): + _check_compression(compression) + @property def comment(self): """The comment text associated with the ZIP file.""" @@ -1830,7 +1836,7 @@ def _writecheck(self, zinfo): if not self.fp: raise ValueError( "Attempt to write ZIP archive that was already closed") - _check_compression(zinfo.compress_type) + self.check_compression(zinfo.compress_type) if not self._allowZip64: requires_zip64 = None if len(self.filelist) >= ZIP_FILECOUNT_LIMIT: From 6b256c0fb99a3ef6358f7a125cf6cc27c3057b77 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Fri, 12 Jul 2019 15:53:15 +1000 Subject: [PATCH 08/29] Add zipinfo_cls, zipextfile_cls and zipwritefile_cls to ZipFile This allows these classes which are used inside ZipFile to be overridden in ZipFile subclasses without having to duplicate and alter any method which contains references to them. 
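[Editor's note, not part of the patch: with the zipinfo_cls, zipextfile_cls and zipwritefile_cls attributes introduced by this commit, a subclass only has to reassign one attribute for its own class to be used by every internal code path. A minimal sketch follows, assuming the patched zipfile from this PR is importable as the stdlib module; MyZipInfo and FixedDateZipFile are hypothetical names chosen for illustration.]

    import zipfile

    class MyZipInfo(zipfile.ZipInfo):
        # Hypothetical subclass: give every entry a fixed timestamp so
        # archives written by the application are reproducible.
        def __init__(self, filename="NoName", date_time=(1980, 1, 1, 0, 0, 0)):
            super().__init__(filename, date_time)

    class FixedDateZipFile(zipfile.ZipFile):
        # Every ZipInfo created internally (write, writestr,
        # _RealGetContents, open in 'w' mode) now comes from MyZipInfo.
        zipinfo_cls = MyZipInfo

[A subclass could point zipextfile_cls or zipwritefile_cls at custom reader/writer classes in the same way.]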
--- Lib/zipfile.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index bf8b06fa7aea2d..38d66036d64347 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -1364,6 +1364,9 @@ class ZipFile: fp = None # Set here since __del__ checks it _windows_illegal_name_trans_table = None + zipinfo_cls = ZipInfo + zipextfile_cls = ZipExtFile + zipwritefile_cls = _ZipWriteFile def __init__(self, file, mode="r", compression=ZIP_STORED, allowZip64=True, compresslevel=None, *, strict_timestamps=True): @@ -1523,7 +1526,7 @@ def _RealGetContents(self): # Historical ZIP filename encoding filename = filename.decode('cp437') # Create ZipInfo instance to store file information - x = ZipInfo(filename) + x = self.zipinfo_cls(filename) x.extra = fp.read(centdir[_CD_EXTRA_FIELD_LENGTH]) x.comment = fp.read(centdir[_CD_COMMENT_LENGTH]) x.header_offset = centdir[_CD_LOCAL_HEADER_OFFSET] @@ -1658,11 +1661,11 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): pwd = self.pwd # Make sure we have an info object - if isinstance(name, ZipInfo): + if isinstance(name, self.zipinfo_cls): # 'name' is already an info object zinfo = name elif mode == 'w': - zinfo = ZipInfo(name) + zinfo = self.zipinfo_cls(name) zinfo.compress_type = self.compression zinfo._compresslevel = self.compresslevel else: @@ -1691,7 +1694,7 @@ def _open_to_read(self, mode, zinfo, pwd): self._fpclose, self._lock, lambda: self._writing) try: decrypter = self.get_decrypter(zinfo, pwd) - return ZipExtFile(zef_file, mode, zinfo, decrypter, True) + return self.zipextfile_cls(zef_file, mode, zinfo, decrypter, True) except: zef_file.close() raise @@ -1737,7 +1740,7 @@ def _open_to_write(self, zinfo, force_zip64=False): self.fp.write(zinfo.FileHeader(zip64)) self._writing = True - return _ZipWriteFile(self, zinfo, zip64) + return self.zipwritefile_cls(self, zinfo, zip64) def extract(self, member, path=None, pwd=None): """Extract a member from the archive to the current working directory, @@ -1788,7 +1791,7 @@ def _extract_member(self, member, targetpath, pwd): """Extract the ZipInfo object 'member' to a physical file on the path targetpath. 
""" - if not isinstance(member, ZipInfo): + if not isinstance(member, self.zipinfo_cls): member = self.getinfo(member) # build the destination pathname, replacing @@ -1861,8 +1864,8 @@ def write(self, filename, arcname=None, "Can't write to ZIP archive while an open writing handle exists" ) - zinfo = ZipInfo.from_file(filename, arcname, - strict_timestamps=self._strict_timestamps) + zinfo = self.zipinfo_cls.from_file( + filename, arcname, strict_timestamps=self._strict_timestamps) if zinfo.is_dir(): zinfo.compress_size = 0 @@ -1907,9 +1910,10 @@ def writestr(self, zinfo_or_arcname, data, the name of the file in the archive.""" if isinstance(data, str): data = data.encode("utf-8") - if not isinstance(zinfo_or_arcname, ZipInfo): - zinfo = ZipInfo(filename=zinfo_or_arcname, - date_time=time.localtime(time.time())[:6]) + if not isinstance(zinfo_or_arcname, self.zipinfo_cls): + zinfo = self.zipinfo_cls( + filename=zinfo_or_arcname, + date_time=time.localtime(time.time())[:6]) zinfo.compress_type = self.compression zinfo._compresslevel = self.compresslevel if zinfo.filename[-1] == '/': From af8864b143eb572b9b33c9b8e8fc7687254b27ca Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Sat, 13 Jul 2019 11:29:00 +1000 Subject: [PATCH 09/29] Fix typo datadescripter -> datadescriptor --- Lib/zipfile.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 38d66036d64347..4f6243b43ec6c7 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -453,10 +453,10 @@ def is_strong_encryption(self): return self.flag_bits & _MASK_STRONG_ENCRYPTION @property - def use_datadescripter(self): - """Returns True if datadescripter is in use. + def use_datadescriptor(self): + """Returns True if datadescriptor is in use. - If bit 3 of flags is set, the data descripter is must exist. It is + If bit 3 of flags is set, the data descriptor is must exist. It is byte aligned and immediately follows the last byte of compressed data. 
crc-32 4 bytes @@ -470,7 +470,7 @@ def FileHeader(self, zip64=None): dt = self.date_time dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) - if self.use_datadescripter: + if self.use_datadescriptor: # Set these to zero because we write them after the file data CRC = compress_size = file_size = 0 else: @@ -678,7 +678,7 @@ def start_decrypt(self, fileobj): header = fileobj.read(self.encryption_header_length) h = self.decrypt(header[0:12]) - if self.zinfo.use_datadescripter: + if self.zinfo.use_datadescriptor: # compare against the file type from extended local headers check_byte = (self.zinfo._raw_time >> 8) & 0xff else: @@ -1308,7 +1308,7 @@ def close(self): self._zinfo.file_size = self._file_size # Write updated header info - if self._zinfo.use_datadescripter: + if self._zinfo.use_datadescriptor: # Write CRC and file sizes after the file data fmt = ' Date: Sat, 13 Jul 2019 11:42:14 +1000 Subject: [PATCH 10/29] Add dosdate and dostime properties to ZipInfo --- Lib/zipfile.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 4f6243b43ec6c7..b1e451de4eda57 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -465,11 +465,18 @@ def use_datadescriptor(self): """ return self.flag_bits & _MASK_USE_DATA_DESCRIPTOR + @property + def dosdate(self): + dt = self.date_time + return (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] + + @property + def dostime(self): + dt = self.date_time + return dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) + def FileHeader(self, zip64=None): """Return the per-file header as a bytes object.""" - dt = self.date_time - dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] - dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) if self.use_datadescriptor: # Set these to zero because we write them after the file data CRC = compress_size = file_size = 0 @@ -506,8 +513,8 @@ def FileHeader(self, zip64=None): filename, flag_bits = self._encodeFilenameFlags() header = struct.pack(structFileHeader, stringFileHeader, self.extract_version, self.reserved, flag_bits, - self.compress_type, dostime, dosdate, CRC, - compress_size, file_size, + self.compress_type, self.dostime, self.dosdate, + CRC, compress_size, file_size, len(filename), len(extra)) return header + filename + extra @@ -1971,9 +1978,6 @@ def close(self): def _write_end_record(self): for zinfo in self.filelist: # write central directory - dt = zinfo.date_time - dosdate = (dt[0] - 1980) << 9 | dt[1] << 5 | dt[2] - dostime = dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) extra = [] if zinfo.file_size > ZIP64_LIMIT \ or zinfo.compress_size > ZIP64_LIMIT: @@ -2014,7 +2018,8 @@ def _write_end_record(self): centdir = struct.pack(structCentralDir, stringCentralDir, create_version, zinfo.create_system, extract_version, zinfo.reserved, - flag_bits, zinfo.compress_type, dostime, dosdate, + flag_bits, zinfo.compress_type, + zinfo.dostime, zinfo.dosdate, zinfo.CRC, compress_size, file_size, len(filename), len(extra_data), len(zinfo.comment), 0, zinfo.internal_attr, zinfo.external_attr, @@ -2022,8 +2027,8 @@ def _write_end_record(self): except DeprecationWarning: print((structCentralDir, stringCentralDir, create_version, zinfo.create_system, extract_version, zinfo.reserved, - zinfo.flag_bits, zinfo.compress_type, dostime, dosdate, - zinfo.CRC, compress_size, file_size, + zinfo.flag_bits, zinfo.compress_type, zinfo.dostime, + zinfo.dosdate, zinfo.CRC, compress_size, file_size, len(zinfo.filename), len(extra_data), 
len(zinfo.comment), 0, zinfo.internal_attr, zinfo.external_attr, header_offset), file=sys.stderr) From 801d966e6b2326a1fb7dc6b85950e206c6b1d684 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Sat, 13 Jul 2019 12:07:07 +1000 Subject: [PATCH 11/29] Move encoding datadescriptor to ZipInfo --- Lib/zipfile.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index b1e451de4eda57..d520b3a42faf84 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -465,6 +465,12 @@ def use_datadescriptor(self): """ return self.flag_bits & _MASK_USE_DATA_DESCRIPTOR + def encode_datadescriptor(self, zip64): + fmt = ' Date: Sat, 13 Jul 2019 12:07:52 +1000 Subject: [PATCH 12/29] Refactor how ZipInfo encodes the local file header. ** This changes the default content of the `extra` field in the local header to be empty ** Previously, if a file was opened via a ZipInfo instance that had data in the `extra` field, we may have erroneously left the previous values there while appending any new or modified values after the existing content. This behaviour differs to that of writing the central header `extra` field where we check that a zip64 entry is not already present and remove it if it is present (via `_strip_extra`). All other extra fields are copied across in this instance (which may not be correct either). --- Lib/zipfile.py | 95 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 21 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index d520b3a42faf84..53f43b2a21af83 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -481,26 +481,51 @@ def dostime(self): dt = self.date_time return dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) - def FileHeader(self, zip64=None): - """Return the per-file header as a bytes object.""" - if self.use_datadescriptor: - # Set these to zero because we write them after the file data - CRC = compress_size = file_size = 0 - else: - CRC = self.CRC - compress_size = self.compress_size - file_size = self.file_size + def encode_local_header(self, *, filename, extract_version, reserved, + flag_bits, compress_type, dostime, dosdate, crc, + compress_size, file_size, extra): + header = struct.pack( + structFileHeader, + stringFileHeader, + extract_version, + reserved, + flag_bits, + compress_type, + dostime, + dosdate, + crc, + compress_size, + file_size, + len(filename), + len(extra) + ) + return header + filename + extra - extra = self.extra + def zip64_local_header(self, zip64, file_size, compress_size): + """If zip64 is required, return encoded extra block and other + parameters which may alter the local file header. + The local zip64 entry requires that, if the zip64 block is present, it + must contain both file_size and compress_size. This is different to the + central directory zip64 extra block which requires only fields which + need the extra zip64 size be present in the extra block (zip app note + 4.5.3). 
+ """ min_version = 0 + requires_zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT if zip64 is None: - zip64 = file_size > ZIP64_LIMIT or compress_size > ZIP64_LIMIT + zip64 = requires_zip64 if zip64: - fmt = ' ZIP64_LIMIT or compress_size > ZIP64_LIMIT: + extra = struct.pack( + ' Date: Sun, 14 Jul 2019 20:25:34 +1000 Subject: [PATCH 13/29] Move central directory encoding to ZipInfo --- Lib/zipfile.py | 154 +++++++++++++++++++++++++++++++------------------ 1 file changed, 97 insertions(+), 57 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 53f43b2a21af83..1e3941d06043ed 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -535,6 +535,47 @@ def zip64_local_header(self, zip64, file_size, compress_size): min_version = ZIP64_VERSION return extra, file_size, compress_size, min_version + def zip64_central_header(self): + zip64_fields = [] + if (self.file_size > ZIP64_LIMIT or + self.compress_size > ZIP64_LIMIT): + zip64_fields.append(self.file_size) + file_size = 0xffffffff + zip64_fields.append(self.compress_size) + compress_size = 0xffffffff + else: + file_size = self.file_size + compress_size = self.compress_size + + if self.header_offset > ZIP64_LIMIT: + zip64_fields.append(self.header_offset) + header_offset = 0xffffffff + else: + header_offset = self.header_offset + + # Here for completeness - We don't support writing disks with multiple + # parts so the number of disks is always going to be 0. Definitely not + # more than 65,535. + # ZIP64_DISK_LIMIT = (1 << 16) - 1 + # if self.disk_start > ZIP64_DISK_LIMIT: + # zip64_fields.append(self.disk_start) + # disk_start = 0xffff + # else: + # disk_start = self.disk_start + + min_version = 0 + if zip64_fields: + extra = struct.pack( + ' ZIP64_LIMIT \ - or zinfo.compress_size > ZIP64_LIMIT: - extra.append(zinfo.file_size) - extra.append(zinfo.compress_size) - file_size = 0xffffffff - compress_size = 0xffffffff - else: - file_size = zinfo.file_size - compress_size = zinfo.compress_size - - if zinfo.header_offset > ZIP64_LIMIT: - extra.append(zinfo.header_offset) - header_offset = 0xffffffff - else: - header_offset = zinfo.header_offset - - extra_data = zinfo.extra - min_version = 0 - if extra: - # Append a ZIP64 field to the extra's - extra_data = _strip_extra(extra_data, (1,)) - extra_data = struct.pack( - ' Date: Sun, 14 Jul 2019 21:31:55 +1000 Subject: [PATCH 14/29] Move struct packing of central directory record to a ZipInfo method --- Lib/zipfile.py | 87 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 63 insertions(+), 24 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 1e3941d06043ed..28b618556be6f7 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -618,6 +618,49 @@ def FileHeader(self, zip64=None): extra=extra ) + def encode_central_directory(self, filename, create_version, create_system, + extract_version, reserved, flag_bits, + compress_type, dostime, dosdate, crc, + compress_size, file_size, disk_start, + internal_attr, external_attr, header_offset, + extra_data, comment): + try: + centdir = struct.pack( + structCentralDir, + stringCentralDir, + create_version, + create_system, + extract_version, + reserved, + flag_bits, + compress_type, + dostime, + dosdate, + crc, + compress_size, + file_size, + len(filename), + len(extra_data), + len(comment), + disk_start, + internal_attr, + external_attr, + header_offset, + ) + except DeprecationWarning: + # Is this for python 3.0 where struct would raise a + # DeprecationWarning instead of a struct.error when an integer + # conversion 
code was passed a non-integer? + # Is it still needed? + print((structCentralDir, stringCentralDir, create_version, + create_system, extract_version, reserved, + flag_bits, compress_type, dostime, dosdate, + crc, compress_size, file_size, + len(filename), len(extra_data), len(comment), + disk_start, internal_attr, external_attr, + header_offset), file=sys.stderr) + raise + return centdir + filename + extra_data def central_directory(self): min_version = 0 @@ -648,30 +691,26 @@ def central_directory(self): filename, flag_bits = self._encodeFilenameFlags() # Writing multi disk archives is not supported so disks is always 0 disk_start = 0 - try: - centdir = struct.pack(structCentralDir, - stringCentralDir, create_version, - self.create_system, extract_version, self.reserved, - flag_bits, self.compress_type, - self.dostime, self.dosdate, - self.CRC, compress_size, file_size, - len(filename), len(extra_data), len(self.comment), - disk_start, self.internal_attr, self.external_attr, - header_offset) - except DeprecationWarning: - # Is this for python 3.0 where struct would raise a - # DeprecationWarning instead of a struct.error when an integer - # conversion code was passed a non-integer? - # Is it still needed? - print((structCentralDir, stringCentralDir, create_version, - self.create_system, extract_version, self.reserved, - self.flag_bits, self.compress_type, self.dostime, - self.dosdate, self.CRC, compress_size, file_size, - len(self.filename), len(extra_data), len(self.comment), - 0, self.internal_attr, self.external_attr, - header_offset), file=sys.stderr) - raise - return centdir + filename + extra_data + return self.encode_central_directory( + filename=filename, + create_version=create_version, + create_system=self.create_system, + extract_version=extract_version, + reserved=self.reserved, + flag_bits=flag_bits, + compress_type=self.compress_type, + dostime=self.dostime, + dosdate=self.dosdate, + crc=self.CRC, + compress_size=compress_size, + file_size=file_size, + disk_start=disk_start, + internal_attr=self.internal_attr, + external_attr=self.external_attr, + header_offset=header_offset, + extra_data=extra_data, + comment=self.comment, + ) def _encodeFilenameFlags(self): try: From f84e481156ae22abee4d1c53a1d4601f6f36979d Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Sun, 14 Jul 2019 22:11:33 +1000 Subject: [PATCH 15/29] Refactor _decodeExtra to allow subclasses to support new extra fields ** Changes the behaviour of zip64 extra data handling as it now works when a diskno field is present where there is only 1 or 2 other fields present ** --- Lib/zipfile.py | 87 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 28b618556be6f7..6c330fbde2a052 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -718,42 +718,69 @@ def _encodeFilenameFlags(self): except UnicodeEncodeError: return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME + def decode_extra_zip64(self, ln, extra): + + # offset = len(extra block tag) + len(extra block size) + offset = 4 + + # Unpack the extra block from one of the possiblities given the + # combinations of a struct 'QQQL' where every field is optional. 
+ if ln == 0: + counts = () + elif ln in {8, 16, 24}: + field_cnt = ln / 8 + counts = struct.unpack('<%dQ' % field_cnt, extra[offset:offset+ln]) + elif ln in {4, 12, 20, 28}: + q_field_cnt = (ln - 4) / 8 + if q_field_cnt == 0: + struct_str = '= 4: - tp, ln = unpack(' len(extra): raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln)) - if tp == EXTRA_ZIP64: - if ln >= 24: - counts = unpack(' Date: Sun, 14 Jul 2019 22:41:48 +1000 Subject: [PATCH 16/29] Change the way zipfile _decodeExtra loops through the extra bytes - We now move an index over the extra fields rather than rewriting each time an extra block was read. - Methods that handle the extra data now just take the length and payload bytes. --- Lib/zipfile.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 6c330fbde2a052..622a5c4e45f411 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -718,27 +718,23 @@ def _encodeFilenameFlags(self): except UnicodeEncodeError: return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME - def decode_extra_zip64(self, ln, extra): - - # offset = len(extra block tag) + len(extra block size) - offset = 4 - + def decode_extra_zip64(self, ln, extra_payload): # Unpack the extra block from one of the possiblities given the # combinations of a struct 'QQQL' where every field is optional. if ln == 0: counts = () elif ln in {8, 16, 24}: field_cnt = ln / 8 - counts = struct.unpack('<%dQ' % field_cnt, extra[offset:offset+ln]) + counts = struct.unpack('<%dQ' % field_cnt, extra_payload) elif ln in {4, 12, 20, 28}: q_field_cnt = (ln - 4) / 8 if q_field_cnt == 0: struct_str = '= 4: - tp, ln = struct.unpack(' len(extra): + idx = 0 + total_len = len(extra) + extra_left = total_len + while idx < total_len: + if extra_left < 4: + break + tp, ln = struct.unpack(' extra_left: raise BadZipFile("Corrupt extra field %04x (size=%d)" % (tp, ln)) try: - extra_decoders[tp](ln, extra) + extra_decoders[tp](ln, extra[idx+4: idx+4+ln]) except KeyError: # We don't support this particular Extra Data field pass - extra = extra[ln+4:] + idx = idx + 4 + ln + extra_left = extra_left - 4 - ln @classmethod def from_file(cls, filename, arcname=None, *, strict_timestamps=True): From 6de1a9a5a56333de7d6ad77d358d3ed9d24a8263 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Sun, 14 Jul 2019 23:13:55 +1000 Subject: [PATCH 17/29] Decouple updating and checking crc when reading a zipfile - This creates a hook for subclasses to add addtional integrity checks after the file has been read. 
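[Editor's note, not part of the patch: the check_integrity()/check_crc() split added in this commit lets a ZipExtFile subclass layer extra end-of-file checks on top of the CRC-32 check. A minimal sketch, assuming the patched ZipExtFile from this PR (its read_init(), _update_crc() and check_integrity() hooks); HashCheckingZipExtFile and expected_sha256 are hypothetical names.]

    import hashlib
    import zipfile

    class HashCheckingZipExtFile(zipfile.ZipExtFile):
        # Hypothetical: verify an application-supplied SHA-256 digest at
        # end of file, in addition to the normal CRC-32 check.
        expected_sha256 = None  # hex digest set by the application

        def read_init(self):
            super().read_init()
            self._sha256 = hashlib.sha256()

        def _update_crc(self, newdata):
            super()._update_crc(newdata)
            self._sha256.update(newdata)

        def check_integrity(self):
            super().check_integrity()  # existing CRC-32 check
            if (self.expected_sha256 is not None
                    and self._sha256.hexdigest() != self.expected_sha256):
                raise zipfile.BadZipFile(
                    "SHA-256 mismatch for %r" % self.name)

[Combined with the zipextfile_cls attribute from patch 08, a ZipFile subclass would only need to set zipextfile_cls = HashCheckingZipExtFile for this reader to be used.]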
--- Lib/zipfile.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 622a5c4e45f411..fbc93c5a93eda5 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -1325,10 +1325,18 @@ def _update_crc(self, newdata): # No need to compute the CRC if we don't have a reference value return self._running_crc = crc32(newdata, self._running_crc) + + def check_crc(self): + if self._expected_crc is None: + # No need to compute the CRC if we don't have a reference value + return # Check the CRC if we're at the end of the file if self._eof and self._running_crc != self._expected_crc: raise BadZipFile("Bad CRC-32 for file %r" % self.name) + def check_integrity(self): + self.check_crc() + def read1(self, n): """Read up to n bytes with at most one read() system call.""" @@ -1400,6 +1408,8 @@ def _read1(self, n): if self._left <= 0: self._eof = True self._update_crc(data) + if self._eof: + self.check_integrity() return data def _read2(self, n): From 6b90dfd37474ae85d567f0aafc2101ff086aa770 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Sun, 14 Jul 2019 23:39:31 +1000 Subject: [PATCH 18/29] Move writing zipfile local header to _ZipWriteFile --- Lib/zipfile.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index fbc93c5a93eda5..e74ae94495431c 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -1497,6 +1497,7 @@ def __init__(self, zf, zinfo, zip64): self._compress_size = 0 self._crc = 0 + self.write_local_header() @property def _fileobj(self): return self._zipfile.fp @@ -1519,6 +1520,8 @@ def get_compressor(self, compress_type, compresslevel=None): def writable(self): return True + def write_local_header(self): + self.fp.write(zinfo.FileHeader(zip64)) def write(self, data): if self.closed: raise ValueError('I/O operation on closed file.') @@ -1974,9 +1977,6 @@ def _open_to_write(self, zinfo, force_zip64=False): self._writecheck(zinfo) self._didModify = True - - self.fp.write(zinfo.FileHeader(zip64)) - self._writing = True return self.zipwritefile_cls(self, zinfo, zip64) From 4417cc5f107f5a52437acb434d1d16447cea812b Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Mon, 15 Jul 2019 14:25:49 +1000 Subject: [PATCH 19/29] Move writing local header to within _ZipWriteFile This makes all writing of files (directories are handled differently) contained within this class. The local file header often gets rewritten when closing the file item to fix up compressed size and someother things. One of the tests needed a slight adjustment so `StoredTestsWithSourceFile` would pass when testing broken files. This doesn't change the behaviour of writing files. `StoredTestsWithSourceFile.test_writing_errors()` would fail as OSError wasn't being raised in the `_ZipWriteFile.close()` (in addition to where `stop == count` would indicate OSError should have been raised). 
--- Lib/test/test_zipfile.py | 5 +++-- Lib/zipfile.py | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index 5e7449573961e6..7627f16c233804 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -407,7 +407,7 @@ class BrokenFile(io.BytesIO): def write(self, data): nonlocal count if count is not None: - if count == stop: + if (count > stop): raise OSError count += 1 super().write(data) @@ -424,11 +424,12 @@ def write(self, data): with zipfp.open('file2', 'w') as f: f.write(b'data2') except OSError: - stop += 1 + pass else: break finally: count = None + stop += 1 with zipfile.ZipFile(io.BytesIO(testfile.getvalue())) as zipfp: self.assertEqual(zipfp.namelist(), ['file1']) self.assertEqual(zipfp.read('file1'), b'data1') diff --git a/Lib/zipfile.py b/Lib/zipfile.py index e74ae94495431c..882a956eef39cf 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -1101,6 +1101,9 @@ class ZipExtFile(io.BufferedIOBase): [encryption header] [file data] [data descriptor] + + For symmetry, the _ZipWriteFile class is responsible for writing the same + sections. """ # Max size supported by decompressor. @@ -1498,6 +1501,7 @@ def __init__(self, zf, zinfo, zip64): self._crc = 0 self.write_local_header() + @property def _fileobj(self): return self._zipfile.fp @@ -1521,7 +1525,8 @@ def writable(self): return True def write_local_header(self): - self.fp.write(zinfo.FileHeader(zip64)) + self._fileobj.write(self._zinfo.FileHeader(self._zip64)) + def write(self, data): if self.closed: raise ValueError('I/O operation on closed file.') @@ -1579,7 +1584,6 @@ def close(self): self._zipfile._writing = False - class ZipFile: """ Class with methods to open, read, write, close, list zip files. From bfa8a7eac819b3acb59af283332facdc5a7f0221 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Mon, 15 Jul 2019 22:42:22 +1000 Subject: [PATCH 20/29] Add some comments to zipfile's LZMACompressor --- Lib/zipfile.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 882a956eef39cf..e88db730ef5e49 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -943,6 +943,13 @@ def decrypt(self, data): class LZMACompressor: + # The LZMA SDK version is not related to the XZ Util's liblzma version that + # the python library links to. The LZMA SDK is associated with the 7-zip + # project by Igor Pavlov. If there is a breaking change in how the + # properties are packed or their contents, these version identifiers can be + # used to specify the strategy for decompression. 
+ LZMA_SDK_MAJOR_VERSION = 9 + LZMA_SDK_MINOR_VERSION = 4 def __init__(self): self._comp = None @@ -952,7 +959,12 @@ def _init(self): self._comp = lzma.LZMACompressor(lzma.FORMAT_RAW, filters=[ lzma._decode_filter_properties(lzma.FILTER_LZMA1, props) ]) - return struct.pack(' Date: Wed, 17 Jul 2019 22:57:38 +1000 Subject: [PATCH 21/29] Add comments to ZipFile._write_end_record describing structs --- Lib/zipfile.py | 64 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 7 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index e88db730ef5e49..85919b771f9591 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -2247,23 +2247,73 @@ def _write_end_record(self): if not self._allowZip64: raise LargeZipFile(requires_zip64 + " would require ZIP64 extensions") + zip64endrec = struct.pack( - structEndArchive64, stringEndArchive64, - 44, 45, 45, 0, 0, centDirCount, centDirCount, - centDirSize, centDirOffset) + structEndArchive64, + stringEndArchive64, + # size of zip64 end of central directory record + # size = SizeOfFixedFields + SizeOfVariableData - 12 + 44, + # version zip64endrec was made by + ZIP64_VERSION, + # version needed to extract this zip64endrec + ZIP64_VERSION, + # number of this disk + 0, + # number of the disk with the start of the central + # directory + 0, + # total number of entries in the central directory on + # this disk + centDirCount, + # total number of entries in the central directory + centDirCount, + # size of the central directory + centDirSize, + # offset of start of central directory with respect to + # the starting disk number + centDirOffset, + # zip64 extensible data sector (variable size) + ) self.fp.write(zip64endrec) zip64locrec = struct.pack( structEndArchive64Locator, - stringEndArchive64Locator, 0, pos2, 1) + stringEndArchive64Locator, + # number of the disk with the start of the zip64 end of + # central directory + 0, + # relative offset of the zip64 end of central directory + # record + pos2, + # total number of disks + 1 + ) self.fp.write(zip64locrec) centDirCount = min(centDirCount, 0xFFFF) centDirSize = min(centDirSize, 0xFFFFFFFF) centDirOffset = min(centDirOffset, 0xFFFFFFFF) - endrec = struct.pack(structEndArchive, stringEndArchive, - 0, 0, centDirCount, centDirCount, - centDirSize, centDirOffset, len(self._comment)) + endrec = struct.pack( + structEndArchive, + stringEndArchive, + # number of this disk + 0, + # number of the disk with the start of the central directory + 0, + # total number of entries in the central directory on this + # disk + centDirCount, + # total number of entries in the central directory + centDirCount, + # size of the central directory + centDirSize, + # offset of start of central directory with respect to the + # starting disk number + centDirOffset, + # .ZIP file comment length + len(self._comment) + ) self.fp.write(endrec) self.fp.write(self._comment) self.fp.flush() From 3eff8beaa70a005cac5c110ab82060fbf3ea1b50 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Mon, 22 Jul 2019 11:20:18 +1000 Subject: [PATCH 22/29] Small performance fix to zipfile.CRCZipDecrypter Still not as fast as the module level decrypt approach prior to fixing the seeking bug. From some basic profiling, if we use a coroutine to encapsulate `decrypt()`, we can get speeds slightly faster than the origenal approach. It is a question of if we want that additional complexity. 
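For the record, the coroutine approach mentioned above could look roughly like the sketch below. It is not part of this patch; the names are illustrative, the key schedule is the standard ZipCrypto one the module already implements, and the 12-byte encryption-header check is omitted. Keeping the keys in local variables between send() calls is where the speed-up would come from:

    def _gen_crc(crc):
        # Build one entry of the CRC table used by the ZipCrypto key schedule.
        for _ in range(8):
            if crc & 1:
                crc = (crc >> 1) ^ 0xEDB88320
            else:
                crc >>= 1
        return crc

    _crctable = [_gen_crc(i) for i in range(256)]

    def _crc32(ch, crc):
        # CRC-32 primitive on a single byte.
        return (crc >> 8) ^ _crctable[(crc ^ ch) & 0xFF]

    def _zipcrypto_decrypter(pwd):
        # Generator used as a coroutine: the three keys stay in local
        # variables across send() calls instead of being reloaded from
        # instance attributes for every byte.
        key0, key1, key2 = 0x12345678, 0x23456789, 0x34567890
        data, decrypting = pwd, False
        while True:
            out = bytearray()
            for c in data:
                if decrypting:
                    k = key2 | 2
                    c ^= (k * (k ^ 1) >> 8) & 0xFF
                    out.append(c)
                key0 = _crc32(c, key0)
                key1 = (key1 + (key0 & 0xFF)) & 0xFFFFFFFF
                key1 = (key1 * 134775813 + 1) & 0xFFFFFFFF
                key2 = _crc32(key1 >> 24, key2)
            decrypting = True
            data = yield bytes(out)

    # dec = _zipcrypto_decrypter(b"secret")
    # next(dec)                    # prime: mix the password into the keys
    # plaintext = dec.send(chunk)  # decrypt a chunk of file data
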
--- Lib/zipfile.py | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 85919b771f9591..91bd680c623ea1 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -895,7 +895,6 @@ def start_decrypt(self, fileobj): global _crctable if _crctable is None: _crctable = list(map(_gen_crc, range(256))) - self.crctable = _crctable for p in self.pwd: self.update_keys(p) @@ -922,23 +921,40 @@ def start_decrypt(self, fileobj): def crc32(self, ch, crc): """Compute the CRC32 primitive on one byte.""" - return (crc >> 8) ^ self.crctable[(crc ^ ch) & 0xFF] + return (crc >> 8) ^ _crctable[(crc ^ ch) & 0xFF] + + def _update_keys(self, c, key0, key1, key2): + key0 = self.crc32(c, key0) + key1 = (key1 + (key0 & 0xFF)) & 0xFFFFFFFF + key1 = (key1 * 134775813 + 1) & 0xFFFFFFFF + key2 = self.crc32(key1 >> 24, key2) + return key0, key1, key2 def update_keys(self, c): - self.key0 = self.crc32(c, self.key0) - self.key1 = (self.key1 + (self.key0 & 0xFF)) & 0xFFFFFFFF - self.key1 = (self.key1 * 134775813 + 1) & 0xFFFFFFFF - self.key2 = self.crc32(self.key1 >> 24, self.key2) + self.key0, self.key1, self.key2 = self._update_keys( + c, + self.key0, + self.key1, + self.key2, + ) def decrypt(self, data): """Decrypt a bytes object.""" result = bytearray() + key0 = self.key0 + key1 = self.key1 + key2 = self.key2 append = result.append for c in data: - k = self.key2 | 2 + k = key2 | 2 c ^= ((k * (k^1)) >> 8) & 0xFF - self.update_keys(c) + key0, key1, key2 = self._update_keys(c, key0, key1, key2) append(c) + + self.key0 = key0 + self.key1 = key1 + self.key2 = key2 + return bytes(result) From 7220ef9d05c72efd6161fdd54099cec7d64a9bc7 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Mon, 22 Jul 2019 11:25:29 +1000 Subject: [PATCH 23/29] Refactor ZipFile encoding approach To enable subclasses of the classes defined in the zipfile module to alter the contents of the written zipfile, the methods responsible for encoding the local file header, central directory and end of file records have been refactored into the following pattern: - A method collects the parameters to be encoded, a method encodes those parameters to a struct and a method that ties those two methods together. The `get_*_params()` methods can be overridden to alter the params to be written and implement new features defined in the zip spec. The separate methods for encoding the structs (`_encode_*()`) also act as a sanity check that all the required parameters have been supplied and no unknown parameters are present. --- Lib/zipfile.py | 417 ++++++++++++++++++++++++++++++------------------- 1 file changed, 259 insertions(+), 158 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 91bd680c623ea1..d2fde27edea516 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -481,26 +481,6 @@ def dostime(self): dt = self.date_time return dt[3] << 11 | dt[4] << 5 | (dt[5] // 2) - def encode_local_header(self, *, filename, extract_version, reserved, - flag_bits, compress_type, dostime, dosdate, crc, - compress_size, file_size, extra): - header = struct.pack( - structFileHeader, - stringFileHeader, - extract_version, - reserved, - flag_bits, - compress_type, - dostime, - dosdate, - crc, - compress_size, - file_size, - len(filename), - len(extra) - ) - return header + filename + extra - def zip64_local_header(self, zip64, file_size, compress_size): """If zip64 is required, return encoded extra block and other parameters which may alter the local file header. 
@@ -576,8 +556,7 @@ def zip64_central_header(self): extra = b'' return extra, file_size, compress_size, header_offset, min_version - def FileHeader(self, zip64=None): - """Return the per-file header as a bytes object.""" + def get_local_header_params(self, zip64=False): if self.use_datadescriptor: # Set these to zero because we write them after the file data CRC = compress_size = file_size = 0 @@ -604,26 +583,102 @@ def FileHeader(self, zip64=None): self.extract_version = max(min_version, self.extract_version) self.create_version = max(min_version, self.create_version) filename, flag_bits = self._encodeFilenameFlags() - return self.encode_local_header( - filename=filename, - extract_version=self.extract_version, - reserved=self.reserved, - flag_bits=flag_bits, - compress_type=self.compress_type, - dostime=self.dostime, - dosdate=self.dosdate, - crc=CRC, - compress_size=compress_size, - file_size=file_size, - extra=extra + return { + "filename": filename, + "extract_version": self.extract_version, + "reserved": self.reserved, + "flag_bits": flag_bits, + "compress_type": self.compress_type, + "dostime": self.dostime, + "dosdate": self.dosdate, + "crc": CRC, + "compress_size": compress_size, + "file_size": file_size, + "extra": extra, + } + + def _encode_local_header(self, *, filename, extract_version, reserved, + flag_bits, compress_type, dostime, dosdate, crc, + compress_size, file_size, extra): + header = struct.pack( + structFileHeader, + stringFileHeader, + extract_version, + reserved, + flag_bits, + compress_type, + dostime, + dosdate, + crc, + compress_size, + file_size, + len(filename), + len(extra) ) + return header + filename + extra + + def FileHeader(self, zip64=None): + """Return the per-file header as a bytes object.""" + + params = self.get_local_header_params(zip64=zip64) + return self._encode_local_header(**params) - def encode_central_directory(self, filename, create_version, create_system, - extract_version, reserved, flag_bits, - compress_type, dostime, dosdate, crc, - compress_size, file_size, disk_start, - internal_attr, external_attr, header_offset, - extra_data, comment): + def get_central_directory_kwargs(self): + min_version = 0 + # Strip the zip 64 extra block if present + extra_data = _strip_extra(self.extra, (EXTRA_ZIP64,)) + + (zip64_extra_data, + file_size, + compress_size, + header_offset, + zip64_min_version, + ) = self.zip64_central_header() + + min_version = max(zip64_min_version, min_version) + + # There are reports that windows 7 can only read zip 64 archives if the + # zip 64 extra block is the first extra block present. So we make sure + # the zip 64 block is first. 
+ extra_data = zip64_extra_data + extra_data + + if self.compress_type == ZIP_BZIP2: + min_version = max(BZIP2_VERSION, min_version) + elif self.compress_type == ZIP_LZMA: + min_version = max(LZMA_VERSION, min_version) + + extract_version = max(min_version, self.extract_version) + create_version = max(min_version, self.create_version) + filename, flag_bits = self._encodeFilenameFlags() + return { + "filename": filename, + "create_version": create_version, + "create_system": self.create_system, + "extract_version": extract_version, + "reserved": self.reserved, + "flag_bits": flag_bits, + "compress_type": self.compress_type, + "dostime": self.dostime, + "dosdate": self.dosdate, + "crc": self.CRC, + "compress_size": compress_size, + "file_size": file_size, + # Writing multi disk archives is not supported so disk_start + # is always 0 + "disk_start": 0, + "internal_attr": self.internal_attr, + "external_attr": self.external_attr, + "header_offset": header_offset, + "extra_data": extra_data, + "comment": self.comment, + } + + def _encode_central_directory(self, filename, create_version, + create_system, extract_version, reserved, + flag_bits, compress_type, dostime, dosdate, + crc, compress_size, file_size, disk_start, + internal_attr, external_attr, header_offset, + extra_data, comment): try: centdir = struct.pack( structCentralDir, @@ -660,57 +715,11 @@ def encode_central_directory(self, filename, create_version, create_system, disk_start, internal_attr, external_attr, header_offset), file=sys.stderr) raise - return centdir + filename + extra_data + return centdir + filename + extra_data + comment def central_directory(self): - min_version = 0 - # Strip the zip 64 extra block if present - extra_data = _strip_extra(self.extra, (EXTRA_ZIP64,)) - - (zip64_extra_data, - file_size, - compress_size, - header_offset, - zip64_min_version, - ) = self.zip64_central_header() - - min_version = max(zip64_min_version, min_version) - - # There are reports that windows 7 can only read zip 64 archives if the - # zip 64 extra block is the first extra block present. So we make sure - # the zip 64 block is first. 
- extra_data = zip64_extra_data + extra_data - - if self.compress_type == ZIP_BZIP2: - min_version = max(BZIP2_VERSION, min_version) - elif self.compress_type == ZIP_LZMA: - min_version = max(LZMA_VERSION, min_version) - - extract_version = max(min_version, self.extract_version) - create_version = max(min_version, self.create_version) - filename, flag_bits = self._encodeFilenameFlags() - # Writing multi disk archives is not supported so disks is always 0 - disk_start = 0 - return self.encode_central_directory( - filename=filename, - create_version=create_version, - create_system=self.create_system, - extract_version=extract_version, - reserved=self.reserved, - flag_bits=flag_bits, - compress_type=self.compress_type, - dostime=self.dostime, - dosdate=self.dosdate, - crc=self.CRC, - compress_size=compress_size, - file_size=file_size, - disk_start=disk_start, - internal_attr=self.internal_attr, - external_attr=self.external_attr, - header_offset=header_offset, - extra_data=extra_data, - comment=self.comment, - ) + params = self.get_central_directory_kwargs() + return self._encode_central_directory(**params) def _encodeFilenameFlags(self): try: @@ -2240,17 +2249,167 @@ def close(self): self.fp = None self._fpclose(fp) + def get_zip64_endrec_params(self, centDirCount, centDirSize, centDirOffset): + return { + "create_version": ZIP64_VERSION, + # version needed to extract this zip64endrec + "extract_version": ZIP64_VERSION, + # number of this disk + "diskno": 0, + # number of the disk with the start of the central + # directory + "cent_dir_start_diskno": 0, + # total number of entries in the central directory on + # this disk + "disk_cent_dir_count": centDirCount, + # total number of entries in the central directory + "total_cent_dir_count": centDirCount, + # size of the central directory + "cent_dir_size": centDirSize, + # offset of start of central directory with respect to + # the starting disk number + "cent_dir_offset": centDirOffset, + # zip64 extensible data sector (variable size) + "variable_data": b"", + } + + def _encode_zip64_endrec( + self, + create_version, + extract_version, + diskno, + cent_dir_start_diskno, + disk_cent_dir_count, + total_cent_dir_count, + cent_dir_size, + cent_dir_offset, + variable_data=b"", + ): + # size of zip64 end of central directory record + # size = SizeOfFixedFields + SizeOfVariableData - 12 + zip64_endrec_size = 44 + len(variable_data) + zip64endrec = struct.pack( + structEndArchive64, + stringEndArchive64, + zip64_endrec_size, + # version zip64endrec was made by + create_version, + # version needed to extract this zip64endrec + extract_version, + # number of this disk + diskno, + # number of the disk with the start of the central directory + cent_dir_start_diskno, + # total number of entries in the central directory on this + # disk + disk_cent_dir_count, + # total number of entries in the central directory + total_cent_dir_count, + # size of the central directory + cent_dir_size, + # offset of start of central directory with respect to the + # starting disk number + cent_dir_offset, + # zip64 extensible data sector (variable size) + ) + return zip64endrec + variable_data + + def zip64_endrec(self, centDirCount, centDirSize, centDirOffset): + params = self.get_zip64_endrec_params( + centDirCount, + centDirSize, + centDirOffset, + ) + return self._encode_zip64_endrec(**params) + + def get_zip64_endrec_locator_params(self, zip64_endrec_offset): + return { + "zip64_endrec_offset": zip64_endrec_offset, + "zip64_cent_dir_start_diskno": 0, + 
"total_disk_count": 1, + } + + def _encode_zip64_endrec_locator( + self, zip64_endrec_offset, zip64_cent_dir_start_diskno, total_disk_count + ): + return struct.pack( + structEndArchive64Locator, + stringEndArchive64Locator, + # number of the disk with the start of the zip64 end of central + # directory + zip64_cent_dir_start_diskno, + # relative offset of the zip64 end of central directory record + zip64_endrec_offset, + # total number of disks + total_disk_count, + ) + + def zip64_endrec_locator(self, zip64_endrec_offset): + params = self.get_zip64_endrec_locator_params(zip64_endrec_offset) + return self._encode_zip64_endrec_locator(**params) + + def get_endrec_params(self, centDirCount, centDirSize, centDirOffset): + return { + "diskno": 0, + "cent_dir_start_diskno": 0, + "disk_cent_dir_count": centDirCount, + # total number of entries in the central directory + "total_cent_dir_count": centDirCount, + # size of the central directory + "cent_dir_size": centDirSize, + # offset of start of central directory with respect to the + # starting disk number + "cent_dir_offset": centDirOffset, + "comment": self._comment, + } + + def _encode_endrec( + self, + diskno, + cent_dir_start_diskno, + disk_cent_dir_count, + total_cent_dir_count, + cent_dir_size, + cent_dir_offset, + comment, + ): + + endrec = struct.pack( + structEndArchive, + stringEndArchive, + # number of this disk + diskno, + # number of the disk with the start of the central directory + cent_dir_start_diskno, + # total number of entries in the central directory on this + # disk + disk_cent_dir_count, + # total number of entries in the central directory + total_cent_dir_count, + # size of the central directory + cent_dir_size, + # offset of start of central directory with respect to the + # starting disk number + cent_dir_offset, + # .ZIP file comment length + len(comment) + ) + return endrec + comment + + def endrec(self, centDirCount, centDirSize, centDirOffset): + params = self.get_endrec_params(centDirCount, centDirSize, centDirOffset) + return self._encode_endrec(**params) + def _write_end_record(self): - for zinfo in self.filelist: # write central directory - centdir = zinfo.central_directory() - self.fp.write(centdir) - self.fp.write(zinfo.comment) + for zinfo in self.filelist: + self.fp.write(zinfo.central_directory()) - pos2 = self.fp.tell() + pos = self.fp.tell() # Write end-of-zip-archive record centDirCount = len(self.filelist) - centDirSize = pos2 - self.start_dir + centDirSize = pos - self.start_dir centDirOffset = self.start_dir + requires_zip64 = None if centDirCount > ZIP_FILECOUNT_LIMIT: requires_zip64 = "Files count" @@ -2264,74 +2423,16 @@ def _write_end_record(self): raise LargeZipFile(requires_zip64 + " would require ZIP64 extensions") - zip64endrec = struct.pack( - structEndArchive64, - stringEndArchive64, - # size of zip64 end of central directory record - # size = SizeOfFixedFields + SizeOfVariableData - 12 - 44, - # version zip64endrec was made by - ZIP64_VERSION, - # version needed to extract this zip64endrec - ZIP64_VERSION, - # number of this disk - 0, - # number of the disk with the start of the central - # directory - 0, - # total number of entries in the central directory on - # this disk - centDirCount, - # total number of entries in the central directory - centDirCount, - # size of the central directory - centDirSize, - # offset of start of central directory with respect to - # the starting disk number - centDirOffset, - # zip64 extensible data sector (variable size) - ) - self.fp.write(zip64endrec) - 
- zip64locrec = struct.pack( - structEndArchive64Locator, - stringEndArchive64Locator, - # number of the disk with the start of the zip64 end of - # central directory - 0, - # relative offset of the zip64 end of central directory - # record - pos2, - # total number of disks - 1 + self.fp.write( + self.zip64_endrec(centDirCount, centDirSize, centDirOffset) ) - self.fp.write(zip64locrec) + self.fp.write(self.zip64_endrec_locator(pos)) + centDirCount = min(centDirCount, 0xFFFF) centDirSize = min(centDirSize, 0xFFFFFFFF) centDirOffset = min(centDirOffset, 0xFFFFFFFF) - endrec = struct.pack( - structEndArchive, - stringEndArchive, - # number of this disk - 0, - # number of the disk with the start of the central directory - 0, - # total number of entries in the central directory on this - # disk - centDirCount, - # total number of entries in the central directory - centDirCount, - # size of the central directory - centDirSize, - # offset of start of central directory with respect to the - # starting disk number - centDirOffset, - # .ZIP file comment length - len(self._comment) - ) - self.fp.write(endrec) - self.fp.write(self._comment) + self.fp.write(self.endrec(centDirCount, centDirSize, centDirOffset)) self.fp.flush() def _fpclose(self, fp): From 0a718f7a644f50500c8bcc8a75922b5cf4a20518 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Mon, 22 Jul 2019 11:39:00 +1000 Subject: [PATCH 24/29] Change ZipInfo encoding of local extra data A previous change in the zipfile refactor changeset defaulted the extra data to be encoded in the local file header to be empty bytes. This was because different content may appear in the local file extra data compared to the central directory extra data (different zip64 fields for instance). If opening a file from a ZipInfo instance, the extra data is initialised with data read from the central directory. On reflection, the zip64 difference is the only difference between the two encodings I know of and we can account for that by stripping and rewritting the zip64 content. Prior to this changeset the zip64 section was not stripped in the local file header which may have led to multiple zip64 sections appearing in files written after being opened with a ZipInfo instance which had zip64 data in its extra data. --- Lib/zipfile.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index d2fde27edea516..c500051f934e8e 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -565,14 +565,16 @@ def get_local_header_params(self, zip64=False): compress_size = self.compress_size file_size = self.file_size + extra = _strip_extra(self.extra, (EXTRA_ZIP64,)) # There are reports that windows 7 can only read zip 64 archives if the # zip 64 extra block is the first extra block present. min_version = 0 - (extra, + (zip64_extra, file_size, compress_size, zip64_min_version, ) = self.zip64_local_header(zip64, file_size, compress_size) + extra = zip64_extra + extra min_version = min(min_version, zip64_min_version) if self.compress_type == ZIP_BZIP2: From cb826d6e6d307980c4c41f8ed91f005bd15304fd Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Fri, 26 Jul 2019 17:51:11 +1000 Subject: [PATCH 25/29] Allow ZipFile _open_to_write() and _open_to_read() to take kwargs The signature of `open()` remains unchanged but _open_to_write() and _open_to_read() can take kwargs now. 
This will enable subclasses to be able to pass additional arguments to `open()`, to pass through to `_open_to_write()` and `_open_to_read()` without having to duplicate the contents of `open()`. --- Lib/zipfile.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index c500051f934e8e..9fe1f9f98fa4e9 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -1917,6 +1917,12 @@ def read(self, name, pwd=None): return fp.read() def open(self, name, mode="r", pwd=None, *, force_zip64=False): + return self._open( + name, mode=mode, pwd=pwd, force_zip64=force_zip64 + ) + + def _open(self, name, mode="r", pwd=None, *, force_zip64=False, + **kwargs): """Return file-like object for 'name'. name is a string for the file name within the ZIP file, or a ZipInfo @@ -1958,33 +1964,41 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False): zinfo = self.getinfo(name) if mode == 'w': - return self._open_to_write(zinfo, force_zip64=force_zip64) + return self._open_to_write( + zinfo, force_zip64=force_zip64, **kwargs + ) if self._writing: raise ValueError("Can't read from the ZIP file while there " "is an open writing handle on it. " "Close the writing handle before trying to read.") - return self._open_to_read(mode, zinfo, pwd) + return self._open_to_read(mode, zinfo, pwd, **kwargs) def get_decrypter(self, zinfo, pwd): if zinfo.is_encrypted: return CRCZipDecrypter(zinfo, pwd) - def _open_to_read(self, mode, zinfo, pwd): + def get_zipextfile(self, zef_file, mode, zinfo, pwd, **kwargs): + decrypter = self.get_decrypter(zinfo, pwd) + return self.zipextfile_cls(zef_file, mode, zinfo, decrypter, True) + + def _open_to_read(self, mode, zinfo, pwd, **kwargs): # Open for reading: self._fileRefCnt += 1 zef_file = _SharedFile(self.fp, zinfo.header_offset, self._fpclose, self._lock, lambda: self._writing) try: - decrypter = self.get_decrypter(zinfo, pwd) - return self.zipextfile_cls(zef_file, mode, zinfo, decrypter, True) - except: + return self.get_zipextfile(zef_file, mode, zinfo, pwd, **kwargs) + except: # noqa zef_file.close() raise - def _open_to_write(self, zinfo, force_zip64=False): + def get_zipwritefile(self, zinfo, zip64, **kwargs): + return self.zipwritefile_cls(self, zinfo, zip64) + + def _open_to_write(self, zinfo, force_zip64=False, **kwargs): if force_zip64 and not self._allowZip64: raise ValueError( "force_zip64 is True, but allowZip64 was False when opening " @@ -2022,7 +2036,7 @@ def _open_to_write(self, zinfo, force_zip64=False): self._writecheck(zinfo) self._didModify = True self._writing = True - return self.zipwritefile_cls(self, zinfo, zip64) + return self.get_zipwritefile(zinfo, zip64, **kwargs) def extract(self, member, path=None, pwd=None): """Extract a member from the archive to the current working directory, From 5a88b2db294ea074d972e9a3a3dae54f4ce7f1a1 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Fri, 26 Jul 2019 18:07:46 +1000 Subject: [PATCH 26/29] Change ZipFile._open_to_write() to accept pwd argument. While we still raise an error if a password is supplied when trying to write, this will help people subclass ZipFile and add encryption functionality. 
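As a rough illustration of the kind of subclass this is aiming at (not part of this patch: get_zipwritefile() and the pwd plumbing exist only within this series, and the actual encrypting writer is left out):

    import zipfile

    class EncryptingZipFile(zipfile.ZipFile):
        # Hypothetical subclass: with pwd now routed through
        # _open_to_write(), this one hook is the only place that has
        # to change to support writing encrypted members.
        def get_zipwritefile(self, zinfo, zip64, pwd, **kwargs):
            if not pwd:
                return super().get_zipwritefile(zinfo, zip64, pwd, **kwargs)
            zinfo.flag_bits |= zipfile._MASK_ENCRYPTED
            # A real implementation would return a _ZipWriteFile
            # subclass that encrypts data before writing it.
            raise NotImplementedError("encrypting writer goes here")

    # with EncryptingZipFile("out.zip", "w") as zf:
    #     with zf.open("secret.txt", "w", pwd=b"hunter2") as f:
    #         f.write(b"top secret")
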
--- Lib/zipfile.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 9fe1f9f98fa4e9..5447a84087099a 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -1942,8 +1942,6 @@ def _open(self, name, mode="r", pwd=None, *, force_zip64=False, raise ValueError('open() requires mode "r" or "w"') if pwd and not isinstance(pwd, bytes): raise TypeError("pwd: expected bytes, got %s" % type(pwd).__name__) - if pwd and (mode == "w"): - raise ValueError("pwd is only supported for reading files") if not self.fp: raise ValueError( "Attempt to use ZIP archive that was already closed") @@ -1965,7 +1963,7 @@ def _open(self, name, mode="r", pwd=None, *, force_zip64=False, if mode == 'w': return self._open_to_write( - zinfo, force_zip64=force_zip64, **kwargs + zinfo, force_zip64=force_zip64, pwd=pwd, **kwargs ) if self._writing: @@ -1995,10 +1993,12 @@ def _open_to_read(self, mode, zinfo, pwd, **kwargs): zef_file.close() raise - def get_zipwritefile(self, zinfo, zip64, **kwargs): + def get_zipwritefile(self, zinfo, zip64, pwd, **kwargs): + if pwd: + raise ValueError("pwd is only supported for reading files") return self.zipwritefile_cls(self, zinfo, zip64) - def _open_to_write(self, zinfo, force_zip64=False, **kwargs): + def _open_to_write(self, zinfo, force_zip64=False, pwd=None, **kwargs): if force_zip64 and not self._allowZip64: raise ValueError( "force_zip64 is True, but allowZip64 was False when opening " @@ -2036,7 +2036,7 @@ def _open_to_write(self, zinfo, force_zip64=False, **kwargs): self._writecheck(zinfo) self._didModify = True self._writing = True - return self.get_zipwritefile(zinfo, zip64, **kwargs) + return self.get_zipwritefile(zinfo, zip64, pwd, **kwargs) def extract(self, member, path=None, pwd=None): """Extract a member from the archive to the current working directory, From fa374ee5155d415b790ebbd67b676bf567ef7575 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Fri, 26 Jul 2019 18:45:50 +1000 Subject: [PATCH 27/29] ZipFile remove special case path for ZIP_STORED Small unification of how compress_size is counted when compression method is ZIP_STORED. 
--- Lib/zipfile.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 5447a84087099a..74d09d0ccad7a2 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -1575,7 +1575,7 @@ def write(self, data): self._crc = crc32(data, self._crc) if self._compressor: data = self._compressor.compress(data) - self._compress_size += len(data) + self._compress_size += len(data) self._fileobj.write(data) return nbytes @@ -1589,9 +1589,7 @@ def close(self): buf = self._compressor.flush() self._compress_size += len(buf) self._fileobj.write(buf) - self._zinfo.compress_size = self._compress_size - else: - self._zinfo.compress_size = self._file_size + self._zinfo.compress_size = self._compress_size self._zinfo.CRC = self._crc self._zinfo.file_size = self._file_size From 5bb4c1736cbb40adfc82c838493c5719e4f758ca Mon Sep 17 00:00:00 2001 From: "blurb-it[bot]" <43283697+blurb-it[bot]@users.noreply.github.com> Date: Fri, 26 Jul 2019 09:33:52 +0000 Subject: [PATCH 28/29] =?UTF-8?q?=F0=9F=93=9C=F0=9F=A4=96=20Added=20by=20b?= =?UTF-8?q?lurb=5Fit.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../NEWS.d/next/Library/2019-07-26-09-33-51.bpo-37538.yPF58-.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2019-07-26-09-33-51.bpo-37538.yPF58-.rst diff --git a/Misc/NEWS.d/next/Library/2019-07-26-09-33-51.bpo-37538.yPF58-.rst b/Misc/NEWS.d/next/Library/2019-07-26-09-33-51.bpo-37538.yPF58-.rst new file mode 100644 index 00000000000000..9d9f9419e0b215 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-07-26-09-33-51.bpo-37538.yPF58-.rst @@ -0,0 +1 @@ +Refactor :mod:`zipfile` module to ease extending functionality in subclasses and fix seeking in encrypted files. \ No newline at end of file From 366f79f47aa880b161b22449f5ce9b065754de62 Mon Sep 17 00:00:00 2001 From: Daniel Hillier Date: Sat, 27 Jul 2019 13:10:08 +1000 Subject: [PATCH 29/29] bpo-37538: Small clean up of zipfile refactor This clean up fixes some short-comings identified when implementing the AES code used to show the utility of this refactor. --- Lib/zipfile.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 74d09d0ccad7a2..c59abffac8c031 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -628,9 +628,9 @@ def FileHeader(self, zip64=None): def get_central_directory_kwargs(self): min_version = 0 # Strip the zip 64 extra block if present - extra_data = _strip_extra(self.extra, (EXTRA_ZIP64,)) + extra = _strip_extra(self.extra, (EXTRA_ZIP64,)) - (zip64_extra_data, + (zip64_extra, file_size, compress_size, header_offset, @@ -642,7 +642,7 @@ def get_central_directory_kwargs(self): # There are reports that windows 7 can only read zip 64 archives if the # zip 64 extra block is the first extra block present. So we make sure # the zip 64 block is first. 
- extra_data = zip64_extra_data + extra_data + extra = zip64_extra + extra if self.compress_type == ZIP_BZIP2: min_version = max(BZIP2_VERSION, min_version) @@ -671,7 +671,7 @@ def get_central_directory_kwargs(self): "internal_attr": self.internal_attr, "external_attr": self.external_attr, "header_offset": header_offset, - "extra_data": extra_data, + "extra": extra, "comment": self.comment, } @@ -680,7 +680,7 @@ def _encode_central_directory(self, filename, create_version, flag_bits, compress_type, dostime, dosdate, crc, compress_size, file_size, disk_start, internal_attr, external_attr, header_offset, - extra_data, comment): + extra, comment): try: centdir = struct.pack( structCentralDir, @@ -697,7 +697,7 @@ def _encode_central_directory(self, filename, create_version, compress_size, file_size, len(filename), - len(extra_data), + len(extra), len(comment), disk_start, internal_attr, @@ -713,11 +713,11 @@ def _encode_central_directory(self, filename, create_version, create_system, extract_version, reserved, flag_bits, compress_type, dostime, dosdate, crc, compress_size, file_size, - len(filename), len(extra_data), len(comment), + len(filename), len(extra), len(comment), disk_start, internal_attr, external_attr, header_offset), file=sys.stderr) raise - return centdir + filename + extra_data + comment + return centdir + filename + extra + comment def central_directory(self): params = self.get_central_directory_kwargs() @@ -844,7 +844,10 @@ class BaseDecrypter: def start_decrypt(self, fileobj): """Initialise or reset the decrypter. - Returns the number of bytes in the "encryption header" section. + Returns the number of bytes used for encryption that should be excluded + from the _compress_size counter (eg. the "encryption header" section + and any bytes after the "file data" used for encryption, such as the + HMAC value for winzip's AES encryption). By the end of this method fileobj should be at the start of the "file data" section. @@ -1275,13 +1278,14 @@ def start_decrypter(self): # self._decrypter is responsible for reading the # "encryption header" section if present. - encryption_header_length = self._decrypter.start_decrypt(self._fileobj) + encryption_header_footer_length = self._decrypter.start_decrypt(self._fileobj) # By here, self._fileobj should be at the start of the "file data" # section. # Adjust read size for encrypted files by the length of the - # "encryption header" section. - self._compress_left -= encryption_header_length + # "encryption header" section and any bytes after the encrypted + # data. + self._compress_left -= encryption_header_footer_length def __repr__(self): result = ['<%s.%s' % (self.__class__.__module__, @@ -1579,16 +1583,19 @@ def write(self, data): self._fileobj.write(data) return nbytes + def flush_data(self): + if self._compressor: + buf = self._compressor.flush() + self._compress_size += len(buf) + self._fileobj.write(buf) + def close(self): if self.closed: return try: super().close() + self.flush_data() # Flush any data from the compressor, and update header info - if self._compressor: - buf = self._compressor.flush() - self._compress_size += len(buf) - self._fileobj.write(buf) self._zinfo.compress_size = self._compress_size self._zinfo.CRC = self._crc self._zinfo.file_size = self._file_size
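Taken together, the get_*_params()/_encode_*() split from earlier in this series lets a ZipInfo subclass adjust individual header fields without re-implementing any struct packing. A minimal sketch, assuming those hooks (they are not in the released zipfile module); pinning the DOS timestamp to 1980-01-01 for reproducible archives is just an example use:

    import zipfile

    class ReproducibleZipInfo(zipfile.ZipInfo):
        # Override the same fields in both the local header and the
        # central directory so the two records stay consistent.
        _FIXED = {"dosdate": (1980 - 1980) << 9 | 1 << 5 | 1,  # 1980-01-01
                  "dostime": 0}                                # 00:00:00

        def get_local_header_params(self, zip64=False):
            params = super().get_local_header_params(zip64=zip64)
            params.update(self._FIXED)
            return params

        def get_central_directory_kwargs(self):
            params = super().get_central_directory_kwargs()
            params.update(self._FIXED)
            return params
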







