PyMOTW: zlib

By Doug Hellmann
December 28, 2008

zlib – Low-level access to the zlib compression library

Purpose: Low-level access to the zlib compression library
Python Version: 2.5 and later

The zlib module provides a lower-level interface to many of the functions in the zlib compression library.

Working with Data in Memory

The simplest way to work with zlib requires holding all of the data to be compressed or decompressed in memory, and then using compress() and decompress().

import zlib
import binascii

original_data = 'This is the original text.'
print 'Original :', len(original_data), original_data

compressed = zlib.compress(original_data)
print 'Compressed :', len(compressed), binascii.hexlify(compressed)

decompressed = zlib.decompress(compressed)
print 'Decompressed :', len(decompressed), decompressed

$ python zlib_memory.py
Original : 26 This is the original text.
Compressed : 32 789c0bc9c82c5600a2928c5485fca2ccf4ccbcc41c8592d48a123d007f2f097e
Decompressed : 26 This is the original text.

Notice that for short text, the compressed version can be longer than the original. While the actual results depend on the input data, it is interesting to observe the compression overhead for short bits of text.

import zlib

original_data = 'This is the original text.'

fmt = '%15s %15s'
print fmt % ('len(data)', 'len(compressed)')
print fmt % ('-' * 15, '-' * 15)

for i in xrange(20):
data = original_data * i
compressed = zlib.compress(data)
print fmt % (len(data), len(compressed)), '*' if len(data) < len(compressed) else ''

$ python zlib_lengths.py
len(data) len(compressed)
--------------- ---------------
0 8 *
26 32 *
52 35
78 35
104 36
130 36
156 36
182 36
208 36
234 36
260 36
286 36
312 37
338 37
364 38
390 38
416 38
442 38
468 38
494 38

Working with Streams

The in-memory approach has obvious drawbacks that make it impractical for real-world use cases. The alternative is to use Compress and Decompress objects to manipulate streams of data, so that the entire data set does not have to fit into memory.

The simple server below responds to requests consisting of filenames by writing a compressed version of the file to the socket used to communicate with the client. It has some artificial chunking in place to illustrate the buffering behavior that happens when the data passed to compress() or decompress() doesn’t result in a complete block of compressed or uncompressed output.

Warning

This server has obvious security implications. Do not run it on a server on the
open internet or in any environment where security might be an issue.

from __future__ import with_statement
import zlib
import logging
import SocketServer
import binascii

# Chunk size, in bytes, used when reading the input file, when slicing the
# flushed compressor output, and when reading from the client socket.
# Kept artificially small so that the buffering shows up in the log output.
BLOCK_SIZE = 64

class ZlibRequestHandler(SocketServer.BaseRequestHandler):
    """Reply to a request naming a file with that file's compressed contents.

    The client sends a filename; the handler reads the file in
    BLOCK_SIZE chunks, feeds each chunk to a zlib compressor and writes
    whatever compressed output is available back to the same socket.
    """

    logger = logging.getLogger('Server')

    def handle(self):
        """Handle one request: read a filename, stream the file compressed."""
        compressor = zlib.compressobj(1)

        # Find out what file the client wants
        # NOTE: the name arrives straight from the client and is opened
        # unchecked below -- acceptable for a demo only.
        filename = self.request.recv(1024)
        self.logger.debug('client asked for: "%s"', filename)

        # Send chunks of the file as they are compressed
        with open(filename, 'rb') as source:
            while True:
                block = source.read(BLOCK_SIZE)
                if not block:
                    break
                self.logger.debug('RAW "%s"', block)
                compressed = compressor.compress(block)
                if compressed:
                    self.logger.debug('SENDING "%s"',
                                      binascii.hexlify(compressed))
                    # sendall() keeps writing until every byte is sent;
                    # plain send() may transmit only part of the data.
                    self.request.sendall(compressed)
                else:
                    # The compressor accepted the input but has no
                    # output ready yet.
                    self.logger.debug('BUFFERING')

        # Send any data being buffered by the compressor
        remaining = compressor.flush()
        while remaining:
            to_send = remaining[:BLOCK_SIZE]
            remaining = remaining[BLOCK_SIZE:]
            self.logger.debug('FLUSHING "%s"', binascii.hexlify(to_send))
            self.request.sendall(to_send)
        return


if __name__ == '__main__':
    # Demo driver: start the server in a background thread, request
    # lorem.txt from it, decompress the reply incrementally and compare
    # the result with the file on disk.
    import socket
    import threading
    from cStringIO import StringIO

    logging.basicConfig(level=logging.DEBUG,
                        format='%(name)s: %(message)s',
                        )
    logger = logging.getLogger('Client')

    # Set up a server, running in a separate thread
    address = ('localhost', 0)  # let the kernel give us a port
    server = SocketServer.TCPServer(address, ZlibRequestHandler)
    ip, port = server.server_address  # find out what port we were given

    t = threading.Thread(target=server.serve_forever)
    t.setDaemon(True)
    t.start()

    # Connect to the server
    logger.info('Contacting server on %s:%s', ip, port)
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect((ip, port))

    # Ask for a file
    requested_file = 'lorem.txt'
    logger.debug('sending filename: "%s"', requested_file)
    # sendall() makes sure the complete filename is transmitted
    s.sendall(requested_file)

    # Receive a response
    received = StringIO()
    decompressor = zlib.decompressobj()
    while True:
        response = s.recv(BLOCK_SIZE)
        if not response:
            # An empty read means the server closed the connection
            break
        logger.debug('READ "%s"', binascii.hexlify(response))

        # Include any unconsumed data when feeding the decompressor.
        to_decompress = decompressor.unconsumed_tail + response
        while to_decompress:
            decompressed = decompressor.decompress(to_decompress)
            if decompressed:
                logger.debug('DECOMPRESSED "%s"', decompressed)
                received.write(decompressed)
                # Look for unconsumed data due to buffer overflow
                to_decompress = decompressor.unconsumed_tail
            else:
                # No output yet; wait for more input from the socket
                logger.debug('BUFFERING')
                to_decompress = None

    # deal with data remaining inside the decompressor buffer
    remainder = decompressor.flush()
    if remainder:
        logger.debug('FLUSHED "%s"', remainder)
        received.write(remainder)

    full_response = received.getvalue()
    # Close the comparison file promptly instead of leaking the handle
    with open('lorem.txt', 'rt') as expected_file:
        lorem = expected_file.read()
    logger.debug('response matches file contents: %s', full_response == lorem)

    # Clean up
    s.close()
    server.socket.close()

$ python zlib_server.py
Client: Contacting server on 127.0.0.1:51426
Client: sending filename: "lorem.txt"
Server: client asked for: "lorem.txt"
Server: RAW "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Donec
"
Server: SENDING "7801"
Server: RAW "egestas, enim et consectetuer ullamcorper, lectus ligula rutrum "
Client: READ "7801"
Client: BUFFERING
Server: BUFFERING
Server: RAW "leo, a
elementum elit tortor eu quam. Duis tincidunt nisi ut ant"
Server: BUFFERING
Server: RAW "e. Nulla
facilisi. Sed tristique eros eu libero. Pellentesque ve"
Server: BUFFERING
Server: RAW "l arcu. Vivamus
purus orci, iaculis ac, suscipit sit amet, pulvi"
Server: BUFFERING
Server: RAW "nar eu,
lacus. Praesent placerat tortor sed nisl. Nunc blandit d"
Server: BUFFERING
Server: RAW "iam egestas
dui. Pellentesque habitant morbi tristique senectus "
Server: BUFFERING
Server: RAW "et netus et
malesuada fames ac turpis egestas. Aliquam viverra f"
Server: BUFFERING
Server: RAW "ringilla
leo. Nulla feugiat augue eleifend nulla. Vivamus mauris"
Server: BUFFERING
Server: RAW ". Vivamus sed
mauris in nibh placerat egestas. Suspendisse poten"
Server: BUFFERING
Server: RAW "ti. Mauris massa. Ut
eget velit auctor tortor blandit sollicitud"
Server: BUFFERING
Server: RAW "in. Suspendisse imperdiet
justo.
"
Server: BUFFERING
Server: FLUSHING "5592418edb300c45f73e050f60f80e05ba6c8b0245bb676426c382923c22e9f3f70bc94c1ac00b9b963eff7fe4b73ea4921e9e95f66e7d906b105789954a6f2e"
Server: FLUSHING "25245206f1ae877ad17623318d8dbef62665919b78b0af244d2b49bc5e4a33aea58f43c64a06ad7432bda5318d8c819e267d255ec4a44a0b14a638451f784892"
Client: READ "5592418edb300c45f73e050f60f80e05ba6c8b0245bb676426c382923c22e9f3f70bc94c1ac00b9b963eff7fe4b73ea4921e9e95f66e7d906b105789954a6f2e"
Server: FLUSHING "de932b7aa53a85b6a27bb6a0a6ae94b0d94236fa31bb2c572e6aa86ff44b768aa11efa9e4232ba4f21d30b5e37fa2966e8243e7f9e62c4a3e4467ff4e49abe1c"
Server: FLUSHING "39e0b18fa22b299784247159c913d90f587be239d24e6d3c6dae8be1ac437db038e4e94041067f467198826d9b765ba18b71dba1b62b23f29de1b227dcbff87b"
Client: DECOMPRESSED "Lorem ipsum dolor sit amet, conse"
Server: FLUSHING "e38b065252ede3a2ffa5428f3b4d106f181022c652d9c49377a62b06387d53e4c0d43e3a6cf4c500052d4f3d650c1c1c18a84e7e18c403255d256f0aeb9cb709"
Client: READ "25245206f1ae877ad17623318d8dbef62665919b78b0af244d2b49bc5e4a33aea58f43c64a06ad7432bda5318d8c819e267d255ec4a44a0b14a638451f784892"
Client: DECOMPRESSED "ctetuer adipiscing elit. Donec
egestas, enim et consectetuer ullamcorper, lectus ligula rutrum leo, a
elementum elit tortor eu"
Server: FLUSHING "d044afd2607f72fe24459513909fdf480807b346da90f5f2f684f04888d9a41fd05277a1a3074821f2f7fbadcaeed0ff1d73a962ce666e6296b9098f85f8c0e6"
Client: READ "de932b7aa53a85b6a27bb6a0a6ae94b0d94236fa31bb2c572e6aa86ff44b768aa11efa9e4232ba4f21d30b5e37fa2966e8243e7f9e62c4a3e4467ff4e49abe1c"
Server: FLUSHING "dd4c8b46eeda5e45b562d776058dbfe9d1b7e51f6f370ea5"
Client: DECOMPRESSED " quam. Duis tincidunt nisi ut ante. Nulla
facilisi. Sed tristique eros eu libero. Pellentesque vel arcu. Vivamus
p"
Client: READ "39e0b18fa22b299784247159c913d90f587be239d24e6d3c6dae8be1ac437db038e4e94041067f467198826d9b765ba18b71dba1b62b23f29de1b227dcbff87b"
Client: DECOMPRESSED "urus orci, iaculis ac, suscipit sit amet, pulvinar eu,
lacus. Praesent placerat tortor sed nisl. Nunc blandit diam egestas
dui. Pellentesque "
Client: READ "e38b065252ede3a2ffa5428f3b4d106f181022c652d9c49377a62b06387d53e4c0d43e3a6cf4c500052d4f3d650c1c1c18a84e7e18c403255d256f0aeb9cb709"
Client: DECOMPRESSED "habitant morbi tristique senectus et netus et
malesuada fames ac turpis egestas. Aliquam viverra fringilla
leo. Nulla feugiat aug"
Client: READ "d044afd2607f72fe24459513909fdf480807b346da90f5f2f684f04888d9a41fd05277a1a3074821f2f7fbadcaeed0ff1d73a962ce666e6296b9098f85f8c0e6"
Client: DECOMPRESSED "ue eleifend nulla. Vivamus mauris. Vivamus sed
mauris in nibh placerat egestas. Suspendisse potenti. Mauris massa. Ut
eget velit auctor tortor blandit s"
Client: READ "dd4c8b46eeda5e45b562d776058dbfe9d1b7e51f6f370ea5"
Client: DECOMPRESSED "ollicitudin. Suspendisse imperdiet
justo.
"
Client: response matches file contents: True

Mixed Content Streams

The Decompress class returned by decompressobj() can also be used in situations where compressed and uncompressed data are mixed together. After decompressing all of the data, the unused_data attribute contains any data that was not used, that is, whatever followed the end of the compressed stream.

import zlib

lorem = open('lorem.txt', 'rt').read()
compressed = zlib.compress(lorem)
combined = compressed + lorem

decompressor = zlib.decompressobj()
decompressed = decompressor.decompress(combined)

print 'Decompressed matches lorem:', decompressed == lorem
print 'Unused data matches lorem :', decompressor.unused_data == lorem

$ python zlib_mixed.py
Decompressed matches lorem: True
Unused data matches lorem : True

Checksums

In addition to compression and decompression functions, zlib includes two functions for computing checksums of data, adler32() and crc32(). Neither checksum is billed as cryptographically secure, and they are intended only for data integrity verification.

Both functions take the same arguments, a string of data and an optional value to be used as a starting point for the checksum. They return a 32-bit signed integer value which can also be passed back on subsequent calls as a new starting point argument to produce a running checksum.

import zlib

data = open('lorem.txt', 'r').read()

cksum = zlib.adler32(data)
print 'Adler32: %12d' % cksum
print ' : %12d' % zlib.adler32(data, cksum)

cksum = zlib.crc32(data)
print 'CRC-32 : %12d' % cksum
print ' : %12d' % zlib.crc32(data, cksum)

$ python zlib_checksums.py
Adler32: 1865879205
: 118955337
CRC-32 : 1878123957
: -1940264325

The Adler32 algorithm is said to be faster than a standard CRC, but I found it to be slower in some tests.

import timeit

iterations = 1000000

def show_results(title, result, iterations):
"Print results in terms of microseconds per pass and per item."
per_pass = 1000000 * (result / iterations)
print '%s:\t%.2f usec/pass' % (title, per_pass)


# Each timer's setup loads ten copies of lorem.txt.  The "separate" timers
# compute a fresh checksum on every pass; the "running" timers pass the
# previous checksum back in as the starting value.

adler32 = timeit.Timer(
    "zlib.adler32(data)",
    "import zlib; data=open('lorem.txt','r').read() * 10",
)
adler32_elapsed = adler32.timeit(iterations)
show_results('Adler32, separate', adler32_elapsed, iterations)

adler32_running = timeit.Timer(
    "cksum = zlib.adler32(data, cksum)",
    "import zlib; data=open('lorem.txt','r').read() * 10; cksum = zlib.adler32(data)",
)
adler32_running_elapsed = adler32_running.timeit(iterations)
show_results('Adler32, running', adler32_running_elapsed, iterations)

crc32 = timeit.Timer(
    "zlib.crc32(data)",
    "import zlib; data=open('lorem.txt','r').read() * 10",
)
crc32_elapsed = crc32.timeit(iterations)
show_results('CRC-32, separate', crc32_elapsed, iterations)

crc32_running = timeit.Timer(
    "cksum = zlib.crc32(data, cksum)",
    "import zlib; data=open('lorem.txt','r').read() * 10; cksum = zlib.crc32(data)",
)
crc32_running_elapsed = crc32_running.timeit(iterations)
show_results('CRC-32, running', crc32_running_elapsed, iterations)

$ python zlib_checksum_tests.py
Adler32, separate: 30.67 usec/pass
Adler32, running: 30.43 usec/pass
CRC-32, separate: 22.94 usec/pass
CRC-32, running: 23.79 usec/pass

See also

zlib
The standard library documentation for this module.
gzip
The gzip module includes a higher level (file-based) interface to the zlib library.
http://www.zlib.net/
Home page for zlib library.
http://www.zlib.net/manual.html
Complete zlib documentation.
bz2
The bz2 module provides a similar interface to the bzip2 compression library.

PyMOTW Home


You might also be interested in:

News Topics

Recommended for You

Got a Question?