Here is my proof of concept using the lzma library (the backport for Python 2), with the compression done in memory. Instead of an in-memory buffer, you can also use a file on disk:
import io
import random
import struct
import sys
from backports import lzma
# Build a run-heavy sample: 2000 random values, each repeated 0-500 times.
#
# The values are later serialised with struct format 'i' (a C int, 4 bytes on
# common platforms), so they are drawn from the signed 32-bit range.  Drawing
# from +/-sys.maxint instead makes struct.pack raise struct.error wherever
# sys.maxint is 2**63 - 1.
INT32_MIN = -2 ** 31
INT32_MAX = 2 ** 31 - 1

data = []
# range (not xrange) keeps the loop runnable on both Python 2 and Python 3.
for _ in range(2000):
    value = random.randint(INT32_MIN, INT32_MAX)
    run_length = random.randint(0, 500)
    data.extend([value] * run_length)
print('Uncompressed: {}'.format(len(data)))
# Compress the integers into an in-memory buffer.  Each value is written as
# one fixed-size record so a reader can pull them back out one at a time.
buff = io.BytesIO()
fmt = 'i'
pack_record = struct.Struct(fmt).pack  # compile the format once, not per item
# The with-block closes the writer even if packing raises, so the compressed
# stream is always finalised before buff is inspected below.
with lzma.LZMAFile(buff, 'wb') as lzma_writer:
    for value in data:
        lzma_writer.write(pack_record(value))
print('Compressed: {}'.format(len(buff.getvalue())))
# Size in bytes of one packed record; the reader consumes this much per value.
size_of = struct.Struct(fmt).size
# Rewind to the start of the compressed bytes, then open them for reading.
buff.seek(0)
lzma_reader = lzma.LZMAFile(buff, 'rb')
def generate(reader=None, record_fmt=None):
    """Yield the integers stored as consecutive fixed-size records in a stream.

    Args:
        reader: Binary file-like object to read from.  Defaults to the
            module-level ``lzma_reader``.
        record_fmt: ``struct`` format string of one record.  Defaults to the
            module-level ``fmt``.

    Yields:
        The first field of each unpacked record, in stream order.

    Raises:
        struct.error: If the stream ends part-way through a record.
    """
    if reader is None:
        reader = lzma_reader
    record = struct.Struct(fmt if record_fmt is None else record_fmt)
    # Read one record at a time so the whole payload is never held in memory;
    # an empty read marks the end of the stream.
    for chunk in iter(lambda: reader.read(record.size), b''):
        yield record.unpack(chunk)[0]
# Decompress everything and confirm the round trip reproduced the input.
res = list(generate())
# Call form matches the other print lines and is valid on Python 2 and 3.
print(res == data)
Result:
Uncompressed: 496225
Compressed: 11568
True
source
share