Saving a large list in python

Question

Saving a large list in python

I need to maintain a large list of python pickleable objects. The list is too large to be stored in RAM, so the \ paging database mechanism is required. I need the mechanism to support quick access for nearby (nearby) areas in the list.

All python list functions should be implemented in the list, but most of the time I will work sequentially: scan a certain range in the list and during the scan decide whether I want to insert \ pop some nodes at the scan point.

The list can be very large (2-3 GB) and should not be fully included in RAM immediately. Nodes are small (100-200 bytes), but may contain various types of data.

A good solution to this would be to use BTree, where only the last available buckets are loaded into RAM.

Using SQL tables is not very good, since I need to implement a complex index key mechanism. My data is not a table, its a simple python list, with the ability to add items to specific indexes and popping items from specific positions.

I tried ZODB and zc.blist , which implement a BTree-based list, which can be saved in the ZODB database file, but I don’t know how to configure it so that the above functions are executed in a reasonable amount of time. I do not need all multi-threaded \ transactional functions. No one else touches the database file except my single-threaded program.

- , ZODB\zc.blist, ?

- , :

import time
import random

NODE_JUMP = 50000
NODE_ACCESS = 10000

print 'STARTING'


random_bytes = open('/dev/urandom', 'rb')

my_list = list()

nodes_no = 0

while True:
    nodes_no += NODE_JUMP
    start = time.time()
    my_list.extend(random_bytes.read(100) for i in xrange(NODE_JUMP))
    print 'extending to %s nodes took %.2f seconds' % (nodes_no, time.time() - start)

    section_start = random.randint(0, nodes_no -NODE_ACCESS -1)
    start = time.time()
    for index in xrange(section_start, section_start + NODE_ACCESS):
        # rotate the string
        my_list[index] = my_list[index][1:] + my_list[index][0]

    print 'access to %s nodes took %.2f seconds' % (NODE_ACCESS, time.time() - start,)

:

extending to 5000000 nodes took 3.49 seconds
access to 10000 nodes took 0.02 seconds
extending to 5050000 nodes took 3.98 seconds
access to 10000 nodes took 0.01 seconds
extending to 5100000 nodes took 2.54 seconds
access to 10000 nodes took 0.01 seconds
extending to 5150000 nodes took 2.19 seconds
access to 10000 nodes took 0.11 seconds
extending to 5200000 nodes took 2.49 seconds
access to 10000 nodes took 0.01 seconds
extending to 5250000 nodes took 3.13 seconds
access to 10000 nodes took 0.05 seconds
Killed (not by me)

+3

python database zodb

Oren 24 . '10 17:59

3

, ZODB - . , .

, , , BTree .

import random
from collections import deque

import ZODB
from ZODB.FileStorage import FileStorage
from ZODB.DB import DB
import transaction
import persistent
import BTrees

def random_string(n=100):
    return ''.join([chr(random.randint(0,95)+32) for i in xrange(n)]) 


class Node(persistent.Persistent):
   def __init__(self, value=None):
       if not value:
           self.value =  random_string()

   def setNeighbors(self, refs):
       self.p1 = refs[0]
       self.p2 = refs[1]
       self.p3 = refs[2]
       self.p4 = refs[3]


def getTree():
    storage = FileStorage('c:\\test.zdb')
    db = DB(storage)
    connection = db.open()
    root = connection.root()
    if root.has_key('tree'):
        tree = root['tree']
    else:
        tree = BTrees.OOBTree.OOBTree()
        root['tree'] = tree
        transaction.commit()
    return tree


def fillDB():
    tree = getTree()

    # start with some initial objects.
    nodes = deque([Node(), Node(), Node(), Node()])
    node = Node()

    for n in xrange(20000):
        tree[n] = node           # store the node based on a numeric ID
        node.setNeighbors(nodes) # Make the node refer to more nodes.
        node = nodes.popleft()   # maintain out list of 4 upcoming nodes.
        nodes.append(Node())
        if n % 1000 == 0:
            transaction.commit() # Must commit for data to make it to disk.
            print n
    transaction.commit()
    return tree

tree . , tree.keys(min, max), API ZODB BTrees.

10 , root, ZODB. root "" ZODB.

ZODB , Btree. :

tree = getTree()

node1 = tree[1]
print node1.p1.p1.p1.p1.p1.p1.p1.p1.p1.value

0

Joe Koberg 24 . '10 20:07

? , , , . http://1978th.net/tokyocabinet/

0

John 24 . '10 21:54

Oren · Accepted Answer · 2010-03-27T00:21:36+0000

zc.blist , "cache_size" , . , "transaction.commit" . cache_size transaction.commit, blist , , RAM .

, , .

. "top" cache_size, , .

import time
import os
import glob
from ZODB import DB
from ZODB.FileStorage import FileStorage
import transaction
from zc.blist import BList

print('STARTING')

random = open('/dev/urandom', 'rb')


def test_list(my_list, loops = 1000, element_size = 100):
    print('testing list')
    start = time.time()
    for loop in xrange(loops):
        my_list.append(random.read(element_size))
    print('appending %s elements took %.4f seconds' % (loops, time.time() - start))

    start = time.time()
    length = len(my_list)
    print('length calculated in %.4f seconds' % (time.time() - start,))

    start = time.time()
    for loop in xrange(loops):
        my_list.insert(length / 2, random.read(element_size))
    print('inserting %s elements took %.4f seconds' % (loops, time.time() - start))

    start = time.time()
    for loop in xrange(loops):
        my_list[loop] = my_list[loop][1:] + my_list[loop][0]
    print('modifying %s elements took %.4f seconds' % (loops, time.time() - start))

    start = time.time()
    for loop in xrange(loops):
        del my_list[0]
    print('removing %s elements took %.4f seconds' % (loops, time.time() - start))

    start = time.time()
    transaction.commit()
    print('committing all above took %.4f seconds' % (time.time() - start,))

    del my_list[:loops]
    transaction.commit()

    start = time.time()
    pack()
    print('packing after removing %s elements took %.4f seconds' % (loops, time.time() - start))

for filename in glob.glob('database.db*'):    
    try:
        os.unlink(filename)
    except OSError:
        pass

db = DB(FileStorage('database.db'),
        cache_size = 2000)

def pack():
    db.pack()

root = db.open().root()

root['my_list'] = BList()

print('inserting initial data to blist')

for loop in xrange(10):
    root['my_list'].extend(random.read(100) for x in xrange(100000))
    transaction.commit()

transaction.commit()

test_list(root['my_list'])

Saving a large list in python

More articles: