Many dictionaries use a huge amount of RAM

I have a very simple Python script that creates (for testing purposes) 35 million dictionary objects in a list. Each dictionary contains two key/value pairs, e.g.:

{'Name': 'Jordan', 'Age': 35}

The script simply takes a query for the name and a minimum age, searches the list of dictionaries, and returns a new list containing the indices of all the matching entries in that list.

However, as you can see below, an insane amount of memory is consumed. I guess I'm making a naive mistake somewhere.

[Screenshot of the code and Task Manager showing the RAM usage]

My code is as follows: (also can be viewed on the image if it is more readable).

import sys

# First, we create 35 million records in memory; all are identical apart from a small range of 78 records

def search(key, value, data, age):
    """Return the indices of records matching a key/value pair and a minimum age.

    Args:
        key: Dictionary key whose value must equal ``value``.
        value: Value a record must hold under ``key`` to match.
        data: Sequence of dictionaries to scan.
        age: Inclusive lower bound for the record's 'Age' entry.

    Returns:
        A list with the positions in ``data`` of every matching record, in
        ascending order. Records lacking ``key`` or 'Age' are skipped.
    """
    print("Searching, please wait")
    # enumerate() yields each record once, avoiding repeated data[index]
    # lookups; the conditions keep the original short-circuit order.
    return [
        index
        for index, record in enumerate(data)
        if key in record
        and 'Age' in record
        and record[key] == value
        and record['Age'] >= age
    ]

def createdata(count=35000000, special_first=24500123, special_last=24500200):
    """Build the test data set: a list of small record dictionaries.

    Every record is ``{'Name': 'Jordan', 'Age': 25}`` except those whose
    index lies in ``special_first..special_last`` (inclusive), which are
    ``{'Name': 'Chris', 'Age': 33}``.

    Args:
        count: Number of records to create. Defaults to 35 million.
        special_first: First index of the differing records.
        special_last: Last index (inclusive) of the differing records.

    Returns:
        A list of ``count`` distinct dictionary objects.
    """
    print("Creating database, please wait")
    records = []
    for index in range(count):
        # A fresh dict per record keeps the entries independent objects,
        # which is what the memory experiment is measuring.
        if special_first <= index <= special_last:
            record = {'Name': 'Chris', 'Age': 33}
        else:
            record = {'Name': 'Jordan', 'Age': 25}
        records.append(record)
    return records

# Build the in-memory data set once, before prompting for the query.
datareturned = createdata()

keyname = input("For which key do you wish to search?")
valuename = input("Which values do you want to find?")
valueage = input("What is the minimum age?")

# NOTE: sys.getsizeof is shallow -- it reports the size of the list object
# itself, not of the dictionaries the list refers to.
print("Full data set object size:" + str(sys.getsizeof(datareturned)))
results = search(keyname, valuename, datareturned, int(valueage))

if results:
    print(str(len(results)) + " found. Writing to results.txt")
    # The context manager closes the file even if a write raises.
    with open("results.txt", "w") as fo:
        for pk in results:
            fo.write(str(pk) + "\n")

What causes mass consumption of RAM?

+4
3

dict objects in Python are relatively large. For example, on Python 3.5 64bit:

In [21]: sys.getsizeof({})
Out[21]: 288

, :

250*36e6*1e-9 == 9.0

So that is roughly 9 GB as a lower bound for that many dictionaries, not even counting the list!

Instead of using a dict as a record type, consider using a namedtuple.

, , :

In [23]: Record = namedtuple("Record", "name age")

In [24]: records = [Record("john", 28) for _ in range(36000000)]

In [25]: getsizeof = sys.getsizeof

:

In [31]: sum(getsizeof(record)+ getsizeof(record.name) + getsizeof(record.age)  for record in records)
Out[31]: 5220000000

In [32]: _ + getsizeof(records)
Out[32]: 5517842208

In [33]: _ * 1e-9
Out[33]: 5.517842208

, 5 - , . , , int , . python 2,7 ( top).

, , , , - , 10, , int, , int- int , 8- !

In [35]: sum(getsizeof("0123456789") + 8  for record in records)
Out[35]: 2412000000

In [36]: _ + getsizeof(records)
Out[36]: 2709842208

In [37]: _ * 1e-9
Out[37]: 2.709842208

, top.

, ram, Python. array struct, C- . , , numpy , . :

In [1]: import numpy as np

In [2]: recordtype = np.dtype([('name', 'S20'),('age', np.uint8)])

In [3]: records = np.empty((36000000), dtype=recordtype)

In [4]: records.nbytes
Out[4]: 756000000

In [5]: records.nbytes*1e-9
Out[5]: 0.756

, . 8- (, ) . : , . 'S20', 20 . ASCII, 20 ascii.

numpy , C- . , , . , 50 10.

In [8]: for i in range(1, 36000000+1):
   ...:     records['name'][i - 1] = b"%08d" % i
   ...:

In [9]: import random
   ...: for i in range(36000000):
   ...:     records['age'][i] = max(0, int(random.normalvariate(50, 10)))
   ...:

numpy records. , , , np.where:

In [10]: np.where(records['age'] > 70)
Out[10]: (array([      58,      146,      192, ..., 35999635, 35999768, 35999927]),)

In [11]: idx = np.where(records['age'] > 70)[0]

In [12]: len(idx)
Out[12]: 643403

643403 > 70. 100:

In [13]: idx = np.where(records['age'] > 100)[0]

In [14]: len(idx)
Out[14]: 9

In [15]: idx
Out[15]:
array([ 2315458,  5088296,  5161049,  7079762, 15574072, 17995993,
       25665975, 26724665, 28322943])

In [16]: records[idx]
Out[16]:
array([(b'02315459', 101), (b'05088297', 102), (b'05161050', 101),
       (b'07079763', 104), (b'15574073', 101), (b'17995994', 102),
       (b'25665976', 101), (b'26724666', 102), (b'28322944', 101)],
      dtype=[('name', 'S20'), ('age', 'u1')])

, , numpy . . numpy.array - , , . , Python sqlite.

+13

>>> import sys 
>>> sys.getsizeof({'Name': 'Jordan', 'Age': 25}) * 35000000
10080000000

That is roughly 10 GB (10,080,000,000 bytes) for the dictionaries alone.

. this

+2

... 35 . /. . {'Name': 'Jordan', 'Age': 35}

, .

, . .

Use __slots__ (a class with a fixed set of attributes):

class Person(object):
    """Record with a fixed set of attributes and no per-instance ``__dict__``."""

    __slots__ = ['Name', 'Age']

    def __init__(self, name, age):
        # Without an initializer, Person('Jordan', 35) raises TypeError.
        self.Name = name
        self.Age = age

s = [Person('Jordan', 35), Person('Martin', 31), Person('Mary', 33)]

, :

s_name = ['Jordan', 'Martin', 'Mary']
s_age = [35, 31, 33]

In addition, consider interning the strings:

s_name = map(intern, s_name)

Python 3:

# Intern the names so that equal strings share a single object.
s_name = list(map(sys.intern, s_name))
0

Source: https://habr.com/ru/post/1675482/


All Articles