Python 3.1 - Memory error while fetching a large list

The input list may contain more than 1 million numbers. When I run the following code with less "repeats", its fine:

def sample(x):
    length = 1000000 
    new_array = random.sample((list(x)),length)
    return (new_array)

def repeat_sample(x):    
    i = 0
    repeats = 100
    list_of_samples = []
    for i in range(repeats):
       list_of_samples.append(sample(x))
    return(list_of_samples)

repeat_sample(large_array)

However, the use of high repetitions, such as 100 above, results in MemoryError. Traceback is as follows:

Traceback (most recent call last):
  File "C:\Python31\rnd.py", line 221, in <module>
    STORED_REPEAT_SAMPLE = repeat_sample(STORED_ARRAY)
  File "C:\Python31\rnd.py", line 129, in repeat_sample
    list_of_samples.append(sample(x))
  File "C:\Python31\rnd.py", line 121, in sample
    new_array = random.sample((list(x)),length)
  File "C:\Python31\lib\random.py", line 309, in sample
    result = [None] * k
MemoryError

I guess I'm running out of memory. I do not know how to get around this problem.

Thank you for your time!

+3
source share
5 answers

Extension of my comment:

Let's say the processing you do with each sample calculates its average value.

def mean(samplelists):
    means = []
    n = float(len(samplelists[0]))
    for sample in samplelists:
        mean = sum(sample)/n
        means.append(mean)
    return means

calc_means(repeat_sample(large_array))

This will make you sweat, keeping all these lists in mind. You can make it a lot easier:

def mean(sample, n):
    n = float(n)
    mean = sum(sample)/n
    return mean

def sample(x):
    length = 1000000 
    new_array = random.sample(x, length)
    return new_array

def repeat_means(x):    
    repeats = 100
    list_of_means = []
    for i in range(repeats):
        list_of_means.append(mean(sample(x)))
    return list_of_means    

repeat_means(large_array)

. , - :

import random

def sampling_mean(population, k, times):
    # Part of this is lifted straight from random.py
    _int = int
    _random = random.random

    n = len(population)
    kf = float(k)
    result = []

    if not 0 <= k <= n:
        raise ValueError, "sample larger than population"

    for t in range(times):
        selected = set()
        sum_ = 0
        selected_add = selected.add

        for i in xrange(k):
            j = _int(_random() * n)
            while j in selected:
                j = _int(_random() * n)
            selected_add(j)
            sum_ += population[j]

        mean = sum_/kf
        result.append(mean)
    return result

sampling_mean(x, 1000000, 100)

, ?

+5

:

  • , , . MemoryError, , , 32- Python 2 .

  • . .

+4

random.sample() :

from random import random
from math import ceil as _ceil, log as _log

def xsample(population, k):
    """A generator version of random.sample"""
    n = len(population)
    if not 0 <= k <= n:
        raise ValueError("sample larger than population")
    _int = int
    setsize = 21        # size of a small set minus size of an empty list
    if k > 5:
        setsize += 4 ** _ceil(_log(k * 3, 4)) # table size for big sets
    if n <= setsize or hasattr(population, "keys"):
        # An n-length list is smaller than a k-length set, or this is a
        # mapping type so the other algorithm wouldn't work.
        pool = list(population)
        for i in range(k):         # invariant:  non-selected at [0,n-i)
            j = _int(random() * (n-i))
            yield pool[j]
            pool[j] = pool[n-i-1]   # move non-selected item into vacancy
    else:
        try:
            selected = set()
            selected_add = selected.add
            for i in range(k):
                j = _int(random() * n)
                while j in selected:
                    j = _int(random() * n)
                selected_add(j)
                yield population[j]
        except (TypeError, KeyError):   # handle (at least) sets
            if isinstance(population, list):
                raise
            for x in sample(tuple(population), k):
                yield x
+1

, , :

list_of_samples = [random.sample(x, length) for _ in range(repeats)]

, .

0

You can try to use the array object http://docs.python.org/py3k/library/array.html . It should be much more memory efficient than a list, but probably a little harder to use.

0
source

Source: https://habr.com/ru/post/1785751/


All Articles