Plotting a large number of points using matplotlib and running out of memory

Question

Plotting a large number of points using matplotlib and running out of memory

I have a large (~6 GB) plain-text file in a simple format, one point per line:

x1 y1 z1
x2 y2 z2
...

Since I may need to load this data more than once, I created an np.memmap file for efficiency:

 X,Y,Z = np.memmap(f_np_mmap,dtype='float32',mode='r',shape=shape).T

What I'm trying to do is plot:

 plt.scatter(X, Y, color=custom_colorfunction(Z), alpha=.01, s=.001, marker='s', linewidth=0)

This works great for small datasets. However, with this larger dataset I run out of memory. I checked that it is plt.scatter that takes all the memory; I can step through X, Y, Z just fine. Is there a way to "rasterize" the canvas so that I don't run out of memory? I do not need to zoom or pan the image; it goes straight to disk. I understand that I could bin the data and plot that, but I'm not sure how to do it with a custom colormap and an alpha value.

+6

python matplotlib large-data

Hooked Nov 27 '13 at 18:52

source share

2 answers

Something like this (sorry for the long code, most of it is copied from the standard axes.Axes.draw):

 from operator import itemgetter

 import matplotlib.axes
 import matplotlib.image as mimage
 import matplotlib.transforms as mtransforms


 class generator_scatter_axes(matplotlib.axes.Axes):
     """Axes that draws the rows of ``_big_data`` one point at a time.

     ``draw`` first performs the regular axes drawing pass (copied from the
     stock ``Axes.draw``) and then, for every ``(x, y, z)`` row in
     ``self._big_data``, creates a single-point scatter artist, draws it
     with the renderer and removes it again, so no artist holding all of the
     points is ever kept on the axes.

     NOTE(review): the copied drawing code relies on private attributes
     (``_cachedRenderer``, ``_axisbelow``, ``_frameon``, ``_left_title``,
     ``_right_title``, ``_rasterization_zorder``) and on
     ``mimage.from_images``; confirm these exist in the matplotlib version
     you are running.
     """

     def __init__(self, *args, **kwargs):
         """Initialise the axes as usual and start with no streamed data."""
         matplotlib.axes.Axes.__init__(self, *args, **kwargs)
         # Iterable of (x, y, z) rows drawn point by point in `draw`;
         # None disables the extra drawing pass.
         self._big_data = None

     def draw(self, renderer=None, inframe=None):
         """Draw the regular artists, then stream ``_big_data`` point by point.

         Parameters
         ----------
         renderer : optional
             Renderer to draw with; falls back to ``self._cachedRenderer``.
         inframe : optional
             When true, the axis objects and titles are not drawn.

         Raises
         ------
         RuntimeError
             If no renderer is given and none has been cached.
         """
         # copied from original draw (so you can still add normal artists etc.)
         if renderer is None:
             renderer = self._cachedRenderer

         if renderer is None:
             raise RuntimeError('No renderer defined')
         if not self.get_visible():
             return
         renderer.open_group('axes')

         locator = self.get_axes_locator()
         if locator:
             pos = locator(self, renderer)
             self.apply_aspect(pos)
         else:
             self.apply_aspect()

         artists = []

         artists.extend(self.collections)
         artists.extend(self.patches)
         artists.extend(self.lines)
         artists.extend(self.texts)
         artists.extend(self.artists)
         if self.axison and not inframe:
             if self._axisbelow:
                 self.xaxis.set_zorder(0.5)
                 self.yaxis.set_zorder(0.5)
             else:
                 self.xaxis.set_zorder(2.5)
                 self.yaxis.set_zorder(2.5)
             artists.extend([self.xaxis, self.yaxis])
         if not inframe:
             artists.append(self.title)
             artists.append(self._left_title)
             artists.append(self._right_title)
         artists.extend(self.tables)
         if self.legend_ is not None:
             artists.append(self.legend_)

         # the frame draws the edges around the axes patch -- we
         # decouple these so the patch can be in the background and the
         # frame in the foreground.
         if self.axison and self._frameon:
             # .values() (rather than .itervalues()) works on Python 2 and 3.
             artists.extend(self.spines.values())

         if self.figure.canvas.is_saving():
             dsu = [(a.zorder, a) for a in artists]
         else:
             dsu = [(a.zorder, a) for a in artists
                    if not a.get_animated()]

         # add images to dsu if the backend supports compositing.
         # otherwise, do the manual compositing without adding images to dsu.
         if len(self.images) <= 1 or renderer.option_image_nocomposite():
             dsu.extend([(im.zorder, im) for im in self.images])
             _do_composite = False
         else:
             _do_composite = True

         dsu.sort(key=itemgetter(0))

         # rasterize artists with negative zorder
         # if the minimum zorder is negative, start rasterization
         rasterization_zorder = self._rasterization_zorder
         if (rasterization_zorder is not None and
                 len(dsu) > 0 and dsu[0][0] < rasterization_zorder):
             renderer.start_rasterizing()
             dsu_rasterized = [l for l in dsu if l[0] < rasterization_zorder]
             dsu = [l for l in dsu if l[0] >= rasterization_zorder]
         else:
             dsu_rasterized = []

         # the patch draws the background rectangle -- the frame below
         # will draw the edges
         if self.axison and self._frameon:
             self.patch.draw(renderer)

         if _do_composite:
             # make a composite image blending alpha
             # list of (mimage.Image, ox, oy)
             zorder_images = [(im.zorder, im) for im in self.images
                              if im.get_visible()]
             zorder_images.sort(key=lambda x: x[0])

             mag = renderer.get_image_magnification()
             ims = [(im.make_image(mag), 0, 0, im.get_alpha())
                    for z, im in zorder_images]

             l, b, r, t = self.bbox.extents
             width = mag * ((round(r) + 0.5) - (round(l) - 0.5))
             height = mag * ((round(t) + 0.5) - (round(b) - 0.5))
             im = mimage.from_images(height, width, ims)

             im.is_grayscale = False
             l, b, w, h = self.bbox.bounds
             # composite images need special args so they will not
             # respect z-order for now
             gc = renderer.new_gc()
             gc.set_clip_rectangle(self.bbox)
             gc.set_clip_path(mtransforms.TransformedPath(
                 self.patch.get_path(),
                 self.patch.get_transform()))

             renderer.draw_image(gc, round(l), round(b), im)
             gc.restore()

         if dsu_rasterized:
             for zorder, a in dsu_rasterized:
                 a.draw(renderer)
             renderer.stop_rasterizing()

         for zorder, a in dsu:
             a.draw(renderer)

         ############################
         # new bits
         ############################
         if self._big_data is not None:
             # The third column is unpacked but not used by this example.
             for x, y, _z in self._big_data:
                 # add the (single point) to the axes
                 a = self.scatter(x, y, color='r',
                                  alpha=1, s=10, marker='s', linewidth=0)
                 # draw the point; in Agg this will render + composite
                 a.draw(renderer)
                 # remove the artist from the axes so it is not kept around
                 a.remove()
                 # delete the artist for good measure
                 del a
         #######################
         # end new bits
         #######################

         # again, from original to clean up
         renderer.close_group('axes')
         self._cachedRenderer = renderer

use it like this:

 In [42]: fig = figure()

 In [43]: ax = generator_scatter_axes(fig, [.1, .1, .8, .8])

 In [44]: fig.add_axes(ax)
 Out[44]: <__main__.generator_scatter_axes at 0x56fe090>

 In [45]: ax._big_data = rand(500, 3)

 In [46]: draw()

I modified your scatter call so that the markers are visible with a small number of points. This will be very slow, because a new scatter object is set up for every point. I would either take reasonably sized chunks of your data and plot those, or replace the call to scatter with the underlying artist objects, or use Joe's suggestion and simply update a single artist.

+6

tacaswell Nov 27 '13 at 19:48

source share

Joe kington · Accepted Answer · 2013-11-27T20:36:59+0000

Sentence

@tcaswell's suggestion of overriding the Axes.draw method is by far the most flexible way to approach this.

However, you can use / abuse blitting to do this without subclassing Axes . Just use draw_artist every time without restoring the canvas.

There is one more trick: we need a special save method, since all the other save methods redraw the canvas before saving, which would wipe out everything we have drawn on it previously.

Also, as tcaswell notes, calling draw_artist for each element is pretty slow, so for a lot of points, you'll want to split your input. Chunking will give significant speedup, but this method will always be slower than drawing one PathCollection .

Anyway, any of these answers should ease your memory problems. Here is a simplified example.

 import numpy as np
 import matplotlib.pyplot as plt
 from matplotlib import _png

 try:
     # Python 2 only and unused below; kept so the name stays importable.
     from itertools import izip
 except ImportError:
     izip = zip


 def main():
     """Stream random points onto a figure chunk by chunk and save 'test.png'."""
     # We'll be saving the figure background, so let's make it transparent.
     fig, ax = plt.subplots(facecolor='none')
     # You'll have to know the extent of the input beforehand with this method.
     ax.axis([0, 10, 0, 10])

     # We need to draw the canvas before we start adding points.
     fig.canvas.draw()

     # This won't actually ever be drawn. We just need an artist to update.
     col = ax.scatter([5], [5], color=[0.1, 0.1, 0.1], alpha=0.3)

     for xy, color in datastream(int(1e6), chunksize=int(1e4)):
         # Reuse the single collection: swap in the chunk and draw it on top
         # of what is already on the canvas.
         col.set_offsets(xy)
         col.set_color(color)
         ax.draw_artist(col)

     save(fig, 'test.png')


 def datastream(n, chunksize=1):
     """Return a generator over "n" random xy positions and rgb colors.

     Each item is an ``(xy, color)`` pair of arrays with shapes
     ``(size, 2)`` and ``(size, 3)``. ``size`` equals ``chunksize`` except
     for the last item, which holds the remaining ``n % chunksize`` points
     when ``n`` is not a multiple of ``chunksize``, so exactly ``n`` points
     are produced in total.

     Raises ``ValueError`` if ``chunksize`` is smaller than 1.
     """
     if chunksize < 1:
         raise ValueError('chunksize must be at least 1')
     remaining = n
     while remaining > 0:
         size = min(chunksize, remaining)
         xy = 10 * np.random.random((size, 2))
         color = np.random.random((size, 3))
         yield xy, color
         remaining -= size


 def save(fig, filename):
     """Write the canvas buffer of `fig` to `filename` as a PNG.

     We have to work around `fig.canvas.print_png`, etc. calling `draw`.

     NOTE(review): `matplotlib._png` and `renderer._renderer` are private
     APIs; confirm they exist in the matplotlib version you are running.
     """
     renderer = fig.canvas.renderer
     # PNG data is binary, so the file must be opened in binary mode.
     with open(filename, 'wb') as outfile:
         _png.write_png(renderer._renderer.buffer_rgba(),
                        renderer.width, renderer.height,
                        outfile, fig.dpi)


 if __name__ == '__main__':
     main()

In addition, you may notice that the top and left spines are drawn over. You can work around this by re-drawing those two spines ( ax.draw_artist(ax.spines['top']) , etc.) before saving.

Plotting a large number of points using matplotlib and running out of memory

More articles: