The @steboc comment mentions using sqlite as a possible solution. You can use any database as a backend for this, but sqlite is fast enough and requires almost zero configuration. Here is an example of writing a large pile of data into sqlite and then reading it back in groups:
Start by importing a few packages and setting up the environment:
import pandas as pd
import numpy as np
import sqlite3
import string

conn = sqlite3.connect('example.db')
cur = conn.cursor()  # pandas only needs conn below; the cursor is optional

np.random.seed(123)
n = 1000000  # rows per dataframe
c = 10       # columns per dataframe
Now write 30 dataframes of 1 million rows and 10 columns each into sqlite. Writing all 30 takes about 15 minutes on a MBP:
%%time
for i in range(30):
    df = pd.DataFrame(np.random.randn(n, c), columns=list(map(chr, range(65, 65 + c))))
    df['key'] = string.ascii_letters[i]
    df.to_sql(name='test_table', if_exists='append', con=conn)
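As an optional addition (not part of the original timings), an index on key can speed up the per-key SELECTs used below; this assumes the table and column names from the code above:

# optional: index the key column so WHERE key = ... does not scan the whole table
conn.execute('CREATE INDEX IF NOT EXISTS idx_test_table_key ON test_table (key)')
conn.commit()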
Next, read back the distinct values of key; these define the groups we will read one at a time:
%%time
keys_df = pd.read_sql(sql='SELECT DISTINCT key FROM test_table', con=conn)
keys_df
Finally, iterate over keys_df, read each group with a separate query, and do something with it (here, describe()):
%%time
for row in keys_df.iterrows():
    tempdf = pd.read_sql(sql="SELECT * FROM test_table WHERE key='" + row[1][0] + "';", con=conn)
    print(tempdf.describe())
    print("")
This way only one group is in memory at a time. In any case, the sqlite + pandas combination works well for this kind of task.
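If individual groups are themselves too large, pandas can also stream any query in fixed-size chunks via the chunksize argument of read_sql; a sketch of that variant (the chunk size of 100000 is an arbitrary choice), followed by closing the connection:

# read the whole table in fixed-size chunks instead of by key
for chunk in pd.read_sql(sql='SELECT * FROM test_table', con=conn, chunksize=100000):
    print(chunk.shape)

conn.close()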