If you have a list of your data frames dfs:
dfs = [df1, df2, df3, ... , dfn]
you can join them using pandas' concat, which, as far as I can tell, is faster than a chain of merge calls. concat only joins data frames on the index (not on a column), but with a little preprocessing you can simulate a merge operation.
First, replace the index of each data frame in dfs with the column you want to merge on. Say you want to merge on column "A":
dfs = [df.set_index("A", drop=True) for df in dfs]
Here drop=True means that "A" does not remain as a regular column (it becomes the index instead), so the merge column will not be duplicated in the result.
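For instance, a minimal sketch (toy values, just to show the effect):
import pandas as pd
df = pd.DataFrame({"A": ["x", "y"], "B": [1, 2]})
df = df.set_index("A", drop=True)
print(df.columns.tolist())   # ['B']  -- "A" is gone as a column...
print(df.index.name)         # 'A'    -- ...and lives on as the index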
Now you can concat them, all in one go (!!):
merged = pd.concat(dfs, axis=1, keys=range(len(dfs)), join='outer', copy=False)
The join= argument takes either 'inner' or 'outer' (the default is 'outer'). Passing copy=False keeps concat from making unnecessary copies of the data.
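A quick sketch of the difference (two hypothetical frames that share only the key "y"):
import pandas as pd
left = pd.DataFrame({"B": [1, 2]}, index=pd.Index(["x", "y"], name="A"))
right = pd.DataFrame({"C": [3, 4]}, index=pd.Index(["y", "z"], name="A"))
# Union of the indexes; missing cells become NaN:
outer = pd.concat([left, right], axis=1, join='outer')   # rows x, y, z
# Intersection of the indexes only:
inner = pd.concat([left, right], axis=1, join='inner')   # row y only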
"A" , , :
merged.reset_index(drop=False, inplace=True)
The keys= argument is optional: it assigns a key to each successive data frame (here the integers 0 through len(dfs) - 1) and makes the result's columns a MultiIndex keyed by those values. This lets you recover which columns came from which original data frame. For example, to get the columns that came from dfs[20], you can call:
merged[20]
Without keys=, duplicate column names from different data frames would end up side by side in the result, and you could not tell which original frame each came from.
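A small sketch of the keys= behaviour (made-up frames that share a column name):
import pandas as pd
a = pd.DataFrame({"B": [1, 2]}, index=pd.Index(["x", "y"], name="A"))
b = pd.DataFrame({"B": [3, 4]}, index=pd.Index(["x", "y"], name="A"))
merged = pd.concat([a, b], axis=1, keys=range(2))
# The columns are now a MultiIndex, (0, 'B') and (1, 'B'), so the two
# "B" columns stay distinguishable:
print(merged[1])   # just the columns that came from b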
Finally, to check that concat really is faster than a chain of merge calls, here are some ipython %timeit results (on lists of 10, 100, and 1000 random data frames):
import pandas as pd
from functools import reduce  # needed for the merge chain under Python 3

def merge_with_concat(dfs, col):
    dfs = [df.set_index(col, drop=True) for df in dfs]
    merged = pd.concat(dfs, axis=1, keys=range(len(dfs)), join='outer', copy=False)
    return merged

dfs10 = [pd.util.testing.makeDataFrame() for i in range(10)]
dfs100 = [pd.util.testing.makeDataFrame() for i in range(100)]
dfs1000 = [pd.util.testing.makeDataFrame() for i in range(1000)]
%timeit reduce(lambda df1, df2: df1.merge(df2, on="A", how='outer'), dfs10)
10 loops, best of 3: 45.8 ms per loop
%timeit merge_with_concat(dfs10,"A")
100 loops, best of 3: 11.7 ms per loop
%timeit merge_with_concat(dfs100,"A")
10 loops, best of 3: 139 ms per loop
%timeit reduce(lambda df1, df2: df1.merge(df2, on="A", how='outer'), dfs100)
1 loop, best of 3: 1.55 s per loop
%timeit merge_with_concat(dfs1000,"A")
1 loop, best of 3: 9.67 s per loop
%timeit reduce(lambda df1, df2: df1.merge(df2, on="A", how='outer'), dfs1000)
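As a sanity check (my own toy sketch, not part of the timings above): without keys=, the concat route and the merge chain should produce the same table, up to row order.
from functools import reduce
import pandas as pd
dfs = [pd.DataFrame({"A": ["x", "y", "z"], f"col{i}": range(3)}) for i in range(3)]
via_merge = reduce(lambda d1, d2: d1.merge(d2, on="A", how='outer'), dfs)
via_concat = pd.concat(
    [df.set_index("A", drop=True) for df in dfs],
    axis=1, join='outer',
).reset_index()
# Same values once both are sorted by the merge column:
assert via_merge.sort_values("A").reset_index(drop=True).equals(
    via_concat.sort_values("A").reset_index(drop=True)
)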