Joining Pandas Dataframes for Generic Indexes

I have 3 DataFrames whose indexes have different numbers of levels, some of which they share. For example:

>>> df0=pd.DataFrame(index=pd.MultiIndex.from_product([[1,2,3,4],[2011,2012],['A','B']], names=['Season','Year','Location']))
>>> df0['Value0']=np.random.randint(1,100,len(df0))
>>> 
>>> df1=pd.DataFrame(index=pd.MultiIndex.from_product([[2011,2012],['A','B']], names=['Year','Location']))
>>> df1['Value1']=np.random.randint(1,100,len(df1))
>>> 
>>> df2=pd.DataFrame(index=['A','B'])
>>> df2.index.name='Location'
>>> df2['Value2']=np.random.randint(1,100,len(df2))
>>> df0
                      Value0
Season Year Location        
1      2011 A             18
            B             63
       2012 A             88
            B             30
2      2011 A             35
            B             60
       2012 A             61
            B              4
3      2011 A             70
            B              9
       2012 A             11
            B             38
4      2011 A             68
            B             57
       2012 A             13
            B             35
>>> df1
               Value1
Year Location        
2011 A             22
     B             74
2012 A             73
     B             44
>>> df2
          Value2
Location        
A             70
B             85
>>> 

I am looking for the best way to join them by their common indexes.

Things I tried:

1) pd.concat([df0,df1,df2],1) would be nice because it accepts a list of DataFrames, but it seems to work only if all the frames have the same number of index levels.

2) Joining either of the multi-index DataFrames with the single-index DataFrame works: df1.join(df2) or df0.join(df2). However, joining the DataFrame with two index levels to the DataFrame with three index levels does not work: df0.join(df1) gives me the following error: "NotImplementedError: merging with more than one level overlap on a multi-index is not implemented"

So far, the best solution I have found is to reset the indexes and use pd.merge(). See the function below:

def JoinMulti(DFList):
    """Join DataFrames on the index levels they have in common.

    Every frame is flattened with ``reset_index()`` and outer-merged into a
    running result.  The result is then indexed like the input frame that has
    the most index levels (the first such frame on a tie).

    Frames are matched only on index levels shared with the frames merged so
    far.  Value columns that happen to carry the same name in several frames
    are kept side by side (pandas appends its default ``_x`` / ``_y``
    suffixes) rather than being used as join keys, which would prevent rows
    from matching and lose data in the final reindex.

    Parameters
    ----------
    DFList : sequence of pandas.DataFrame
        Frames to combine.  Must be non-empty; index levels are expected to
        be named, since the result is re-indexed by level name.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the index of the frame with the most levels,
        holding the columns of all input frames.
    """
    merged = DFList[0].reset_index()
    # reset_index() inserts the former index levels as the leading columns.
    key_columns = list(merged.columns[:DFList[0].index.nlevels])

    for other in DFList[1:]:
        other_flat = other.reset_index()
        other_keys = list(other_flat.columns[:other.index.nlevels])
        shared_keys = [key for key in other_keys if key in key_columns]
        # With no shared index level, fall back to pandas' default key
        # selection (on=None) instead of passing an empty key list.
        merged = pd.merge(merged, other_flat, how='outer',
                          on=shared_keys or None)
        key_columns += [key for key in other_keys if key not in key_columns]

    # Re-index like the frame with the highest number of index levels.
    level_counts = [frame.index.nlevels for frame in DFList]
    target_index = DFList[level_counts.index(max(level_counts))].index
    return merged.set_index(list(target_index.names)).reindex(target_index)

>>> JoinMulti([df0,df1,df2])
                      Value0  Value1  Value2
Season Year Location                        
1      2011 A             43       5      96
            B             63      46      97
       2012 A             68       6      96
            B             23      99      97
2      2011 A             66       5      96
            B             30      46      97
       2012 A             45       6      96
            B             79      99      97
3      2011 A             66       5      96
            B             21      46      97
       2012 A             86       6      96
            B             11      99      97
4      2011 A             95       5      96
            B             58      46      97
       2012 A             32       6      96
            B             80      99      97
>>> 

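Is there a better way to do this - for example, a built-in pandas method that is simpler, more general, or faster?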

+4
1

An alternative solution that uses join directly:

def jez(df0,df1,df2):
    """Join three frames indexed by progressively fewer levels.

    ``df2`` is first joined onto ``df1``.  The ``Season`` level of ``df0`` is
    then moved into a column so that ``df0`` and the combined frame share the
    same index levels and can be joined directly; afterwards ``Season`` is
    restored as the outermost index level and the result is sorted.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Frame whose index has the levels ``Season``, ``Year``, ``Location``.
    df1 : pandas.DataFrame
        Frame that is joined with ``df2`` and then onto ``df0``.
    df2 : pandas.DataFrame
        Frame joined onto ``df1``.

    Returns
    -------
    pandas.DataFrame
        The joined frame, indexed by (``Season``, ``Year``, ``Location``) and
        sorted by that index.
    """
    lower_levels = df1.join(df2)
    # Take Season out of the index so both sides expose the same levels.
    without_season = df0.reset_index('Season')
    joined = without_season.join(lower_levels)
    restored = joined.set_index('Season', append=True)
    ordered = restored.reorder_levels(['Season', 'Year', 'Location'])
    # DataFrame.sortlevel() was removed in pandas 1.0; sort_index() performs
    # the same lexicographic sort over all index levels.
    return ordered.sort_index()

# Call form of print works on both Python 2 and Python 3.
print(jez(df0, df1, df2))

Timing:

In [41]: %timeit jez(df0,df1,df2)
The slowest run took 4.14 times longer than the fastest. This could mean that an intermediate result is being cached 
100 loops, best of 3: 5.02 ms per loop

In [42]: %timeit JoinMulti([df0,df1,df2])
100 loops, best of 3: 9.83 ms per loop
0

Source: https://habr.com/ru/post/1618473/


All Articles