Getting the three lowest values in a row and returning the names of the corresponding columns

I have two DataFrames, df and df2, with the same index and columns. Based on the first DataFrame, df, I want to find the 3 smallest values in each row and return the corresponding column names (in this case "X", "Y", "Z" or "T"). The result should be a new DataFrame, df3.

df = pd.DataFrame({
        'X': [21, 2, 43, 44, 56, 67, 7, 38, 29, 130],
        'Y': [101, 220, 330, 140, 250, 10, 207, 320, 420, 50],
        'Z': [20, 128, 136, 144, 312, 10, 82, 63, 42, 12],
        'T': [2, 32, 4, 424, 256, 167, 27, 38, 229, 30]
    }, index=list('ABCDEFGHIJ'))

df2 = pd.DataFrame({
        'X': [0.5, 0.12,0.43, 0.424, 0.65,0.867,0.17,0.938,0.229,0.113],
        'Y': [0.1,2.201,0.33,0.140,0.525,0.31,0.20,0.32,0.420,0.650],
        'Z': [0.20,0.128,0.136,0.2144,0.5312,0.61,0.82,0.363,0.542,0.512],
        'T':[0.52, 0.232,0.34, 0.6424, 0.6256,0.3167,0.527,0.38,0.4229,0.73]
    },index=list('ABCDEFGHIJ'))

Also, I want another DataFrame, df4, holding the values from df2 that correspond to df3. For example, in row 'A' of df the three smallest values are (2, 20, 21), so in row 'A' of df4 I want to get (0.52, 0.2, 0.5) from df2.

Thanks.

+4
3

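Convert the DataFrames to NumPy arrays and use argsort to get, for each row, the column positions of the three smallest values: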

arr = df.values.argsort(1)[:,:3]
print (arr)
[[0 3 1]
 [1 0 3]
 [0 1 3]
 [1 2 3]
 [1 2 0]
 [2 3 1]
 [1 0 3]
 [0 1 3]
 [1 3 0]
 [3 0 2]]

#get values by indices in arr 
b = df2.values[np.arange(len(arr))[:,None], arr]
print (b)
[[ 0.52    0.2     0.5   ]
 [ 0.12    0.232   0.128 ]
 [ 0.34    0.43    0.136 ]
 [ 0.424   0.14    0.2144]
 [ 0.65    0.525   0.6256]
 [ 0.31    0.61    0.867 ]
 [ 0.17    0.527   0.82  ]
 [ 0.38    0.938   0.363 ]
 [ 0.229   0.542   0.4229]
 [ 0.512   0.73    0.65  ]]

Then build the DataFrames:

# Index the plain ndarray of column labels rather than the pandas Index itself:
# newer pandas versions reject 2-D (multi-dimensional) indexing on an Index.
df3 = pd.DataFrame(df.columns.values[arr])
# Rename the default integer column labels 0, 1, 2 to Col1, Col2, Col3.
df3.columns = ['Col{}'.format(x+1) for x in df3.columns]
print (df3)
  Col1 Col2 Col3
0    T    Z    X
1    X    T    Z
2    T    X    Z
3    X    Y    Z
4    X    Y    T
5    Y    Z    X
6    X    T    Z
7    T    X    Z
8    X    Z    T
9    Z    T    Y

df4 = pd.DataFrame(b)
df4.columns = ['Col{}'.format(x+1) for x in df4.columns]
print (df4)
    Col1   Col2    Col3
0  0.520  0.200  0.5000
1  0.120  0.232  0.1280
2  0.340  0.430  0.1360
3  0.424  0.140  0.2144
4  0.650  0.525  0.6256
5  0.310  0.610  0.8670
6  0.170  0.527  0.8200
7  0.380  0.938  0.3630
8  0.229  0.542  0.4229
9  0.512  0.730  0.6500

Timings:

# Fix the seed so the random test frames are reproducible.
np.random.seed(14)
# Number of rows in the random test frames.
N = 1000000
# Integer test frame: N rows, 4 columns, values drawn from [0, 100).
df1 = pd.DataFrame(np.random.randint(100, size=(N, 4)), columns=['X','Y','Z','T'])
#print (df1)

# Float test frame of the same shape; this rebinding replaces the integer
# frame created above, so only one of the two assignments is in effect.
# NOTE(review): the timed functions below read `df` and `df2`, not `df1` —
# confirm which frame the reported timings were measured on.
df1 = pd.DataFrame(np.random.rand(N, 4), columns=['X','Y','Z','T'])
#print (df1)


def jez():
    """Find the three smallest values per row of ``df`` using a full argsort.

    Reads the module-level frames ``df`` and ``df2``.

    Returns:
        tuple: ``(df3, df4)`` where ``df3`` holds the column labels of the
        three smallest values in each row of ``df`` (ascending), and ``df4``
        holds the values of ``df2`` at those same positions. Both frames have
        columns ``Col1``..``Col3`` and a default integer index.
    """
    # Column positions of each row's values in ascending order; keep the first three.
    arr = df.values.argsort(1)[:, :3]
    # Pair each row number with its three column positions to pick the df2 values.
    b = df2.values[np.arange(len(arr))[:, None], arr]
    # Index the ndarray of labels, not the Index: newer pandas versions reject
    # 2-D indexing on an Index object.
    df3 = pd.DataFrame(df.columns.values[arr])
    df3.columns = ['Col{}'.format(x + 1) for x in df3.columns]
    df4 = pd.DataFrame(b)
    df4.columns = ['Col{}'.format(x + 1) for x in df4.columns]
    # Return the results so that print(jez()) shows them instead of None.
    return df3, df4


def pir():
    """Find the three smallest values per row of ``df`` using argpartition.

    Reads the module-level frames ``df`` and ``df2``.

    Returns:
        tuple: ``(df3, df4)``, both indexed like ``df``. ``df3`` holds the
        column labels of the three smallest values in each row of ``df``;
        ``df4`` (columns ``Col1``..``Col3``) holds the values of ``df2`` at
        the same positions. Within a row the three entries are not
        guaranteed to be in ascending order, because argpartition only
        separates the smallest entries from the rest.
    """
    v = df.values
    # Partition each row around position 3: the first three positions then
    # hold the indices of the three smallest values (in arbitrary order).
    a = v.argpartition(3, 1)[:, :3]
    c = df.columns.values[a]
    df3 = pd.DataFrame(c, df.index)
    # Pair each row number with its three column positions to pick the df2 values.
    d = df2.values[np.arange(len(df))[:, None], a]
    df4 = pd.DataFrame(d, df.index, [1, 2, 3]).add_prefix('Col')
    # Return the results so that print(pir()) shows them instead of None.
    return df3, df4

def cᴏʟᴅsᴘᴇᴇᴅ():
    """Find the three smallest values per row of ``df`` with a row-wise apply.

    Reads the module-level frames ``df`` and ``df2``.

    Returns:
        tuple: ``(df3, df4)``. ``df3`` holds the first three column labels of
        each row of ``df`` after sorting that row's values; ``df4`` (columns
        ``Col1``..``Col3``, indexed like ``df``) holds the ``df2`` values
        looked up by row label and by the labels stored in ``df3``.
    """
    # Original author's note: "another solution is wrong" (not explained here).
    # Sort each row's values and keep the labels of the first three columns.
    df3 = df.apply(lambda x: df.columns[np.argsort(x)], 1).iloc[:, :3]
    # NOTE(review): DataFrame.lookup is deprecated since pandas 1.2 and removed
    # in 2.0 — confirm the pandas version this benchmark targets.
    df4 = pd.DataFrame({'Col{}'.format(i + 1) : df2.lookup(df3.index, df3.iloc[:, i]) for i in range(df3.shape[1])}, index=df.index)
    # Return the results so that print(cᴏʟᴅsᴘᴇᴇᴅ()) shows them instead of None.
    return df3, df4


print (jez())
print (pir())
print (cᴏʟᴅsᴘᴇᴇᴅ())

In [176]: %timeit (jez())
1000 loops, best of 3: 412 µs per loop

In [177]: %timeit (pir())
1000 loops, best of 3: 425 µs per loop

In [178]: %timeit (cᴏʟᴅsᴘᴇᴇᴅ())
100 loops, best of 3: 3.99 ms per loop
+2

Apply np.argsort to each row:

df3 = df.apply(lambda x: df.columns[np.argsort(x)], 1).iloc[:, :3]
print(df3)

A  T  Z  X
B  X  T  Z
C  T  X  Z
D  X  Y  Z
E  X  Y  T
F  Y  Z  X
G  X  T  Z
H  T  X  Z
I  X  Z  T
J  Z  T  Y

Alternatively, build df3 by indexing df.columns with the argsort result (as in jezrael's answer):

# Keep the first three sorted positions explicitly (rather than "all but the
# last column", which only equals three when df has exactly four columns), and
# index the ndarray of labels because newer pandas rejects 2-D indexing on an Index.
df3 = pd.DataFrame(df.columns.values[df.values.argsort(1)[:, :3]], index=df.index)
df3

   0  1  2
A  T  Z  X
B  X  T  Z
C  T  X  Z
D  X  Y  Z
E  X  Y  T
F  Y  Z  X
G  X  T  Z
H  T  X  Z
I  X  Z  T
J  Z  T  Y

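Given df3, build df4 from df2 with DataFrame.lookup (note: DataFrame.lookup was deprecated in pandas 1.2 and removed in pandas 2.0):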

df4 = pd.DataFrame({'Col{}'.format(i + 1) : df2.lookup(df3.index, df3.iloc[:, i])\
                                        for i in range(df3.shape[1])}, index=df.index)
print(df4)

    Col1   Col2    Col3
A  0.520  0.200  0.5000
B  0.120  0.232  0.1280
C  0.340  0.430  0.1360
D  0.424  0.140  0.2144
E  0.650  0.525  0.6256
F  0.310  0.610  0.8670
G  0.170  0.527  0.8200
H  0.380  0.938  0.3630
I  0.229  0.542  0.4229
J  0.512  0.730  0.6500

+2

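I would use numpy.argpartition, as it only splits each row into the bottom k values and the rest. Its time complexity is O(n) rather than O(n log n), because it does not need to sort completely. Note that the bottom k values are therefore not necessarily returned in sorted order (see row D below).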

v = df.values
m = v.shape[1]

a = v.argpartition(3, 1)[:, :3]

c = df.columns.values[a]

We can build df3 from this.

df3 = pd.DataFrame(c, df.index)

df3

   0  1  2
A  T  Z  X
B  X  T  Z
C  T  X  Z
D  Y  X  Z
E  Y  X  T
F  Y  Z  X
G  X  T  Z
H  X  T  Z
I  X  Z  T
J  Z  T  Y

You can use the same indices to create df4:

d = df2.values[np.arange(len(df))[:, None], a]
df4 = pd.DataFrame(d, df.index, [1, 2, 3]).add_prefix('Col')
df4

    Col1   Col2    Col3
A  0.520  0.200  0.5000
B  0.120  0.232  0.1280
C  0.340  0.430  0.1360
D  0.140  0.424  0.2144
E  0.525  0.650  0.6256
F  0.310  0.610  0.8670
G  0.170  0.527  0.8200
H  0.938  0.380  0.3630
I  0.229  0.542  0.4229
J  0.512  0.730  0.6500
+2
source

Source: https://habr.com/ru/post/1685063/


All Articles