What is the fastest way to ensure that a particular column is the last (or first) in a data frame

Question

What is the fastest way to ensure that a particular column is the last (or first) in a data frame

the df

df = pd.DataFrame(np.arange(8).reshape(2, 4), columns=list('abcd'))

Suppose I need the column 'b' to be at the end. I could do:

df[['a', 'c', 'd', 'b']]

But what is the most efficient way to ensure that a given column is at the end?

This is what I went with. What would others do?

def put_me_last(df, column):
    """Return a new frame in which ``column`` is the right-most column.

    The column is dropped from ``df`` and then concatenated back on as the
    final column; ``df`` itself is not modified.
    """
    others = df.drop(column, axis=1)
    moved = df[column]
    return pd.concat([others, moved], axis=1)

put_me_last(df, 'b')

Timing Results

Conclusion: the winner is mfripp. It seems reindex_axis gives a large efficiency gain over []. This is really good information.

the code

# ``string.lowercase`` only exists on Python 2; ``ascii_lowercase`` is
# available on both Python 2 and Python 3.
from string import ascii_lowercase

# Small fixture: 2 rows x 4 single-level columns 'a'..'d'.
df_small = pd.DataFrame(np.arange(8).reshape(2, 4), columns=list('abcd'))

# Large fixture: 10000 rows x 100 columns under a two-level MultiIndex
# (the 25 letters 'a'..'y' crossed with four sub-labels).
df_large = pd.DataFrame(
    np.arange(1000000).reshape(10000, 100),
    columns=pd.MultiIndex.from_product(
        [list(ascii_lowercase[:-1]), ['One', 'Two', 'Three', 'Four']]
    ),
)


def pir1(df, column):
    """Move ``column`` to the end by dropping it and concatenating it back.

    There is no short-circuit: the drop/concat is performed even when
    ``column`` is already the last column.
    """
    pieces = [df.drop(column, axis=1), df[column]]
    return pd.concat(pieces, axis=1)

def pir2(df, column):
    """Rotate the columns of ``df`` so that ``column`` ends up last.

    This is a cyclic rotation (``np.roll``), not a single-column move: the
    columns that followed ``column`` wrap around to the front, so the
    relative order of the other columns is only preserved cyclically.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are reordered. Column labels are assumed to be
        unique, so that the label maps to a single integer position.
    column : label
        The column that must become the last one.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``column`` is already last, otherwise a new frame
        with the rotated column order.

    Raises
    ------
    KeyError
        If ``column`` is not one of ``df``'s columns.
    """
    if df.columns[-1] == column:
        return df
    # Look up the position of the *requested* column rather than a fixed
    # label, so the function works for any column name.
    pos = df.columns.get_loc(column)
    # Shifting right by this amount moves position ``pos`` to the last slot.
    shift = len(df.columns) - 1 - pos
    return df[np.roll(df.columns, shift)]

def pir3(df, column):
    """Move ``column`` to the end, keeping the other columns in order.

    The new column order is assembled from three slices of the label array:
    everything before ``column``, everything after it, and ``column`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are reordered. Column labels are assumed to be
        unique, so that the label maps to a single integer position.
    column : label
        The column that must become the last one.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``column`` is already last, otherwise a new frame
        with ``column`` moved to the end.

    Raises
    ------
    KeyError
        If ``column`` is not one of ``df``'s columns.
    """
    if df.columns[-1] == column:
        return df
    cols = df.columns.values
    # Locate the *requested* column rather than a fixed label.
    pos = df.columns.get_loc(column)
    # ``cols[[pos]]`` keeps a 1-element array so it can be concatenated.
    new_order = np.concatenate([cols[:pos], cols[pos + 1:], cols[[pos]]])
    return df[new_order]

def pir4(df, column):
    """Move ``column`` to the end via a front-insert followed by a roll.

    The label is removed from the column index, re-inserted at position 0,
    and the resulting labels are rolled left by one so the label lands in
    the last slot. ``df`` is returned unchanged when ``column`` is already
    last.
    """
    if df.columns[-1] != column:
        fronted = df.columns.drop(column).insert(0, column)
        # Rolling by -1 sends the first label to the end.
        return df[np.roll(fronted, -1)]
    return df

def carsten1(df, column):
    """Drop-and-concat ``column`` to the end, skipping the work if possible.

    Returns ``df`` itself when ``column`` is already the last column;
    otherwise returns a new frame built with ``pd.concat``.
    """
    already_last = list(df)[-1] == column
    if already_last:
        return df
    without = df.drop(column, axis=1)
    return pd.concat([without, df[column]], axis=1)

def carsten2(df, column):
    """Move ``column`` to the end by selecting a reordered label list.

    Returns ``df`` itself when ``column`` is already last. Otherwise the
    first occurrence of ``column`` in the label list is moved to the end and
    the frame is indexed with the new list.
    """
    labels = list(df)
    if labels[-1] != column:
        # pop the label from its current slot and re-append it at the end
        labels.append(labels.pop(labels.index(column)))
        return df[labels]
    return df

def mfripp1(df, column):
    """Select the columns of ``df`` with ``column`` placed last.

    Every label different from ``column`` is kept in its original order and
    ``column`` is appended once at the end.
    """
    new_cols = []
    for label in df.columns:
        if label != column:
            new_cols.append(label)
    new_cols.append(column)
    return df[new_cols]

def mfripp2(df, column):
    """Reindex the columns of ``df`` so that ``column`` comes last.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are reordered.
    column : label
        The column that must become the last one.

    Returns
    -------
    pandas.DataFrame
        A frame with every other column in its original order followed by
        ``column``.
    """
    new_cols = [c for c in df.columns if c != column] + [column]
    # ``DataFrame.reindex_axis`` no longer exists in current pandas;
    # ``reindex(columns=...)`` is the equivalent call. ``copy=False`` is kept
    # so that no copy is forced when the order is already correct.
    return df.reindex(columns=new_cols, copy=False)

def ptrj1(df, column):
    """Reindex ``df`` with ``column`` appended after all other columns.

    The target column index is built from the existing one: ``column`` is
    dropped and then appended back as a one-element ``pd.Index``.
    """
    remaining = df.columns.drop(column)
    target = remaining.append(pd.Index([column]))
    return df.reindex(columns=target)

def shivsn1(df, column):
    """Select the columns of ``df`` with ``column`` moved to the end.

    The first occurrence of ``column`` is removed from a copy of the label
    list (``list.remove`` raises ``ValueError`` if it is absent) and the
    label is then placed after the remaining ones.
    """
    names = list(df)
    names.remove(column)
    return df[names + [column]]

def merlin1(df, column):
    """Move ``column`` to the end using ``Index.drop`` and ``Index.insert``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are reordered.
    column : label
        The column that must become the last one.

    Returns
    -------
    pandas.DataFrame
        A frame with every other column in its original order followed by
        ``column``.
    """
    # Operate on the requested label instead of a fixed one.
    remaining = df.columns.drop([column])
    # Inserting at ``len(remaining)`` appends; ``insert(-1, ...)`` would put
    # the label *before* the last remaining column.
    return df[remaining.insert(len(remaining), column)]


# Implementations included in the benchmark.
list_of_funcs = [pir1, pir2, pir3, pir4, carsten1, carsten2, mfripp1, mfripp2, ptrj1, shivsn1]


def test_pml(df, pml):
    """Call ``pml(df, label)`` once for every column label of ``df``.

    The return values are discarded; this only exists to exercise ``pml``
    against each possible target column.
    """
    for label in df.columns:
        pml(df, label)


# One row per implementation, one column per fixture size; created with no
# data and filled in by the timing loop below.
summary = pd.DataFrame(
    [],
    index=[f.__name__ for f in list_of_funcs],
    columns=['Small', 'Large'],
)

for f in list_of_funcs:
    # Fewer repetitions on the large fixture, where each run is slower.
    summary.at[f.__name__, 'Small'] = timeit(lambda: test_pml(df_small, f), number=100)
    summary.at[f.__name__, 'Large'] = timeit(lambda: test_pml(df_large, f), number=10)

+2

python pandas

piRSquared Jul 27 '16 at 0:18

source share

5 answers

, (, , ) , , DataSet. , , , df . :

def put_me_last2(df, column):
    """Move ``column`` to the end unless it is already the last column.

    When ``column`` is already last, ``df`` is returned as-is and the
    drop/concat is skipped.
    """
    if list(df)[-1] != column:
        trimmed = df.drop(column, axis=1)
        return pd.concat([trimmed, df[column]], axis=1)
    return df

8 8 , , , b , 300 (500us 150 ), , d (.. ).

, , , , .

Update:

: , df[cols] . 40% (90 150 8 ).

def put_me_last3(df, column):
    """Move ``column`` to the end by indexing with a reordered label list.

    Returns ``df`` itself when ``column`` is already last; otherwise the
    first occurrence of ``column`` is taken out of the label list, appended
    at the end, and the frame is selected with that list.
    """
    order = list(df)
    if order[-1] == column:
        return df
    where = order.index(column)
    del order[where]
    order.append(column)
    return df[order]

+3

Carsten 27 . '16 0:37

:

df.reindex(columns=df.columns.drop(col).append(pd.Index([col])))

(.append([col]) - . : .append(pd.Index([col]), , append.)

A note on testing: if you plan to benchmark with timeit, try running it on a large df (e.g. 1e4 rows or more), and possibly with -n1 -r1 to prevent caching.

+2

ptrj Jul 27 '16 at 0:59

source share

This is not the fastest, though:

def put_me_last(df, column):
    """Return ``df`` with its columns reordered so ``column`` is last.

    Works on a copy of the label list: the first occurrence of ``column`` is
    removed and the label is re-added at the end before indexing ``df``.
    """
    ordering = list(df)
    ordering.remove(column)
    ordering += [column]
    return df[ordering]



%timeit put_me_last(df,'b')
1000 loops, best of 3: 391 µs per loop

+1

shivsn Jul 27 '16 at 6:20

source share

Starting from this:

 df.columns
 Index([u'a', u'b', u'c', u'd'], dtype='object')

Do not do this; it looks like a bug:

 df.columns.drop(["b"]).insert(-1, 'b')
 Index([u'a', u'c', u'b', u'd'], dtype='object')

 df.columns.drop(["b"]).insert(-1, 'x')
 Index([u'a', u'c', u'x', u'd'], dtype='object')

WORK AROUND:

 df.columns.drop(["b"]).insert(99999, 'b')
 Index([u'a', u'c', u'd', u'b'], dtype='object')

0

Merlin Jul 27 '16 at 2:12

source share

Matthias Fripp · Accepted Answer · 2016-07-27T00:57:32+0000

I would modify the list of columns rather than deleting and adding one of them:

import pandas as pd
import numpy as np

df = pd.DataFrame(np.arange(8).reshape(2, 4), columns=list('abcd'))

def put_me_last(df, column):
    """Baseline: drop ``column`` and concatenate it back as the last column."""
    rest = df.drop(column, axis=1)
    target = df[column]
    return pd.concat([rest, target], axis=1)

def put_me_last_fast(df, column):
    """Select ``df``'s columns in a new order with ``column`` at the end.

    Only the list of labels is rearranged; the frame is then indexed once
    with that list.
    """
    new_cols = [label for label in df.columns if label != column]
    new_cols.append(column)
    return df[new_cols]

def put_me_last_faster(df, column):
    """Reindex ``df``'s columns so that ``column`` is the last one.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are reordered.
    column : label
        The column that must become the last one.

    Returns
    -------
    pandas.DataFrame
        A frame with every other column in its original order followed by
        ``column``.
    """
    new_cols = [c for c in df.columns if c != column] + [column]
    # ``reindex(columns=...)`` replaces ``reindex_axis(..., axis='columns')``,
    # which is not available in current pandas. ``copy=False`` avoids a
    # forced copy when the requested order matches the existing one.
    return df.reindex(columns=new_cols, copy=False)

Timing (in iPython):

%timeit put_me_last(df, 'b')
# 1000 loops, best of 3: 741 µs per loop

%timeit put_me_last_fast(df, 'b')
# 1000 loops, best of 3: 295 µs per loop

%timeit put_me_last_faster(df, 'b')
# 1000 loops, best of 3: 239 µs per loop

%timeit put_me_last_faster(df, 'd')  # not changing order
# 1000 loops, best of 3: 125 µs per loop

: new_cols, 80x , (2 160 )

# Index.insert follows list.insert semantics, so insert(-1, column) would
# place the label *before* the last remaining column. Insert at the end
# position explicitly instead.
remaining = df.columns.drop(column)
new_cols = remaining.insert(len(remaining), column)

: , , 1 , , @Carsten:

if df.columns[-1] == column:
    return df

What is the fastest way to ensure that a particular column is the last (or first) in a data frame

Sync Results

More articles: