Refine data from layered Excel file via pandas

I want to create neat data from an Excel file that looks like this: three levels of “merged” headers:

enter image description here

Pandas reads the file just fine, with multi-level headers:

# df = pandas.read_excel('test.xlsx', header=[0,1,2])

For repeatability, you can copy-paste this:

df = pandas.DataFrame({('Unnamed: 0_level_0', 'Unnamed: 0_level_1', 'a'): {1: 'aX', 2: 'aY'}, ('Unnamed: 1_level_0', 'Unnamed: 1_level_1', 'b'): {1: 'bX', 2: 'bY'}, ('Unnamed: 2_level_0', 'Unnamed: 2_level_1', 'c'): {1: 'cX', 2: 'cY'}, ('level1_1', 'level2_1', 'level3_1'): {1: 1, 2: 10}, ('level1_1', 'level2_1', 'level3_2'): {1: 2, 2: 20}, ('level1_1', 'level2_2', 'level3_1'): {1: 3, 2: 30}, ('level1_1', 'level2_2', 'level3_2'): {1: 4, 2: 40}, ('level1_2', 'level2_1', 'level3_1'): {1: 5, 2: 50}, ('level1_2', 'level2_1', 'level3_2'): {1: 6, 2: 60}, ('level1_2', 'level2_2', 'level3_1'): {1: 7, 2: 70}, ('level1_2', 'level2_2', 'level3_2'): {1: 8, 2: 80}})

I want to normalize this so that the level headers are in variable rows, but keep columns a, b and c as columns:

Desired Conclusion

Without multi-level headings, I would do pandas.melt(df, id_vars=['a', 'b', 'c'])to get what I want. pandas.melt(df)gives me the three columns of the variable that I want, but obviously does not save the columns a, b, and c.

+4
source share
2 answers

It should be simple:

wide_df = pandas.read_excel(xlfile, sheetname, header=[0, 1, 2], index_col=[0, 1, 2, 3])

long_df = wide_df.stack().stack().stack()

CSV ( , ):

from io import StringIO
from textwrap import dedent

import pandas

mockcsv = StringIO(dedent("""\
    num,,,this1,this1,this1,this1,that1,that1,that1,that1
    let,,,thisA,thisA,thatA,thatA,thisB,thisB,thatB,thatB
    animal,,,cat,dog,bird,lizard,cat,dog,bird,lizard
    a,b,c,,,,,,,,
    a1,b1,c1,x1,x2,x3,x4,x5,x6,x7,x8
    a1,b1,c2,y1,y2,y3,y4,y5,y6,y7,y8
    a1,b2,c1,z1,z2,z3,z4,z5,6z,zy,z8
"""))


wide_df = pandas.read_csv(mockcsv, index_col=[0, 1, 2], header=[0, 1, 2])
long_df = wide_df.stack().stack().stack()

wide_df :

num      this1                  that1                 
let      thisA     thatA        thisB     thatB       
animal     cat dog  bird lizard   cat dog  bird lizard
a  b  c                                               
a1 b1 c1    x1  x2    x3     x4    x5  x6    x7     x8
      c2    y1  y2    y3     y4    y5  y6    y7     y8
   b2 c1    z1  z2    z3     z4    z5  6z    zy     z8

long_df

a   b   c   animal  let    num  
a1  b1  c1  bird    thatA  this1    x3
                    thatB  that1    x7
            cat     thisA  this1    x1
                    thisB  that1    x5
            dog     thisA  this1    x2
                    thisB  that1    x6
            lizard  thatA  this1    x4
                    thatB  that1    x8
        c2  bird    thatA  this1    y3
                    thatB  that1    y7
            cat     thisA  this1    y1
                    thisB  that1    y5
            dog     thisA  this1    y2
                    thisB  that1    y6
            lizard  thatA  this1    y4
                    thatB  that1    y8
    b2  c1  bird    thatA  this1    z3
                    thatB  that1    zy
            cat     thisA  this1    z1
                    thisB  that1    z5
            dog     thisA  this1    z2
                    thisB  that1    6z
            lizard  thatA  this1    z4
                    thatB  that1    z8

, OP, , , :

index_names = ['a', 'b', 'c']
col_names = ['Level1', 'Level2', 'Level3']
df = (
    pandas.read_excel('Book1.xlsx', header=[0, 1, 2], index_col=[0, 1, 2, 3])
        .reset_index(level=0, drop=True)
        .rename_axis(index_names, axis='index')
        .rename_axis(col_names, axis='columns')
        .stack()
        .stack()
        .stack()
        .to_frame()
)

, , , index_names .

+1

DF .

first_half = df.iloc[:, :3]
second_half = df.iloc[:, 3:]

.

melt_second_half = pd.melt(second_half)

, , DF .

repeats = int(melt_second_half.shape[0]/first_half.shape[0])
first_reps = pd.concat([first_half] * repeats, ignore_index=True)
col_names = first_reps.columns.get_level_values(2)
melt_first_half = pd.DataFrame(first_reps.values, columns=col_names)

, DF .

df_concat = pd.concat([melt_first_half, melt_second_half], axis=1)
df_concat.sort_values('value').reset_index(drop=True)

Picture

+2

Source: https://habr.com/ru/post/1659263/


All Articles