How to convert IndexedTable to DataFrame in Julia?

Question

How to convert IndexedTable to DataFrame in Julia?

In some quick exploratory work, IndexedTables seem much faster than DataFrames for working with individual elements (e.g. select or "update"), but DataFrames have a nicer ecosystem of functionality, e.g. printing, exporting, etc.

So, at a certain point in the workflow, I would like to convert the IndexedTable to a DataFrame, e.g.:

using DataFrames, IndexedTables, IndexedTables.Table

# Indexed table whose index columns and data columns are all named.
tn = Table(
    # Index (key) columns.
    Columns(
        param  = String["price","price","price","price","waterContent","waterContent"],
        item   = String["banana","banana","apple","apple","banana", "apple"],
        # `region` is NA for the two waterContent rows.
        region = Union{String,DataArrays.NAtype}["FR","UK","FR","UK",NA,NA]
    ),
    # Data (value) columns.
    Columns(
       value2000 = Float64[2.8,2.7,1.1,0.8,0.2,0.7],
       value2010 = Float64[3.2,2.9,1.2,0.8,0.2,0.8],
    )
)

to →

# Desired result for `tn`: one DataFrame column per index/data column, names preserved.
df_tn = DataFrame(
    param     = String["price","price","price","price","waterContent","waterContent"],
    item      = String["banana","banana","apple","apple","banana", "apple"],
    region    = Union{String,DataArrays.NAtype}["FR","UK","FR","UK",NA,NA],
    value2000 = Float64[2.8,2.7,1.1,0.8,0.2,0.7],
    value2010 = Float64[3.2,2.9,1.2,0.8,0.2,0.8],
)

or

# Same data as `tn`, but with unnamed index columns and unnamed data columns.
t = Table(
    # Index (key) columns, positional.
    Columns(
        String["price","price","price","price","waterContent","waterContent"],
        String["banana","banana","apple","apple","banana", "apple"],
        Union{String,DataArrays.NAtype}["FR","UK","FR","UK",NA,NA]
    ),
    # Data (value) columns, positional.
    Columns(
       Float64[2.8,2.7,1.1,0.8,0.2,0.7],
       Float64[3.2,2.9,1.2,0.8,0.2,0.8],
    )
)

to →

# Desired result for the unnamed table `t`: columns receive the positional names x1..x5.
df_t = DataFrame(
    x1 = String["price","price","price","price","waterContent","waterContent"],
    x2 = String["banana","banana","apple","apple","banana", "apple"],
    x3 = Union{String,DataArrays.NAtype}["FR","UK","FR","UK",NA,NA],
    x4 = Float64[2.8,2.7,1.1,0.8,0.2,0.7],
    x5 = Float64[3.2,2.9,1.2,0.8,0.2,0.8]
)

I can retrieve the individual row values by iterating over the table with pairs():

# Print each row of `tn` as one flat vector: the values of the first element
# of the pair come first, followed by the values of the second element.
for pair in pairs(tn)
    row = []   # heterogeneous values, so the vector is deliberately untyped
    for section in pair, item in section
        push!(row, item)
    end
    println(row)
end

I cannot, however, get the names and types of the columns, and I guess that working column by column would be more efficient.

EDIT: I managed to get the "column types" with the code below; now I just need to get the column names, if any:

# Element type of every column of `tn`: index columns first, then data columns.
colTypes = Union{Union,DataType}[]

for part in (tn.index, tn.data), col in part.columns
    push!(colTypes, eltype(col))
end

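EDIT2: Example of an IndexedTable that the first (original) answer of Dan Getz fails to convert, as the "index" columns are a named tuple while the "data" column is a normal (unnamed) tuple: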

# Mixed case: named index columns combined with a single unnamed data column.
t_named_idx = Table(
    # Index (key) columns, named.
    Columns(
        param  = String["price","price","price","price","waterContent","waterContent"],
        item   = String["banana","banana","apple","apple","banana", "apple"],
        region = Union{String,DataArrays.NAtype}["FR","UK","FR","UK",NA,NA]
    ),
    # Data (value) column, positional.
    Columns(
       Float64[2.8,2.7,1.1,0.8,0.2,0.7],
    )
)

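The problem is that, as far as I can see, the IndexedTable API offers no function that handles this mixed case; columns(t), for instance, does not.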

+4

type-conversion dataframe julia-lang

Antonello 20 . '17 7:46

4

A quick and dirty "solution" (not elegant, but it works):

julia> df = DataFrame(
         permutedims(  # <- structural transpose
           vcat(
             reshape([j for i in keys(t) for j in i], :, length(t)) , 
             reshape([j for i in t       for j in i], :, length(t))
           ), 
           (2,1)
         )
       )
6×5 DataFrames.DataFrame
│ Row │ x1             │ x2       │ x3   │ x4  │ x5  │
├─────┼────────────────┼──────────┼──────┼─────┼─────┤
│ 1   │ "price"        │ "apple"  │ "FR" │ 1.1 │ 1.2 │
│ 2   │ "price"        │ "apple"  │ "UK" │ 0.8 │ 0.8 │
│ 3   │ "price"        │ "banana" │ "FR" │ 2.8 │ 3.2 │
│ 4   │ "price"        │ "banana" │ "UK" │ 2.7 │ 2.9 │
│ 5   │ "waterContent" │ "apple"  │ NA   │ 0.7 │ 0.8 │
│ 6   │ "waterContent" │ "banana" │ NA   │ 0.2 │ 0.2 │

+1

Liso 20 . '17 11:56

Here is an initial attempt at writing a conversion function. It keeps the column names and types. It would be nice if it could be cleaned up and implemented in either the DataFrames or the IndexedTables package as convert(DataFrame, t::IndexedTable).

"""
    toDataFrame(t::IndexedTable)

Convert the indexed table `t` into a `DataFrame`, keeping column element types and,
where present, column names.

Index columns come first, followed by the data column(s). Unnamed columns are called
`x1`, `x2`, ... according to their overall position. The data part may be a plain
`Vector`, a tuple of columns or a named tuple of columns. The frame is first created
empty with typed columns and then filled row by row.
"""
function toDataFrame(t::IndexedTable)
    idxCols = t.index.columns
    nIdx = length(idxCols)
    dataIsVector = isa(t.data, Vector)   # data part is one plain array, not a set of columns

    # Element types: the same logic applies to named and unnamed columns.
    elTypes = Union{Union,DataType}[]
    for col in idxCols
        push!(elTypes, eltype(col))
    end
    if dataIsVector
        push!(elTypes, eltype(t.data))
    else
        for col in t.data.columns
            push!(elTypes, eltype(col))
        end
    end

    # Column names: positional `x<i>` for a plain tuple, the keys for a named tuple.
    colSyms = Symbol[]
    if eltype(idxCols) <: AbstractVector          # plain tuple of vectors
        for i in 1:nIdx
            push!(colSyms, Symbol("x", i))
        end
    else                                          # named tuple
        for k in keys(idxCols)
            push!(colSyms, k)
        end
    end
    if dataIsVector
        push!(colSyms, Symbol("x", nIdx + 1))
    else
        dataCols = t.data.columns
        nData = length(dataCols)
        if eltype(dataCols) <: AbstractVector     # plain tuple of vectors
            for i in (nIdx + 1):(nIdx + nData)
                push!(colSyms, Symbol("x", i))
            end
        else                                      # named tuple
            for k in keys(dataCols)
                push!(colSyms, k)
            end
        end
    end

    # Empty frame with correctly typed columns.
    df = DataFrame()
    for i in 1:length(elTypes)
        df[colSyms[i]] = elTypes[i][]
    end

    # Append one row per table entry: index values followed by data values.
    for pair in pairs(t)
        row = []
        for section in pair, item in section
            push!(row, item)
        end
        push!(df, row)
    end
    return df
end

0

Antonello Oct 20 '17 at 13:35

source share

Just install IterableTables and then

using IterableTables
# NOTE(review): `it` is presumably the IndexedTable to convert; it is not defined in this snippet.
df = DataFrames.DataFrame(it)

0

xiaodai Oct 26 '17 at 6:59

source share

Dan Getz · Accepted Answer · 2017-10-20T19:02:46+0000

The following conversion functions:

"""
    toDataFrame(cols::Tuple, prefix="x")

Build a `DataFrame` from an unnamed tuple of columns. Each column is named `prefix`
followed by its field name in the tuple, e.g. `x1`, `x2`, ... with the default prefix.
"""
function toDataFrame(cols::Tuple, prefix="x")
    namedCols = (Symbol(prefix, c) => cols[c] for c in fieldnames(cols))
    return DataFrame(; namedCols...)
end

"""
    toDataFrame(cols::NamedTuples.NamedTuple, prefix="x")

Build a `DataFrame` from a named tuple of columns, reusing the field names as column
names. `prefix` is accepted for signature parity with the `Tuple` method but is not used.
"""
function toDataFrame(cols::NamedTuples.NamedTuple, prefix="x")
    namedCols = (c => cols[c] for c in fieldnames(cols))
    return DataFrame(; namedCols...)
end

"""
    toDataFrame(t::IndexedTable)

Convert `t` to a `DataFrame` by handing `columns(t)` to the column-tuple method.
"""
function toDataFrame(t::IndexedTable)
    return toDataFrame(columns(t))
end

(Tested on Julia 0.6, with tn and t defined as in the question):

julia> tn
param           item      region │ value2000  value2010
─────────────────────────────────┼─────────────────────
"price"         "apple"   "FR"   │ 1.1        1.2
"price"         "apple"   "UK"   │ 0.8        0.8
"price"         "banana"  "FR"   │ 2.8        3.2
"price"         "banana"  "UK"   │ 2.7        2.9
"waterContent"  "apple"   NA     │ 0.7        0.8
"waterContent"  "banana"  NA     │ 0.2        0.2

julia> df_tn = toDataFrame(tn)
6×5 DataFrames.DataFrame
│ Row │ param          │ item     │ region │ value2000 │ value2010 │
├─────┼────────────────┼──────────┼────────┼───────────┼───────────┤
│ 1   │ "price"        │ "apple"  │ "FR"   │ 1.1       │ 1.2       │
│ 2   │ "price"        │ "apple"  │ "UK"   │ 0.8       │ 0.8       │
│ 3   │ "price"        │ "banana" │ "FR"   │ 2.8       │ 3.2       │
│ 4   │ "price"        │ "banana" │ "UK"   │ 2.7       │ 2.9       │
│ 5   │ "waterContent" │ "apple"  │ NA     │ 0.7       │ 0.8       │
│ 6   │ "waterContent" │ "banana" │ NA     │ 0.2       │ 0.2       │

The types of the resulting columns:

julia> typeof(df_tn[:,1])
DataArrays.DataArray{String,1}

julia> typeof(df_tn[:,4])
DataArrays.DataArray{Float64,1}

And for the unnamed table:

julia> t
───────────────────────────────┬─────────
"price"         "apple"   "FR" │ 1.1  1.2
"price"         "apple"   "UK" │ 0.8  0.8
"price"         "banana"  "FR" │ 2.8  3.2
"price"         "banana"  "UK" │ 2.7  2.9
"waterContent"  "apple"   NA   │ 0.7  0.8
"waterContent"  "banana"  NA   │ 0.2  0.2

julia> df_t = toDataFrame(t)
6×5 DataFrames.DataFrame
│ Row │ x1             │ x2       │ x3   │ x4  │ x5  │
├─────┼────────────────┼──────────┼──────┼─────┼─────┤
│ 1   │ "price"        │ "apple"  │ "FR" │ 1.1 │ 1.2 │
│ 2   │ "price"        │ "apple"  │ "UK" │ 0.8 │ 0.8 │
│ 3   │ "price"        │ "banana" │ "FR" │ 2.8 │ 3.2 │
│ 4   │ "price"        │ "banana" │ "UK" │ 2.7 │ 2.9 │
│ 5   │ "waterContent" │ "apple"  │ NA   │ 0.7 │ 0.8 │
│ 6   │ "waterContent" │ "banana" │ NA   │ 0.2 │ 0.2 │

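EDIT: As noted by @Antonello, the case of mixed named and unnamed tuples is not handled correctly. To handle it, we can define: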

"""
    toDataFrame(t::IndexedTable)

Convert `t` to a `DataFrame` by converting the key columns and the value columns
separately and concatenating the two frames horizontally. The key columns are converted
with the prefix `"y"`; the value columns use the default prefix of the column-tuple
method.
"""
function toDataFrame(t::IndexedTable)
    keyFrame = toDataFrame(columns(keys(t)), "y")
    valueFrame = toDataFrame(columns(values(t)))
    return hcat(keyFrame, valueFrame)
end

With this definition:

julia> toDataFrame(tn2)
6×5 DataFrames.DataFrame
│ Row │ param          │ item     │ region │ x1  │ x2  │
├─────┼────────────────┼──────────┼────────┼─────┼─────┤
│ 1   │ "price"        │ "apple"  │ "FR"   │ 1.1 │ 1.2 │
│ 2   │ "price"        │ "apple"  │ "UK"   │ 0.8 │ 0.8 │
│ 3   │ "price"        │ "banana" │ "FR"   │ 2.8 │ 3.2 │
│ 4   │ "price"        │ "banana" │ "UK"   │ 2.7 │ 2.9 │
│ 5   │ "waterContent" │ "apple"  │ NA     │ 0.7 │ 0.8 │
│ 6   │ "waterContent" │ "banana" │ NA     │ 0.2 │ 0.2 │

How to convert IndexedTable to DataFrame in Julia?

More articles: