How to convert IndexedTable to DataFrame in Julia?

In quick exploratory work, IndexedTables seem much faster than DataFrames for working with individual elements (for example, select or "update"), but DataFrames have a nicer ecosystem of functionality, for example printing and exporting.

So, at a certain point in the workflow, I would like to convert the IndexedTable to a DataFrame, for example.

using DataFrames, IndexedTables, IndexedTables.Table

# Example table with named columns: the first Columns(...) holds the three index (key)
# columns, the second holds the two data columns. `region` may contain NA.
tn = Table(
    Columns(
        param  = String["price","price","price","price","waterContent","waterContent"],
        item   = String["banana","banana","apple","apple","banana", "apple"],
        region = Union{String,DataArrays.NAtype}["FR","UK","FR","UK",NA,NA]
    ),
    Columns(
       value2000 = Float64[2.8,2.7,1.1,0.8,0.2,0.7],
       value2010 = Float64[3.2,2.9,1.2,0.8,0.2,0.8],
    )
)

to →

# Desired result for `tn`: one DataFrame column per index/data column, keeping the
# original column names and element types.
df_tn = DataFrame(
    param     = String["price","price","price","price","waterContent","waterContent"],
    item      = String["banana","banana","apple","apple","banana", "apple"],
    region    = Union{String,DataArrays.NAtype}["FR","UK","FR","UK",NA,NA],
    value2000 = Float64[2.8,2.7,1.1,0.8,0.2,0.7],
    value2010 = Float64[3.2,2.9,1.2,0.8,0.2,0.8],
)

or

# Same data as the named example, but every column is unnamed: the first Columns(...)
# holds the index (key) columns, the second holds the data columns.
t = Table(
    Columns(
        String["price","price","price","price","waterContent","waterContent"],
        String["banana","banana","apple","apple","banana", "apple"],
        Union{String,DataArrays.NAtype}["FR","UK","FR","UK",NA,NA]
    ),
    Columns(
       Float64[2.8,2.7,1.1,0.8,0.2,0.7],
       Float64[3.2,2.9,1.2,0.8,0.2,0.8],
    )
)

to →

# Desired result for the unnamed table `t`: columns get positional names x1..x5,
# index columns first, then data columns.
df_t = DataFrame(
    x1 = String["price","price","price","price","waterContent","waterContent"],
    x2 = String["banana","banana","apple","apple","banana", "apple"],
    x3 = Union{String,DataArrays.NAtype}["FR","UK","FR","UK",NA,NA],
    x4 = Float64[2.8,2.7,1.1,0.8,0.2,0.7],
    x5 = Float64[3.2,2.9,1.2,0.8,0.2,0.8]
)

I can retrieve the individual row values by iterating over the table with pairs():

# Print every row of `tn` as one flat vector: the fields of the index part first,
# followed by the fields of the data part.
for entry in pairs(tn)
    flat = []   # untyped on purpose: a row mixes different element types
    for part in entry, field in part
        push!(flat, field)
    end
    println(flat)
end

I cannot, however, get the names and types of columns, and I believe that working with a column will be more efficient.

EDIT: I managed to get the "column" types; now I am only missing the column names, if any:

# Collect the element type of every column of `tn`, index columns first, then data columns.
colTypes = Union{Union,DataType}[]

for group in (tn.index.columns, tn.data.columns), col in group
    push!(colTypes, eltype(col))
end

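EDIT2: Here is an example of an IndexedTable that fails to convert the column names using the (current) answer by Dan Getz, because the "index" columns are a named tuple while the "data" column is a plain tuple: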

# Mixed example: the index (key) columns are named, while the data part is a single
# unnamed column.
t_named_idx = Table(
    Columns(
        param  = String["price","price","price","price","waterContent","waterContent"],
        item   = String["banana","banana","apple","apple","banana", "apple"],
        region = Union{String,DataArrays.NAtype}["FR","UK","FR","UK",NA,NA]
    ),
    Columns(
       Float64[2.8,2.7,1.1,0.8,0.2,0.7],
    )
)

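The problem seems to be in the IndexedTable API, specifically in the columns(t) function, which does not distinguish between keys and values.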

+4
4

:

# Build a DataFrame from a plain (unnamed) tuple of columns.
# Column names are generated as `prefix` followed by the column position (x1, x2, ...).
# NOTE(review): calls `fieldnames` on a tuple instance; the surrounding text mentions
# Julia 0.6 — confirm this still works before running on a newer Julia version.
toDataFrame(cols::Tuple, prefix="x") = 
  DataFrame(;(Symbol("$prefix$c") => cols[c] for c in fieldnames(cols))...)

# Build a DataFrame from a named tuple of columns, reusing the field names as column names.
# `prefix` is accepted so both methods share a signature, but it is not used here.
toDataFrame(cols::NamedTuples.NamedTuple, prefix="x") = 
  DataFrame(;(c => cols[c] for c in fieldnames(cols))...)

# Convert an IndexedTable by handing whatever `columns(t)` returns to the methods above.
toDataFrame(t::IndexedTable) = toDataFrame(columns(t))

With these definitions (using Julia 0.6, with tn and t defined as in the question):

julia> tn
param           item      region │ value2000  value2010
─────────────────────────────────┼─────────────────────
"price"         "apple"   "FR"   │ 1.1        1.2
"price"         "apple"   "UK"   │ 0.8        0.8
"price"         "banana"  "FR"   │ 2.8        3.2
"price"         "banana"  "UK"   │ 2.7        2.9
"waterContent"  "apple"   NA     │ 0.7        0.8
"waterContent"  "banana"  NA     │ 0.2        0.2

julia> df_tn = toDataFrame(tn)
6×5 DataFrames.DataFrame
│ Row │ param          │ item     │ region │ value2000 │ value2010 │
├─────┼────────────────┼──────────┼────────┼───────────┼───────────┤
│ 1   │ "price"        │ "apple"  │ "FR"   │ 1.1       │ 1.2       │
│ 2   │ "price"        │ "apple"  │ "UK"   │ 0.8       │ 0.8       │
│ 3   │ "price"        │ "banana" │ "FR"   │ 2.8       │ 3.2       │
│ 4   │ "price"        │ "banana" │ "UK"   │ 2.7       │ 2.9       │
│ 5   │ "waterContent" │ "apple"  │ NA     │ 0.7       │ 0.8       │
│ 6   │ "waterContent" │ "banana" │ NA     │ 0.2       │ 0.2       │

:

julia> typeof(df_tn[:,1])
DataArrays.DataArray{String,1}

julia> typeof(df_tn[:,4])
DataArrays.DataArray{Float64,1}

:

julia> t
───────────────────────────────┬─────────
"price"         "apple"   "FR" │ 1.1  1.2
"price"         "apple"   "UK" │ 0.8  0.8
"price"         "banana"  "FR" │ 2.8  3.2
"price"         "banana"  "UK" │ 2.7  2.9
"waterContent"  "apple"   NA   │ 0.7  0.8
"waterContent"  "banana"  NA   │ 0.2  0.2

julia> df_t = toDataFrame(t)
6×5 DataFrames.DataFrame
│ Row │ x1             │ x2       │ x3   │ x4  │ x5  │
├─────┼────────────────┼──────────┼──────┼─────┼─────┤
│ 1   │ "price"        │ "apple"  │ "FR" │ 1.1 │ 1.2 │
│ 2   │ "price"        │ "apple"  │ "UK" │ 0.8 │ 0.8 │
│ 3   │ "price"        │ "banana" │ "FR" │ 2.8 │ 3.2 │
│ 4   │ "price"        │ "banana" │ "UK" │ 2.7 │ 2.9 │
│ 5   │ "waterContent" │ "apple"  │ NA   │ 0.7 │ 0.8 │
│ 6   │ "waterContent" │ "banana" │ NA   │ 0.2 │ 0.2 │

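EDIT: As noted by @Antonello, the case of mixed named and unnamed tuples is not handled correctly. To handle it correctly, we can define: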

# Convert an IndexedTable by converting its key columns and its value columns
# separately and joining the two frames side by side. Unnamed key columns get the
# prefix "y"; unnamed value columns keep the default prefix.
function toDataFrame(t::IndexedTable)
    keyframe = toDataFrame(columns(keys(t)), "y")
    valframe = toDataFrame(columns(values(t)))
    return hcat(keyframe, valframe)
end

:

julia> toDataFrame(tn2)
6×5 DataFrames.DataFrame
│ Row │ param          │ item     │ region │ x1  │ x2  │
├─────┼────────────────┼──────────┼────────┼─────┼─────┤
│ 1   │ "price"        │ "apple"  │ "FR"   │ 1.1 │ 1.2 │
│ 2   │ "price"        │ "apple"  │ "UK"   │ 0.8 │ 0.8 │
│ 3   │ "price"        │ "banana" │ "FR"   │ 2.8 │ 3.2 │
│ 4   │ "price"        │ "banana" │ "UK"   │ 2.7 │ 2.9 │
│ 5   │ "waterContent" │ "apple"  │ NA     │ 0.7 │ 0.8 │
│ 6   │ "waterContent" │ "banana" │ NA     │ 0.2 │ 0.2 │
+3

, "" ( , ):

# Build the DataFrame through one intermediate matrix: flatten keys and values,
# stack them, then transpose so that each table row becomes a DataFrame row.
# NOTE(review): column names are generated (x1..x5 in the output); per-column element
# types are presumably not preserved when going through a single matrix — confirm.
julia> df = DataFrame(
         permutedims(  # <- structural transpose
           vcat(
             reshape([j for i in keys(t) for j in i], :, length(t)) ,   # one matrix column per table row: its key fields
             reshape([j for i in t       for j in i], :, length(t))     # one matrix column per table row: its data fields
           ), 
           (2,1)  # swap the two dimensions
         )
       )
6×5 DataFrames.DataFrame
│ Row │ x1             │ x2       │ x3   │ x4  │ x5  │
├─────┼────────────────┼──────────┼──────┼─────┼─────┤
│ 1   │ "price"        │ "apple"  │ "FR" │ 1.1 │ 1.2 │
│ 2   │ "price"        │ "apple"  │ "UK" │ 0.8 │ 0.8 │
│ 3   │ "price"        │ "banana" │ "FR" │ 2.8 │ 3.2 │
│ 4   │ "price"        │ "banana" │ "UK" │ 2.7 │ 2.9 │
│ 5   │ "waterContent" │ "apple"  │ NA   │ 0.7 │ 0.8 │
│ 6   │ "waterContent" │ "banana" │ NA   │ 0.2 │ 0.2 │
+1

Here is an initial attempt at writing a conversion function. It keeps the column names and types. It would be nice if it could be cleaned up and implemented in either the DataFrame or the IndexedTable package as convert(DataFrame,t::IndexedArray).

# Append one column name per entry of `cols` to `names` and return `names`.
# A plain Tuple of columns gets positional names "x<offset+1>", "x<offset+2>", ...;
# anything else is treated as a NamedTuple and contributes its own keys.
function _appendcolnames!(names, cols, offset)
    if eltype(cols) <: AbstractVector   # plain Tuple of column vectors
        for i in 1:length(cols)
            push!(names, Symbol("x", offset + i))
        end
    else                                # NamedTuple
        for k in keys(cols)
            push!(names, k)
        end
    end
    return names
end

# Convert an IndexedTable into a DataFrame, keeping column names and element types.
#
# The index part is always a tuple of columns (named or not); the data part can be a
# single plain Vector, a tuple of columns or a named tuple of columns. Unnamed columns
# are called x1, x2, ... numbered across index and data columns together.
#
# Returns a new DataFrame with one row per table entry, index columns first.
function toDataFrame(t::IndexedTable)
    idxCols = t.index.columns
    nIdx = length(idxCols)

    # Element types and names, index columns first.
    colTypes = Union{Union,DataType}[]
    colNames = Symbol[]
    for col in idxCols
        push!(colTypes, eltype(col))
    end
    _appendcolnames!(colNames, idxCols, 0)

    if isa(t.data, Vector)              # data part is one plain array
        push!(colTypes, eltype(t.data))
        push!(colNames, Symbol("x", nIdx + 1))
    else                                # data part is a (named) tuple of columns
        dataCols = t.data.columns
        for col in dataCols
            push!(colTypes, eltype(col))
        end
        _appendcolnames!(colNames, dataCols, nIdx)
    end

    # Start from empty, correctly typed columns...
    df = DataFrame()
    for (name, T) in zip(colNames, colTypes)
        df[name] = T[]
    end

    # ...then append the table row by row, flattening (index, data) into one vector.
    for entry in pairs(t)
        row = []   # untyped on purpose: a row mixes different element types
        for part in entry, field in part
            push!(row, field)
        end
        push!(df, row)
    end
    return df
end
0
source

Just install IterableTables and then

# Load IterableTables, then construct the DataFrame directly from the table.
# NOTE(review): `it` is not defined in this snippet — presumably the IndexedTable to convert.
using IterableTables
df = DataFrames.DataFrame(it)
0
source

Source: https://habr.com/ru/post/1687858/


All Articles