Combination selection, order and tree

Question

Combination selection, order and tree

I have the following data that represents a person’s choice sequence between four values (f1, f2, c1, c2):

# Observed counts (`nb`) for each complete choice sequence (`combi`).
# `combi` is a factor whose first level is the empty string (unused here);
# the row names are the odd integers 1..47, as in the original dput() output.
df <- structure(
  list(
    combi = structure(
      c(
        24L, 8L, 3L, 19L, 4L, 23L, 15L, 12L, 14L, 22L, 5L, 13L,
        18L, 9L, 2L, 25L, 11L, 7L, 21L, 10L, 6L, 17L, 20L, 16L
      ),
      levels = c(
        "",
        "c1-c2-f1-f2", "c1-c2-f2-f1", "c1-f1-c2-f2", "c1-f1-f2-c2",
        "c1-f2-c2-f1", "c1-f2-f1-c2", "c2-c1-f1-f2", "c2-c1-f2-f1",
        "c2-f1-c1-f2", "c2-f1-f2-c1", "c2-f2-c1-f1", "c2-f2-f1-c1",
        "f1-c1-c2-f2", "f1-c1-f2-c2", "f1-c2-c1-f2", "f1-c2-f2-c1",
        "f1-f2-c1-c2", "f1-f2-c2-c1", "f2-c1-c2-f1", "f2-c1-f1-c2",
        "f2-c2-c1-f1", "f2-c2-f1-c1", "f2-f1-c1-c2", "f2-f1-c2-c1"
      ),
      class = "factor"
    ),
    nb = c(
      10L, 0L, 2L, 4L, 1L, 5L, 1L, 2L, 1L, 3L, 1L, 0L,
      3L, 5L, 0L, 18L, 5L, 2L, 5L, 0L, 4L, 4L, 11L, 2L
    )
  ),
  names = c("combi", "nb"),
  class = "data.frame",
  row.names = c(
    1L, 3L, 5L, 7L, 9L, 11L, 13L, 15L, 17L, 19L, 21L, 23L,
    25L, 27L, 29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L
  )
)

I am wondering whether there is a tree view (or something similar) that shows, for each step of the sequence, how many times it was selected, grouping together the chains that share a common prefix. Example:

 f2 (52)
   - f1 (28)
       - c1-c2 (10)
       - c2-c1 (18)

Here "f2 (52)" means that 52 chains start with f2, and "f1 (28)" means that 28 of those chains start with f2-f1.

Thank you very much.

+5

r ggplot2

ranell Mar 13 '18 at 13:19

source share

3 answers

It may not be exactly what you mean by "tree structure", but this gives you the numbers in a table using base R. It should be easy to format the result however you like.

 # Reproducible input: observed counts (`nb`) for each complete choice
 # sequence (`combi`, four values joined by "-").
 df <- structure(
   list(
     combi = structure(
       c(
         24L, 8L, 3L, 19L, 4L, 23L, 15L, 12L, 14L, 22L, 5L, 13L,
         18L, 9L, 2L, 25L, 11L, 7L, 21L, 10L, 6L, 17L, 20L, 16L
       ),
       levels = c(
         "",
         "c1-c2-f1-f2", "c1-c2-f2-f1", "c1-f1-c2-f2", "c1-f1-f2-c2",
         "c1-f2-c2-f1", "c1-f2-f1-c2", "c2-c1-f1-f2", "c2-c1-f2-f1",
         "c2-f1-c1-f2", "c2-f1-f2-c1", "c2-f2-c1-f1", "c2-f2-f1-c1",
         "f1-c1-c2-f2", "f1-c1-f2-c2", "f1-c2-c1-f2", "f1-c2-f2-c1",
         "f1-f2-c1-c2", "f1-f2-c2-c1", "f2-c1-c2-f1", "f2-c1-f1-c2",
         "f2-c2-c1-f1", "f2-c2-f1-c1", "f2-f1-c1-c2", "f2-f1-c2-c1"
       ),
       class = "factor"
     ),
     nb = c(
       10L, 0L, 2L, 4L, 1L, 5L, 1L, 2L, 1L, 3L, 1L, 0L,
       3L, 5L, 0L, 18L, 5L, 2L, 5L, 0L, 4L, 4L, 11L, 2L
     )
   ),
   names = c("combi", "nb"),
   class = "data.frame",
   row.names = c(
     1L, 3L, 5L, 7L, 9L, 11L, 13L, 15L, 17L, 19L, 21L, 23L,
     25L, 27L, 29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L
   )
 )

 # Split every sequence into its four positions. strsplit() is already
 # vectorised, so no sapply() wrapper is needed; rbind-ing the pieces gives
 # a 24 x 4 character matrix (one row per sequence).
 parts <- do.call(rbind, strsplit(as.character(df$combi), "-", fixed = TRUE))
 colnames(parts) <- paste0("str", 1:4)
 tmp <- data.frame(df, parts)

 # The last two positions are reported together as a single "x-y" leaf.
 tmp$str3 <- paste(tmp$str3, tmp$str4, sep = "-")

 # Sub-totals of `nb` at each depth of the prefix tree.
 str1 <- aggregate(list(nb_str1 = tmp$nb), tmp["str1"], sum)
 str2 <- aggregate(list(nb_str2 = tmp$nb), tmp[c("str1", "str2")], sum)
 str3 <- aggregate(
   list(nb_str3 = tmp$nb),
   tmp[c("str1", "str2", "str3")],
   sum
 )

 # Attach the level-1 and level-2 totals to every leaf row; merge() joins on
 # the shared strN columns.
 tmp <- merge(str3, str1)
 tmp <- merge(tmp, str2)
 tmp <- tmp[, c("str1", "nb_str1", "str2", "nb_str2", "str3", "nb_str3")]
 tmp
 #>    str1 nb_str1 str2 nb_str2  str3 nb_str3
 #> 1    c1      10   c2       2 f1-f2       0
 #> 2    c1      10   c2       2 f2-f1       2
 #> 3    c1      10   f1       2 c2-f2       1
 #> 4    c1      10   f1       2 f2-c2       1
 #> 5    c1      10   f2       6 c2-f1       4
 #> 6    c1      10   f2       6 f1-c2       2
 #> 7    c2      12   c1       5 f1-f2       0
 #> 8    c2      12   c1       5 f2-f1       5
 #> 9    c2      12   f1       5 c1-f2       0
 #> 10   c2      12   f1       5 f2-c1       5
 #> 11   c2      12   f2       2 c1-f1       2
 #> 12   c2      12   f2       2 f1-c1       0
 #> 13   f1      15   c1       2 c2-f2       1
 #> 14   f1      15   c1       2 f2-c2       1
 #> 15   f1      15   c2       6 c1-f2       2
 #> 16   f1      15   c2       6 f2-c1       4
 #> 17   f1      15   f2       7 c1-c2       3
 #> 18   f1      15   f2       7 c2-c1       4
 #> 19   f2      52   c1      16 c2-f1      11
 #> 20   f2      52   c1      16 f1-c2       5
 #> 21   f2      52   c2       8 c1-f1       3
 #> 22   f2      52   c2       8 f1-c1       5
 #> 23   f2      52   f1      28 c1-c2      10
 #> 24   f2      52   f1      28 c2-c1      18

Created on 2018-03-15 by the reprex package (v0.2.0).

+4

Gilles Mar 15 '18 at 19:35

source share

The data.tree package specializes in tree views. It is based on partitioning variables in a hierarchical order, for example world → continent → country → city. In your case, every ordering of c1, c2, f1 and f2 can occur, so you will probably need to build four trees, one rooted at each of c1, c2, f1 and f2. Each root branches to the three remaining values, and each of those leads to the two possible orderings of the last two unused values. You can then plot the trees.

A basic example that starts at c1 and branches from there, without attaching the counts:

 # Build the tree of all orderings that start with "c1". Each second-level
 # node is one of the three remaining values; each leaf is one ordering of
 # the last two values. No counts are attached to the nodes here.
 library(data.tree)

 c1 <- Node$new("c1")        # 1st level chain, "c1"

 c2 <- c1$AddChild("c2")     # new 2nd level chain, "c2", off c1
 f1 <- c2$AddChild("f1-f2")  # new level off c2
 f2 <- c2$AddChild("f2-f1")  # new level off c2

 # NOTE: the names c2/f1/f2 are reused below for different nodes, exactly as
 # in the original answer; only the tree rooted at `c1` is used afterwards.
 f1 <- c1$AddChild("f1")     # new 2nd level chain, "f1", off c1
 c2 <- f1$AddChild("c2-f2")  # new level off f1
 f2 <- f1$AddChild("f2-c2")  # new level off f1

 f2 <- c1$AddChild("f2")     # new 2nd level chain, "f2", off c1
 c2 <- f2$AddChild("c2-f1")  # new level off f2
 f1 <- f2$AddChild("f1-c2")  # new level off f2

 print(c1)
 #>        levelName
 #> 1  c1
 #> 2   ¦--c2
 #> 3   ¦   ¦--f1-f2
 #> 4   ¦   °--f2-f1
 #> 5   ¦--f1
 #> 6   ¦   ¦--c2-f2
 #> 7   ¦   °--f2-c2
 #> 8   °--f2
 #> 9       ¦--c2-f1
 #> 10      °--f1-c2

 plot(c1)

+3

Morgorg Mar 20 '18 at 6:26

source share

42- · Accepted Answer · 2018-03-19T22:00:40+0000

If you read the combi values in as text (using as.character), you can split them into separate character columns:

 # Parse each "a-b-c-d" sequence as one "-"-separated record, which yields
 # four character columns (read.table names them V1..V4), and append those
 # columns to the original data.
 df2 <- cbind(
   df,
   read.table(
     text = as.character(df$combi),
     sep = "-",
     stringsAsFactors = FALSE
   )
 )

Then you can tabulate the counts at any level you want:

  # First level only: total count per first position.
  xtabs(nb ~ V1, data = df2)
  # V1
  # c1 c2 f1 f2
  # 10 12 15 52

  # First and second level: total count per two-value prefix.
  xtabs(nb ~ paste(V1, V2, sep = "-"), data = df2)
  # paste(V1, V2, sep = "-")
  # c1-c2 c1-f1 c1-f2 c2-c1 c2-f1 c2-f2 f1-c1 f1-c2 f1-f2 f2-c1 f2-c2 f2-f1
  #     2     2     6     5     5     2     2     6     7    16     8    28

You can also use the addmargins function to compactly display the second-position counts within each first position, together with the row and column totals:

  # Two-way table of first position (rows) by second position (columns),
  # with a "Sum" margin added to both dimensions.
  addmargins(xtabs(nb ~ V1 + V2, data = df2))
  #      V2
  # V1    c1 c2 f1 f2 Sum
  #   c1   0  2  2  6  10
  #   c2   5  0  5  2  12
  #   f1   2  6  0  7  15
  #   f2  16  8 28  0  52
  #   Sum 23 16 35 15  89

This can be flattened using ftable :

  # Same two-way table with margins, flattened so that both V1 and V2 are
  # row variables (one count per line).
  ftable(addmargins(xtabs(nb ~ V1 + V2, data = df2)), row.vars = 1:2)
  # V1  V2
  # c1  c1    0
  #     c2    2
  #     f1    2
  #     f2    6
  #     Sum  10
  # c2  c1    5
  #     c2    0
  #     f1    5
  #     f2    2
  #     Sum  12
  # f1  c1    2
  #     c2    6
  #     f1    0
  #     f2    7
  #     Sum  15
  # f2  c1   16
  #     c2    8
  #     f1   28
  #     f2    0
  #     Sum  52
  # Sum c1   23
  #     c2   16
  #     f1   35
  #     f2   15
  #     Sum  89

And the last result:

 # Count for every complete four-value sequence.
 xtabs(nb ~ paste(V1, V2, V3, V4, sep = "-"), data = df2)
 # paste(V1, V2, V3, V4, sep = "-")
 # c1-c2-f1-f2 c1-c2-f2-f1 c1-f1-c2-f2 c1-f1-f2-c2 c1-f2-c2-f1 c1-f2-f1-c2
 #           0           2           1           1           4           2
 # c2-c1-f1-f2 c2-c1-f2-f1 c2-f1-c1-f2 c2-f1-f2-c1 c2-f2-c1-f1 c2-f2-f1-c1
 #           0           5           0           5           2           0
 # f1-c1-c2-f2 f1-c1-f2-c2 f1-c2-c1-f2 f1-c2-f2-c1 f1-f2-c1-c2 f1-f2-c2-c1
 #           1           1           2           4           3           4
 # f2-c1-c2-f1 f2-c1-f1-c2 f2-c2-c1-f1 f2-c2-f1-c1 f2-f1-c1-c2 f2-f1-c2-c1
 #          11           5           3           5          10          18

To see all this in a column:

 # Same full-sequence counts, coerced to a one-column matrix so that each
 # sequence prints on its own line.
 as.matrix(xtabs(nb ~ paste(V1, V2, V3, V4, sep = "-"), data = df2))
 #             [,1]
 # c1-c2-f1-f2    0
 # c1-c2-f2-f1    2
 # c1-f1-c2-f2    1
 # c1-f1-f2-c2    1
 # c1-f2-c2-f1    4
 # c1-f2-f1-c2    2
 # c2-c1-f1-f2    0
 # c2-c1-f2-f1    5
 # c2-f1-c1-f2    0
 # c2-f1-f2-c1    5
 # c2-f2-c1-f1    2
 # c2-f2-f1-c1    0
 # f1-c1-c2-f2    1
 # f1-c1-f2-c2    1
 # f1-c2-c1-f2    2
 # f1-c2-f2-c1    4
 # f1-f2-c1-c2    3
 # f1-f2-c2-c1    4
 # f2-c1-c2-f1   11
 # f2-c1-f1-c2    5
 # f2-c2-c1-f1    3
 # f2-c2-f1-c1    5
 # f2-f1-c1-c2   10
 # f2-f1-c2-c1   18

I believe that the "final answer" with all subtotals could be:

  # Three-way table (first position x second position x "third-fourth"
  # pair) with "Sum" margins on every dimension, flattened so that all three
  # variables are row variables.
  ftable(
    addmargins(
      xtabs(nb ~ V1 + V2 + paste(V3, V4, sep = "-"), data = df2)
    ),
    row.vars = 1:3
  )

However, it has so many zero entries that I hesitate to recommend it. You can drop the zero rows:

 # Flattened three-way table with "Sum" margins on every dimension.
 my.ftable <- ftable(
   addmargins(
     xtabs(nb ~ V1 + V2 + paste(V3, V4, sep = "-"), data = df2)
   ),
   row.vars = 1:3
 )

 # Long format: one row per (V1, V2, third-fourth pair) cell with its Freq.
 my.df.table <- as.data.frame(my.ftable)
 # The third column is the paste(V3, V4, ...) term; give it a short name.
 names(my.df.table)[3] <- "3rd_4th"

 # Keep only the cells (and sub-totals) with a non-zero count.
 my.df.table[my.df.table$Freq > 0, ]
 #      V1  V2 3rd_4th Freq
 # 14   f2  f1   c1-c2   10
 # 15  Sum  f1   c1-c2   10
 # 18   f1  f2   c1-c2    3
 # 20  Sum  f2   c1-c2    3
 # 23   f1 Sum   c1-c2    3
 # 24   f2 Sum   c1-c2   10
 # 25  Sum Sum   c1-c2   13
 # 34   f2  c2   c1-f1    3
 # 35  Sum  c2   c1-f1    3
 # 42   c2  f2   c1-f1    2
 # 45  Sum  f2   c1-f1    2
 # 47   c2 Sum   c1-f1    2
 # 49   f2 Sum   c1-f1    3
 # 50  Sum Sum   c1-f1    5
 # ... and many more rows, until ...
 # 321  c1 Sum     Sum   10
 # 322  c2 Sum     Sum   12
 # 323  f1 Sum     Sum   15
 # 324  f2 Sum     Sum   52
 # 325 Sum Sum     Sum   89

Combination selection, order and tree

More articles: