Combination selection, order and tree

I have the following data, which represents people's choice sequences among four values (f1, f2, c1, c2):

# Observed choice sequences (`combi`) and the number of times each one was
# chosen (`nb`). The first factor level ("") is never used by any row.
df <- structure(
  list(
    combi = structure(
      c(
        24L, 8L, 3L, 19L, 4L, 23L, 15L, 12L, 14L, 22L, 5L, 13L,
        18L, 9L, 2L, 25L, 11L, 7L, 21L, 10L, 6L, 17L, 20L, 16L
      ),
      levels = c(
        "",
        "c1-c2-f1-f2", "c1-c2-f2-f1", "c1-f1-c2-f2", "c1-f1-f2-c2",
        "c1-f2-c2-f1", "c1-f2-f1-c2", "c2-c1-f1-f2", "c2-c1-f2-f1",
        "c2-f1-c1-f2", "c2-f1-f2-c1", "c2-f2-c1-f1", "c2-f2-f1-c1",
        "f1-c1-c2-f2", "f1-c1-f2-c2", "f1-c2-c1-f2", "f1-c2-f2-c1",
        "f1-f2-c1-c2", "f1-f2-c2-c1", "f2-c1-c2-f1", "f2-c1-f1-c2",
        "f2-c2-c1-f1", "f2-c2-f1-c1", "f2-f1-c1-c2", "f2-f1-c2-c1"
      ),
      class = "factor"
    ),
    nb = c(
      10L, 0L, 2L, 4L, 1L, 5L, 1L, 2L, 1L, 3L, 1L, 0L,
      3L, 5L, 0L, 18L, 5L, 2L, 5L, 0L, 4L, 4L, 11L, 2L
    )
  ),
  names = c("combi", "nb"),
  class = "data.frame",
  row.names = c(
    1L, 3L, 5L, 7L, 9L, 11L, 13L, 15L, 17L, 19L, 21L, 23L,
    25L, 27L, 29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L
  )
)

I am wondering whether there is a tree view (or some other representation) that gives, for each step of the sequence, the number of selections, grouping chains by the leading sub-chain they have in common. Example:

 f2 (52)
   - f1 (28)
       - c1-c2 (10)
       - c2-c1 (18)

Here `f2 (52)` means that 52 chains start with f2, and `f1 (28)` means that 28 of those chains start with f2-f1.

Thank you very much.

+5
source share
3 answers

If you read the `combi` values back in as text (using `as.character`), you can split them into separate character columns:

 # Split each "a-b-c-d" chain on "-" into one character column per position
 # (used below as V1..V4) and append those columns to the original data.
 df2 <- cbind(
   df,
   read.table(
     text = as.character(df$combi),
     sep = "-",
     stringsAsFactors = FALSE
   )
 )

Then you can tabulate at any level you want:

  # First level only: total count of chains per first choice.
  xtabs(nb ~ V1, data = df2)
  #> V1
  #> c1 c2 f1 f2
  #> 10 12 15 52

  # First and second levels: total count per leading pair of choices.
  xtabs(nb ~ paste(V1, V2, sep = "-"), data = df2)
  #> paste(V1, V2, sep = "-")
  #> c1-c2 c1-f1 c1-f2 c2-c1 c2-f1 c2-f2 f1-c1 f1-c2 f1-f2 f2-c1 f2-c2 f2-f1
  #>     2     2     6     5     5     2     2     6     7    16     8    28

You can also use the `addmargins` function to compactly display the first two positions together with their subtotals:

  # Cross-tabulate first choice (rows) by second choice (columns) and add
  # row/column totals.
  addmargins(
    xtabs(nb ~ V1 + V2, data = df2)
  )
  #>      V2
  #> V1    c1 c2 f1 f2 Sum
  #>   c1   0  2  2  6  10
  #>   c2   5  0  5  2  12
  #>   f1   2  6  0  7  15
  #>   f2  16  8 28  0  52
  #>   Sum 23 16 35 15  89

This can be flattened using `ftable`:

  # Same two-level table with totals, flattened so that both levels become
  # row variables (one count per row).
  ftable(
    addmargins(xtabs(nb ~ V1 + V2, data = df2)),
    row.vars = 1:2
  )
  #> V1  V2
  #> c1  c1    0
  #>     c2    2
  #>     f1    2
  #>     f2    6
  #>     Sum  10
  #> c2  c1    5
  #>     c2    0
  #>     f1    5
  #>     f2    2
  #>     Sum  12
  #> f1  c1    2
  #>     c2    6
  #>     f1    0
  #>     f2    7
  #>     Sum  15
  #> f2  c1   16
  #>     c2    8
  #>     f1   28
  #>     f2    0
  #>     Sum  52
  #> Sum c1   23
  #>     c2   16
  #>     f1   35
  #>     f2   15
  #>     Sum  89

And the last result:

 # All four levels: total count per complete chain.
 xtabs(
   nb ~ paste(V1, V2, V3, V4, sep = "-"),
   data = df2
 )
 #> paste(V1, V2, V3, V4, sep = "-")
 #> c1-c2-f1-f2 c1-c2-f2-f1 c1-f1-c2-f2 c1-f1-f2-c2 c1-f2-c2-f1 c1-f2-f1-c2 c2-c1-f1-f2 c2-c1-f2-f1
 #>           0           2           1           1           4           2           0           5
 #> c2-f1-c1-f2 c2-f1-f2-c1 c2-f2-c1-f1 c2-f2-f1-c1 f1-c1-c2-f2 f1-c1-f2-c2 f1-c2-c1-f2 f1-c2-f2-c1
 #>           0           5           2           0           1           1           2           4
 #> f1-f2-c1-c2 f1-f2-c2-c1 f2-c1-c2-f1 f2-c1-f1-c2 f2-c2-c1-f1 f2-c2-f1-c1 f2-f1-c1-c2 f2-f1-c2-c1
 #>           3           4          11           5           3           5          10          18

To see all this in a column:

 # Same four-level counts, shown as a one-column matrix (one chain per row).
 as.matrix(
   xtabs(nb ~ paste(V1, V2, V3, V4, sep = "-"), data = df2)
 )
 #>             [,1]
 #> c1-c2-f1-f2    0
 #> c1-c2-f2-f1    2
 #> c1-f1-c2-f2    1
 #> c1-f1-f2-c2    1
 #> c1-f2-c2-f1    4
 #> c1-f2-f1-c2    2
 #> c2-c1-f1-f2    0
 #> c2-c1-f2-f1    5
 #> c2-f1-c1-f2    0
 #> c2-f1-f2-c1    5
 #> c2-f2-c1-f1    2
 #> c2-f2-f1-c1    0
 #> f1-c1-c2-f2    1
 #> f1-c1-f2-c2    1
 #> f1-c2-c1-f2    2
 #> f1-c2-f2-c1    4
 #> f1-f2-c1-c2    3
 #> f1-f2-c2-c1    4
 #> f2-c1-c2-f1   11
 #> f2-c1-f1-c2    5
 #> f2-c2-c1-f1    3
 #> f2-c2-f1-c1    5
 #> f2-f1-c1-c2   10
 #> f2-f1-c2-c1   18

I believe that the "final answer" with all subtotals could be:

  # Three-way table (first choice, second choice, remaining pair) with
  # subtotals at every level, flattened so all three become row variables.
  ftable(
    addmargins(
      xtabs(nb ~ V1 + V2 + paste(V3, V4, sep = "-"), data = df2)
    ),
    row.vars = 1:3
  )

However, it has so many zero entries that I hesitate to recommend it. You can drop the zero rows:

 # Three-level table with subtotals, flattened as before.
 my.ftable <- ftable(
   addmargins(xtabs(nb ~ V1 + V2 + paste(V3, V4, sep = "-"), data = df2)),
   row.vars = 1:3
 )

 # Convert to a long data frame and give the third column a readable name.
 my.df.table <- as.data.frame(my.ftable)
 names(my.df.table)[3] <- "3rd_4th"

 # Keep only the rows with a non-zero count.
 my.df.table[my.df.table$Freq > 0, ]
 #>      V1  V2 3rd_4th Freq
 #> 14   f2  f1   c1-c2   10
 #> 15  Sum  f1   c1-c2   10
 #> 18   f1  f2   c1-c2    3
 #> 20  Sum  f2   c1-c2    3
 #> 23   f1 Sum   c1-c2    3
 #> 24   f2 Sum   c1-c2   10
 #> 25  Sum Sum   c1-c2   13
 #> 34   f2  c2   c1-f1    3
 #> 35  Sum  c2   c1-f1    3
 #> 42   c2  f2   c1-f1    2
 #> 45  Sum  f2   c1-f1    2
 #> 47   c2 Sum   c1-f1    2
 #> 49   f2 Sum   c1-f1    3
 #> 50  Sum Sum   c1-f1    5
 #> # and many more rows
 #> #... until
 #> 321  c1 Sum     Sum   10
 #> 322  c2 Sum     Sum   12
 #> 323  f1 Sum     Sum   15
 #> 324  f2 Sum     Sum   52
 #> 325 Sum Sum     Sum   89
+4
source

It may not be exactly what you mean by "tree structure", but it gives you the numbers in a table using base R. It should be easy to format this result however you like.

 # Observed choice sequences (`combi`) and the number of times each one was
 # chosen (`nb`).
 df <- structure(
   list(
     combi = structure(
       c(
         24L, 8L, 3L, 19L, 4L, 23L, 15L, 12L, 14L, 22L, 5L, 13L,
         18L, 9L, 2L, 25L, 11L, 7L, 21L, 10L, 6L, 17L, 20L, 16L
       ),
       .Label = c(
         "",
         "c1-c2-f1-f2", "c1-c2-f2-f1", "c1-f1-c2-f2", "c1-f1-f2-c2",
         "c1-f2-c2-f1", "c1-f2-f1-c2", "c2-c1-f1-f2", "c2-c1-f2-f1",
         "c2-f1-c1-f2", "c2-f1-f2-c1", "c2-f2-c1-f1", "c2-f2-f1-c1",
         "f1-c1-c2-f2", "f1-c1-f2-c2", "f1-c2-c1-f2", "f1-c2-f2-c1",
         "f1-f2-c1-c2", "f1-f2-c2-c1", "f2-c1-c2-f1", "f2-c1-f1-c2",
         "f2-c2-c1-f1", "f2-c2-f1-c1", "f2-f1-c1-c2", "f2-f1-c2-c1"
       ),
       class = "factor"
     ),
     nb = c(
       10L, 0L, 2L, 4L, 1L, 5L, 1L, 2L, 1L, 3L, 1L, 0L,
       3L, 5L, 0L, 18L, 5L, 2L, 5L, 0L, 4L, 4L, 11L, 2L
     )
   ),
   .Names = c("combi", "nb"),
   class = "data.frame",
   row.names = c(
     1L, 3L, 5L, 7L, 9L, 11L, 13L, 15L, 17L, 19L, 21L, 23L,
     25L, 27L, 29L, 31L, 33L, 35L, 37L, 39L, 41L, 43L, 45L, 47L
   )
 )

 # Split every chain into its four successive choices, one column per step.
 # strsplit() is already vectorised, so no sapply() wrapper is needed and the
 # resulting matrix carries no row names that would have to be cleared.
 steps <- do.call(rbind, strsplit(as.character(df$combi), split = "-", fixed = TRUE))
 colnames(steps) <- paste0("str", 1:4)
 tmp <- data.frame(df, steps)

 # Steps 3 and 4 are reported together as a single "third level".
 tmp$str3 <- paste(tmp$str3, tmp$str4, sep = "-")

 # Sum the counts at each depth of the chain.
 str1 <- aggregate(list(nb_str1 = tmp[, "nb"]), tmp["str1"], sum)
 str2 <- aggregate(list(nb_str2 = tmp[, "nb"]), tmp[c("str1", "str2")], sum)
 str3 <- aggregate(list(nb_str3 = tmp[, "nb"]), tmp[c("str1", "str2", "str3")], sum)

 # Attach the level-1 and level-2 totals to every full chain; merge() joins on
 # the columns the two tables have in common.
 tmp <- merge(str3, str1)
 tmp <- merge(tmp, str2)
 tmp <- tmp[, c("str1", "nb_str1", "str2", "nb_str2", "str3", "nb_str3")]
 tmp
 #>    str1 nb_str1 str2 nb_str2  str3 nb_str3
 #> 1    c1      10   c2       2 f1-f2       0
 #> 2    c1      10   c2       2 f2-f1       2
 #> 3    c1      10   f1       2 c2-f2       1
 #> 4    c1      10   f1       2 f2-c2       1
 #> 5    c1      10   f2       6 c2-f1       4
 #> 6    c1      10   f2       6 f1-c2       2
 #> 7    c2      12   c1       5 f1-f2       0
 #> 8    c2      12   c1       5 f2-f1       5
 #> 9    c2      12   f1       5 c1-f2       0
 #> 10   c2      12   f1       5 f2-c1       5
 #> 11   c2      12   f2       2 c1-f1       2
 #> 12   c2      12   f2       2 f1-c1       0
 #> 13   f1      15   c1       2 c2-f2       1
 #> 14   f1      15   c1       2 f2-c2       1
 #> 15   f1      15   c2       6 c1-f2       2
 #> 16   f1      15   c2       6 f2-c1       4
 #> 17   f1      15   f2       7 c1-c2       3
 #> 18   f1      15   f2       7 c2-c1       4
 #> 19   f2      52   c1      16 c2-f1      11
 #> 20   f2      52   c1      16 f1-c2       5
 #> 21   f2      52   c2       8 c1-f1       3
 #> 22   f2      52   c2       8 f1-c1       5
 #> 23   f2      52   f1      28 c1-c2      10
 #> 24   f2      52   f1      28 c2-c1      18

Created on 2018-03-15 by the reprex package (v0.2.0).

+4
source

The `data.tree` package specializes in tree views. It is based on partitioning variables in a hierarchical order, for example world → continent → country → city. In your case, you mentioned every ordering of c1, c2, f1 and f2. You will probably need to build four trees, one per starting value: for example, c1 → c2, f1 or f2, each of which leads to the two orderings of the remaining unused values, and then plot them.

A basic example starting with c1 and then branching off, without including the specific counts:

 library(data.tree)

 # Root of the tree: every chain below starts with "c1".
 c1 <- Node$new("c1")

 # Each second choice branches off the root and leads to the two possible
 # orderings of the remaining values. The results of AddChild() are assigned
 # to descriptive names so each handle refers to exactly one node.
 c1_c2 <- c1$AddChild("c2")            # 2nd-level chain "c2", off c1
 c1_c2_f1f2 <- c1_c2$AddChild("f1-f2") # leaf off c2
 c1_c2_f2f1 <- c1_c2$AddChild("f2-f1") # leaf off c2

 c1_f1 <- c1$AddChild("f1")            # 2nd-level chain "f1", off c1
 c1_f1_c2f2 <- c1_f1$AddChild("c2-f2") # leaf off f1
 c1_f1_f2c2 <- c1_f1$AddChild("f2-c2") # leaf off f1

 c1_f2 <- c1$AddChild("f2")            # 2nd-level chain "f2", off c1
 c1_f2_c2f1 <- c1_f2$AddChild("c2-f1") # leaf off f2
 c1_f2_f1c2 <- c1_f2$AddChild("f1-c2") # leaf off f2

 print(c1)
 #>         levelName
 #> 1  c1
 #> 2   ¦--c2
 #> 3   ¦   ¦--f1-f2
 #> 4   ¦   °--f2-f1
 #> 5   ¦--f1
 #> 6   ¦   ¦--c2-f2
 #> 7   ¦   °--f2-c2
 #> 8   °--f2
 #> 9       ¦--c2-f1
 #> 10      °--f1-c2

 plot(c1)

enter image description here

+3
source

Source: https://habr.com/ru/post/1275866/


All Articles