Using dplyr
and tidyr
(updated version of plyr
):
library(dplyr) library(tidyr) df %>% group_by(col1, col2) %>% summarise(n = n()) %>% mutate(m = min_rank(desc(n)), count = sum(n)) %>% filter(m <= 3 & (m != max(m) | m == 1)) %>% group_by(col1, m, count) %>% summarize(a = paste(col2, collapse = ',')) %>% spread(m, a, sep = '') %>% ungroup # # A tibble: 7 × 5 # col1 count m1 m2 m3 # * <fctr> <int> <chr> <chr> <chr> # 1 group1 12 4 3 2,6 # 2 group2 12 2 <NA> <NA> # 3 group3 12 4 5,7 <NA> # 4 group4 12 4,5 <NA> <NA> # 5 group5 12 4,5 <NA> <NA> # 6 group6 12 1,2,3,4 <NA> <NA> # 7 group7 16 16 14,42 <NA>
If you need this inside a function:
get.3modes.andcounts <- function(origtable, groupby, columnname) { origtable %>% group_by_(groupby, columnname) %>% summarise(n = n()) %>% mutate(r = min_rank(desc(n)), count = sum(n)) %>% filter(r <= 3 & (r != max(r) | r == 1)) %>% group_by_(groupby, 'r', 'count') %>% summarize_(a = paste0('paste(',columnname, ', collapse = ",")')) %>% spread(r, a, sep = '') %>% ungroup } get.3modes.andcounts(df, 'col1', 'col2') # # A tibble: 7 × 5 # col1 count m1 m2 m3 # * <fctr> <int> <chr> <chr> <chr> # 1 group1 12 4 3 2,6 # 2 group2 12 2 <NA> <NA> # 3 group3 12 4 5,7 <NA> # 4 group4 12 4,5 <NA> <NA> # 5 group5 12 4,5 <NA> <NA> # 6 group6 12 1,2,3,4 <NA> <NA> # 7 group7 16 16 14,42 <NA>
System.time
system.time(get.3modes.andcounts(df, 'col1', 'col2')) # user system elapsed # 0.012 0.000 0.011 benchmark(get.3modes.andcounts(df, 'col1', 'col2'), replications = 10, columns = c("test", "replications", "elapsed")) # test replications elapsed # 1 get.3modes.andcounts(df, "col1", "col2") 10 0.08 benchmark(get.3modes.andcounts(df, 'col1', 'col2'), replications = 100, columns = c("test", "replications", "elapsed")) # test replications elapsed # 1 get.3modes.andcounts(df, "col1", "col2") 100 0.684 benchmark(get.3modes.andcounts(df, 'col1', 'col2'), replications = 1000, columns = c("test", "replications", "elapsed")) # test replications elapsed # 1 get.3modes.andcounts(df, "col1", "col2") 1000 6.796
Data:
col2 <- c(4, 4, 4, 4, 5, 3, 3, 3, 2, 2, 6, 6, # group1 2 and 6 tied in the 3rd position, 5 less common 2, 2, 2, 4, 4, 3, 3, 2, 2, 2, 6, 6, # group2 4, 3 and 6 tied in the less common, excluded. 4, 4, 4, 7, 7, 7, 5, 5, 5, 4, 4, 6, # group3 4, 7 and 5 more common, 3 most common present, exclude everything else 4, 4, 4, 4, 5, 5, 5, 5, 2, 2, 6, 6, # group4 2 and 6 less common, excluded (4 AND 5 tied) 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, # group5 6 less common, excluded, (4 and 5 tied) 4, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, # group6 all tied 14,14,14,16,16,16,16,34,34,42,42,42,80,80,84,92) #group7 16, 14, 42 are the three most freq. col1 <- paste(c(rep("group1", 12), rep("group2", 12), rep("group3", 12), rep("group4", 12), rep("group5", 12), rep("group6", 12), rep("group7", 16)), sep = ", ") df <- data.frame(col1=col1,col2=col2)