Statistics of several identically named columns

I have a huge set of data with multiple columns, such as x1, x2, x3...... x25, y1, y2, y3...... y50, z1, z2....... z10etc, which looks something like this:

x1  x2  x3  x4  y1  y2  y3  
1   2   1   2   1   1   2   
2   1   1   1   3   1   1
1   2   2   1   1   2   1

I want to:

x_mean  x_min  x_max  x_mad  y_mean  y_min  y_max  y_mad
  1.5     1      2     0.74    2       1      2      0 
  1.25    1      2       0     2       1      2      0 
  1.5     1      2     0.74    2       1      2      0 

Basically, I need to calculate min, max, mad(mean absolute deviation) andmean

> x_mean = (x1+x2+x3+x4)/4 = (1+2+1+2)/4 

and similarly for other lines and other characteristics. How to do this in R, preferably in dplyr?

+4
source share
3 answers

, summarise summarise_all, funs . , tidy ( ), :

library(tidyverse)

df <- read.table(text = 'x1  x2  x3  x4  y1  y2  y3  
1   2   1   2   1   1   2   
2   1   1   1   3   1   1
1   2   2   1   1   2   1', head = TRUE)

df_tidy <- df %>% 
    mutate(row = row_number()) %>%       # keep position info
    gather(var, val, -row) %>%           # reshape to long
    mutate(var = sub('\\d', '', var))    # extract letters from former colnames

df_summary <- df_tidy %>%
    group_by(var, row) %>%                # group by variable and original row
    summarise_all(funs(min, max, mad))    # summarize with various functions

df_summary
#> # A tibble: 6 x 5
#> # Groups:   var [?]
#>     var   row   min   max    mad
#>   <chr> <int> <dbl> <dbl>  <dbl>
#> 1     x     1     1     2 0.7413
#> 2     x     2     1     2 0.0000
#> 3     x     3     1     2 0.7413
#> 4     y     1     1     2 0.0000
#> 5     y     2     1     3 0.0000
#> 6     y     3     1     2 0.0000

, , -, , .

+6

, tidyverse, R,

#create a function to do whatever calculations needed,
f1 <- function(d){
  mean1 <- rowMeans(d)
  min1 <- do.call(pmin, d)
  max1 <- do.call(pmax, d)
  mad1 <- apply(d, 1, mad)
  return(data.frame(mean1, min1, max1, mad1))
}

#apply it to your data frame based on the unique name letters,
ind <- unique(sub('\\d+', '', names(df)))
setNames(lapply(ind, function(i) f1(df[grepl(i, names(df))])), ind)
#$x
#  mean1 min1 max1   mad1
#1  1.50    1    2 0.7413
#2  1.25    1    2 0.0000
#3  1.50    1    2 0.7413

#$y
#     mean1 min1 max1 mad1
#1 1.333333    1    2    0
#2 1.666667    1    3    0
#3 1.333333    1    2    0

do.call(cbind, setNames(lapply(ind, function(i) f1(df[grepl(i, names(df))])), ind))
#  x.mean1 x.min1 x.max1 x.mad1  y.mean1 y.min1 y.max1 y.mad1
#1    1.50      1      2 0.7413 1.333333      1      2      0
#2    1.25      1      2 0.0000 1.666667      1      3      0
#3    1.50      1      2 0.7413 1.333333      1      2      0
+2

row.. matrixStats

library(matrixStats)
do.call(cbind, lapply(split.default(df1, sub("\\d+", "", names(df1))), function(x) {
   x1 <- as.matrix(x)
  data.frame(mean = rowMeans(x1), min = rowMins(x1), max = rowMaxs(x1), mad = rowMads(x1))}))
#  x.mean x.min x.max  x.mad   y.mean y.min y.max y.mad
#1   1.50     1     2 0.7413 1.333333     1     2     0
#2   1.25     1     2 0.0000 1.666667     1     3     0
#3   1.50     1     2 0.7413 1.333333     1     2     0
+1

Source: https://habr.com/ru/post/1679523/


All Articles