Using dplyr in a for loop in R

I have an example df data, for example:

issue_1  issue_2  issue_3  check  cat_1  cat_2  cat_3
  a          -        -       0     1       0     0 
  -          b        -       1     0       1     0
  -          -        c       1     0       0     1
  p          -        -       0     1       0     0
  -          -        q       1     0       0     1
  -          r        -       0     0       1     0
  a          -        -       1     1       0     0
  a          b        -       1     1       1     0

To explain: each row records an event under issue_1, issue_2, and/or issue_3 ("-" means no event for that issue), and for each row the check column is either 0 or 1.

I need to calculate, for each issue column, the total number of events for each value, and how many of those events have check equal to 1. So, in this sample, for issue_1 the value a occurs 3 times, 2 of them with check = 1, and the value p occurs once, with no check = 1. The same applies to the other two issues.

I used a nested loop, but instead of counting at a grouped level, it gives the total number of rows. Can anyone suggest a better way?

Code example:

# Question's attempt: for every (issue column, category column) pair, compute
# per-issue-value counts and write them to a CSV. `df` is the example data
# frame shown above. As posted, this does not produce per-group totals (see
# the notes on the `mutate()` calls below).

# Names of the issue columns and of the category indicator columns.
abc <- c('issue_1', 'issue_2', 'issue_3')
qwe <- c('cat_1', 'cat_2', 'cat_3')
for(i in abc){
  for(j in qwe){
    # Keep only the current issue column, the current category column and `check`.
    temp <- df[, c(i, j, 'check')]
    # Keep the rows where the current category indicator is non-zero.
    temp <- subset(temp, temp[[j]] != 0)
    # Grouping by the expression `temp[[i]]` adds a grouping column literally
    # named `temp[[i]]` instead of grouping by the column whose name is in `i`.
    temp <- temp %>%
        group_by(temp[[i]]) %>%
        # `temp` is not a column, so `temp[[i]]` here is looked up in the calling
        # environment and is the whole (ungrouped) column: its length is the
        # total row count, not the size of the current group.
        # NOTE(review): as posted, the `mutate(` on the next line is never
        # closed, so the parentheses in this pipeline are unbalanced.
        mutate(total_issue = length(temp[[i]]) %>%
        # `check` is a column, so this counts the rows with check == 1.
        mutate(check_again = length(check[check == 1])) %>%
        mutate(percentage = (check_again/total_issue)*100)
    # Keep the first row for each distinct value of the issue column.
    temp <- subset(temp, !(duplicated(temp[[i]])))
    temp <- temp[, c(i, 'total_issue', 'check_again', 'percentage')] 
    # The variable name depends only on `i` (e.g. issue_1_stats), so every
    # iteration over `j` overwrites the result of the previous one.
    assign(paste(i, 'stats', sep = '_'), temp)
    # paste() separates its arguments with a space by default, so the file
    # name contains spaces (e.g. "path issue_1 cat_1 _stats .csv").
    write.csv(temp, paste('path', i, j, '_stats', '.csv'))
  }
}

So, for this, for issue_1 and cat_1, it should give:

issue_1   total_issue   check_again   percentage
  a            3             2           2/3*100
  p            1             0           0
0
2

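If I understand correctly, you do not need a loop for this. First reshape the issue columns into long format with melt(), keeping check as the id variable. Then drop the rows whose value is "-". Finally, group by variable and value, and for each group (i.e. each value of each issue) count the rows and sum check.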

library(reshape2)
library(dplyr)

# Stack the issue columns (columns 1-3) into long form next to `check`,
# giving one row per (issue column, issue value) pair; drop the "-"
# placeholders; then summarise every value of every issue column.
mydf[, 1:4] %>%
  melt(id.vars = "check") %>%
  filter(value != "-") %>%
  group_by(variable, value) %>%
  summarise(
    # Number of rows carrying this value of this issue.
    total = n(),
    # Sum of the 0/1 `check` flag within the group.
    check = sum(check),
    # Uses the summed `check` computed on the line above, not the raw column.
    percent = check / total * 100
  )

#  variable value total check   percent
#    (fctr) (chr) (int) (int)     (dbl)
#1  issue_1     a     3     2  66.66667
#2  issue_1     p     1     0   0.00000
#3  issue_2     b     2     2 100.00000
#4  issue_2     r     1     0   0.00000
#5  issue_3     c     1     1 100.00000
#6  issue_3     q     1     1 100.00000

DATA

# Example data: three issue columns stored as factors ("-" marks "no issue"),
# a 0/1 `check` flag, and three 0/1 category indicator columns. All numeric
# columns are integers; there are 8 rows with default row names.
mydf <- data.frame(
  issue_1 = factor(
    c("a", "-", "-", "p", "-", "-", "a", "a"),
    levels = c("-", "a", "p")
  ),
  issue_2 = factor(
    c("-", "b", "-", "-", "-", "r", "-", "b"),
    levels = c("-", "b", "r")
  ),
  issue_3 = factor(
    c("-", "-", "c", "-", "q", "-", "-", "-"),
    levels = c("-", "c", "q")
  ),
  check = c(0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L),
  cat_1 = c(1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L),
  cat_2 = c(0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L),
  cat_3 = c(0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L)
)
+2

dplyr:

library(dplyr)

# Parse the example table from an inline string. "-" is read as NA, so the
# issue columns become character vectors with missing values; the numeric
# columns are read as integers.
# The string is passed via `text =` rather than through an explicit
# textConnection(): read.table() does not close a connection that is already
# open when it receives it, so the original form left the connection open.
dfX <- read.table(
  text = "
issue_1  issue_2  issue_3  check  cat_1  cat_2  cat_3
a          -        -       0     1       0     0
-          b        -       1     0       1     0
-          -        c       1     0       0     1
p          -        -       0     1       0     0
-          -        q       1     0       0     1
-          r        -       0     0       1     0
a          -        -       1     1       0     0
a          b        -       1     1       1     0
",
  header = TRUE, na.strings = "-", stringsAsFactors = FALSE)


# Summarise issue_1: for each value, the number of rows, the number of rows
# with check == 1 (check is a 0/1 flag), and that share as a percentage.
# Rows without an issue_1 value were read as NA (na.strings = "-"); they are
# dropped first, otherwise group_by() would report them as an extra NA group.
dfX %>%
  filter(!is.na(issue_1)) %>%
  group_by(issue_1) %>%
  summarize(total_issues = n(),
            check_again = sum(check),
            percentage = 100 * (check_again / total_issues))
+1

Source: https://habr.com/ru/post/1659900/


All Articles