Get the percentage of people for each pair

I have a puzzleclient data frame and the type of item to which they belong. A client can appear several times in the list if it has several elements.

name    type
m1       A
m10      A
m2       A
m9       A
m9       B
m4       B
m5       B
m1       C
m2       C
m3       C
m4       C
m5       C
m6       C
m7       C
m8       C
m1       D
m5       D

I would like to calculate what percentage of people who own "A" also have "B", etc.

Based on the above input, how can I get this output using R:

    A     B      C      D      TOTAL
A   1     0.25   0.5    0.25    4
B   0.33  1      0.67   0.33    3
C   0.25  0.25   1      0.25    8
D   0.5   0.5    1      1       2

Many thanks for your help!


Here is a long and manual way to do this, without any loops or advanced functions (but, of course, this is the lost potential in R):

Example for element A: -

puzzleA <- subset(puzzle, type == 'A')

Calculation of customers who own A, who also own B: -

length(unique((merge(puzzleA, puzzleB, by = 'name'))$name))/length(unique(puzzleA$name)

Data

puzzle <- structure(list(name = c("m1", "m10", "m2", "m9", "m9", "m4", 
          "m5", "m1", "m2", "m3", "m4", "m5", "m6", "m7", "m8", "m1", "m5"
          ), type = c("A", "A", "A", "A", "B", "B", "B", "C", "C", "C", 
          "C", "C", "C", "C", "C", "D", "D")), .Names = c("name", "type"
          ), class = "data.frame", row.names = c(NA, -17L))
+4
source share
4 answers

, :

library(arules)
trans <- as(lapply(split(puzzle[2], puzzle[1]), unlist, F, F), "transactions")
rules <- apriori(trans, parameter = list(support=0, minlen=2, maxlen=2, conf=0))
res <- data.frame(
  lhs = labels(lhs(rules)), 
  rhs = labels(rhs(rules)), 
  value = round(rules@quality$confidence, 2)
)
res <- reshape2::dcast(res, lhs~rhs, fill = 1)
res$total <- rowSums(trans@data)
res
#   lhs  {A}  {B}  {C}  {D} total
# 1 {A} 1.00 0.25 0.50 0.25     4
# 2 {B} 0.33 1.00 0.67 0.33     3
# 3 {C} 0.25 0.25 1.00 0.25     8
# 4 {D} 0.50 0.50 1.00 1.00     2 
+3

merge/table. merge by "", , table ('tbl'), 'tbl' cbind .

tbl <- table(merge(puzzle, puzzle, by = "name")[-1])
cbind(round(tbl/diag(tbl),2), TOTAL= diag(tbl))
#     A    B    C    D TOTAL
#A 1.00 0.25 0.50 0.25     4
#B 0.33 1.00 0.67 0.33     3
#C 0.25 0.25 1.00 0.25     8
#D 0.50 0.50 1.00 1.00     2
+3

Akrun, , . .

tab <- crossprod(table(puzzle))
cbind(tab / diag(tab), total=diag(tab))

#           A    B         C         D total
# A 1.0000000 0.25 0.5000000 0.2500000     4
# B 0.3333333 1.00 0.6666667 0.3333333     3
# C 0.2500000 0.25 1.0000000 0.2500000     8
# D 0.5000000 0.50 1.0000000 1.0000000     2
+3

: %in% .

## separate out people by type
lst <- with(puzzle, split(name, type))

#List of 4
# $ A: chr [1:4] "m1" "m10" "m2" "m9"
# $ B: chr [1:3] "m9" "m4" "m5"
# $ C: chr [1:8] "m1" "m2" "m3" "m4" ...
# $ D: chr [1:2] "m1" "m5"

## pairwise intersect (a matrix of list)
pair_intersect <- outer(lst, lst, Vectorize(intersect))

#  A           B           C           D          
#A Character,4 "m9"        Character,2 "m1"       
#B "m9"        Character,3 Character,2 "m5"       
#C Character,2 Character,2 Character,8 Character,2
#D "m1"        "m5"        Character,2 Character,2

## count number of people in each pair
count <- matrix(lengths(pair_intersect), nrow = length(lst),
                dimnames = dimnames(pair_intersect))

#  A B C D
#A 4 1 2 1
#B 1 3 2 1
#C 2 2 8 2
#D 1 1 2 2

## conditional percentage
conditional_percent <- count / diag(count)

#          A    B         C         D
#A 1.0000000 0.25 0.5000000 0.2500000
#B 0.3333333 1.00 0.6666667 0.3333333
#C 0.2500000 0.25 1.0000000 0.2500000
#D 0.5000000 0.50 1.0000000 1.0000000

,

final <- cbind(conditional_percent, Total = diag(count))

#          A    B         C         D Total
#A 1.0000000 0.25 0.5000000 0.2500000     4
#B 0.3333333 1.00 0.6666667 0.3333333     3
#C 0.2500000 0.25 1.0000000 0.2500000     8
#D 0.5000000 0.50 1.0000000 1.0000000     2
+2
source

Source: https://habr.com/ru/post/1654973/


All Articles