How to calculate a table of pairwise counts from a long-format data frame

I have a "long" data frame with columns id (primary key) and featureCode (categorical variable). Each id has between 1 and 9 featureCode values. For instance:

id  featureCode
5   PPLC
5   PCLI
6   PPLC
6   PCLI
7   PPL
7   PPLC
7   PCLI
8   PPLC
9   PPLC
10  PPLC

I would like to calculate the number of times each feature code is used together with each other feature code (the "pairwise counts" of the title). At this stage, the order in which the feature codes appear is not important. I assume that the result will be a different data frame, where rows and columns are feature codes and cells are counts. For instance:

      PPLC  PCLI  PPL
PPLC  0     3     1
PCLI  3     0     1
PPL   1     1     0

Unfortunately, I don't know how to perform this calculation, and I drew a blank when searching for advice (mostly, I suspect, because I don't know the correct terminology).

+3
4

Here is a data.table approach, similar to @mrdwab's.

It assumes featureCode is (or has been converted to) character.

library(data.table)

DT <- data.table(dat)
# convert to character
DT[, featureCode := as.character(featureCode)]
# keep only ids with more than one feature code (a pair needs two codes)
DT2 <- DT[, N := .N, by = id][N > 1]
# For each id, build every 2-element combination of its feature codes.
# The codes are sorted first so that each pair always appears in the same
# order; without this, A/B and B/A would be counted as different pairs
# whenever the rows of two ids are ordered differently.
# combn() returns a 2-row matrix: row 1 is the first code of each pair,
# row 2 the second; they become the columns `V1` and `V2`.
pair_dt <- DT2[, {
  pair_mat <- combn(sort(featureCode), 2)
  list(V1 = pair_mat[1, ], V2 = pair_mat[2, ])
}, by = id]
# then count the number of rows (ids) in each pair group
pair_dt[, .N, by = list(V1, V2)]
#      V1   V2 N
# 1: PCLI PPLC 3
# 2: PCLI  PPL 1
# 3:  PPL PPLC 1
+5

, , :

:

# Sample data in long format: one row per (id, featureCode) combination.
# Ids 5-7 carry several feature codes; ids 8-10 carry only one.
dat <- data.frame(
  id = c(5L, 5L, 6L, 6L, 7L, 7L, 7L, 8L, 9L, 10L),
  featureCode = c("PPLC", "PCLI", "PPLC", "PCLI", "PPL",
                  "PPLC", "PCLI", "PPLC", "PPLC", "PPLC")
)

Keep only the ids that have more than one featureCode:

# Count the rows belonging to each id, look that count up for every row,
# and keep the rows whose id occurs more than once (a single feature code
# cannot form a pair).
rows_per_id <- tapply(dat$id, dat$id, length)
dat2 <- dat[as.vector(rows_per_id[as.character(dat$id)] > 1), ]

Split the feature codes by id, so that lapply can process each id separately.

# Turn the filtered data frame into a list with one element per id, each
# element holding that id's feature codes.
dat2 <- with(dat2, split(featureCode, id))

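For each id, sort the codes, generate all combinations of two, paste each pair into a single label, and tabulate the labels.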

# For every id: sort its feature codes so each pair has a single spelling,
# then paste every 2-element combination into an "A+B" label.
pair_labels <- lapply(dat2, function(codes) {
  combn(sort(codes), 2, FUN = paste, collapse = "+")
})
# Tabulate the labels across all ids.
table(unlist(pair_labels))
# 
#  PCLI+PPL PCLI+PPLC  PPL+PPLC 
#         1         3         1

:

Following @flodel's suggestion, igraph can produce the result in matrix form (install it with install.packages("igraph")).

library(igraph)

# Keep ids with at least two feature codes, then group the codes by id.
dat2 <- dat[ave(dat$id, dat$id, FUN = length) > 1, ]
dat2 <- split(dat2$featureCode, dat2$id)

# Every 2-element combination of codes within an id becomes one edge.
# unlist() flattens the pairs in order, so filling a 2-column matrix by
# row puts one pair on each row.
pair_list <- lapply(dat2, function(x) {
  combn(as.character(x), 2, simplify = FALSE)
})
edges <- matrix(unlist(pair_list), ncol = 2, byrow = TRUE)

# graph_from_edgelist() / as_adjacency_matrix() are the current names of
# the deprecated graph.edgelist() / get.adjacency().
# The graph is undirected, so the order within a pair does not matter;
# repeated edges show up as counts in the adjacency matrix.
g <- graph_from_edgelist(edges, directed = FALSE)
as_adjacency_matrix(g)
# 3 x 3 sparse Matrix of class "dgCMatrix"
#      PPLC PCLI PPL
# PPLC    .    3   1
# PCLI    3    .   1
# PPL     1    1   .
+2

This can also be done with SQL; in R, use the sqldf package.

:

library(sqldf)

# List every unordered pair of distinct feature codes. The `<` filter drops
# self-pairs (A, A) and keeps only one of (A, B) / (B, A); the aliases give
# the two result columns different names.
# NOTE(review): assumes the long-format data frame is available as `df`.
sqldf("select distinct df1.featureCode as code1, df2.featureCode as code2
       from df df1, df df2
       where df1.featureCode < df2.featureCode
       ")

:
(, for )

PCLI - PPLC

# Self-join `df` on id and count the row pairs in which one row is PCLI
# and the other is PPLC.
pcli_pplc_query <- "select count(df1.id)
       from df df1, df df2
       where df1.id = df2.id
       and df1.featureCode = 'PCLI' and df2.featureCode = 'PPLC'
       "
sqldf(pcli_pplc_query)

PPLC - PPL

# Self-join `df` on id and count the row pairs in which one row is PPLC
# and the other is PPL.
pplc_ppl_query <- "select count(df1.id)
       from df df1, df df2
       where df1.id = df2.id
       and df1.featureCode = 'PPLC' and df2.featureCode = 'PPL'
       "
sqldf(pplc_ppl_query)

PCLI - PPL

# Self-join `df` on id and count the row pairs in which one row is PCLI
# and the other is PPL.
pcli_ppl_query <- "select count(df1.id)
       from df df1, df df2
       where df1.id = df2.id
       and df1.featureCode = 'PCLI' and df2.featureCode = 'PPL'
       "
sqldf(pcli_ppl_query)

, , . , .

+1

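Another way is to treat the data as a bipartite graph linking ids to feature codes, and project that graph onto "featureCode". With igraph: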

# Sample data in long format: one row per (id, featureCode) combination,
# with featureCode kept as character rather than factor.
dat <- data.frame(
  id = c(5L, 5L, 6L, 6L, 7L, 7L, 7L, 8L, 9L, 10L),
  featureCode = c("PPLC", "PCLI", "PPLC", "PCLI", "PPL",
                  "PPLC", "PCLI", "PPLC", "PPLC", "PPLC"),
  stringsAsFactors = FALSE
)

library(igraph)

# Vertex table for a bipartite graph: every id gets type TRUE and every
# feature code gets type FALSE. c() on the two columns coerces the ids to
# character; unique() leaves one row per vertex.
vertex_df <- unique(data.frame(
  name = c(dat[, 1], dat[, 2]),
  type = rep(c(TRUE, FALSE), each = nrow(dat))
))

# Each row of `dat` is an edge between an id and one of its feature codes.
# graph_from_data_frame(), bipartite_projection() and as_adjacency_matrix()
# are the current names of the deprecated graph.data.frame(),
# bipartite.projection() and get.adjacency().
g <- graph_from_data_frame(dat, vertices = vertex_df)

# The first projection holds the type-FALSE vertices (the feature codes);
# its "weight" edge attribute is the number of ids two codes share.
as_adjacency_matrix(
  bipartite_projection(g)[[1]],
  attr = "weight",
  sparse = FALSE
)

#      PPLC PCLI PPL
# PPLC    0    3   1
# PCLI    3    0   1
# PPL     1    1   0
+1

Source: https://habr.com/ru/post/1649849/


All Articles