Overlap in row values from previous rows

I have a dataframe like this:

# Reproducible example: a 4 x 4 data frame whose cells are letters drawn with
# replacement from A-I. The columns are sampled in the order V1, V2, V3, V4.
set.seed(123)
a <- LETTERS[1:9]
draw_column <- function() sample(a, 4, replace = TRUE)
df <- data.frame(
  V1 = draw_column(),
  V2 = draw_column(),
  V3 = draw_column(),
  V4 = draw_column()
)

which looks like

  V1 V2 V3 V4
1  C  I  E  G
2  H  A  E  F
3  D  E  I  A
4  H  I  E  I

For each row, I would like to count the unique values that do not appear in a previous row, taking the minimum over all previous rows, so the result would look like this:

  V1 V2 V3 V4 V5
1  C  I  E  G 4
2  H  A  E  F 3
3  D  E  I  A 2
4  H  I  E  I 1

V5 is 4 for line 1, as this is the first line, and they are all unique

V5 is 3 for line 2, since H, A and F were not in line 1

V5 is 2 for line 3, since 1) D and I were not on line 2. and 2) D and A were not on line 1.

V5 is 1 for line 4, since 1) H is not in line 1, 2) I was not in line 2, and 3) H was not in line 3.

If line 4 were HIEA, then V5 for line 4 would still be 1, because it has only 1 value not on line 3, even if it has 2 values not on line 2 and 2 values not on line 1.

+4
3

Here is a base R solution.

# One vector of cell values per row of df. Taking the rows from as.matrix()
# (rather than pasting the columns together and splitting on "") keeps cells
# that hold more than one character intact.
cell_values <- as.matrix(df)
myList <- unname(split(cell_values, row(cell_values)))

# For the first row, count its distinct values. For every later row, count the
# values missing from each earlier row in turn and keep the smallest count.
# Walking the rows with seq_along()/vapply() also covers data frames with
# fewer than three rows, where a matrix of pairwise row combinations either
# collapses to a plain vector or cannot be built at all.
df$cnts <- vapply(seq_along(myList), function(i) {
  current <- myList[[i]]
  if (i == 1L) {
    return(length(unique(current)))
  }
  min(vapply(myList[seq_len(i - 1L)],
             function(previous) length(setdiff(current, previous)),
             integer(1)))
}, integer(1))

df
  V1 V2 V3 V4 cnts
1  C  I  E  G    4
2  H  A  E  F    3
3  D  E  I  A    2
4  H  I  E  I    1
+2

Another approach is to first tabulate which values occur in each row of "df":

# Presence table: one row per distinct value, one column per row of df;
# TRUE where the value occurs at least once in that row.
tab <- table(as.matrix(df), row(df)) > 0
#> tab
#   
#        1     2     3     4
#  A FALSE  TRUE  TRUE FALSE
#  C  TRUE FALSE FALSE FALSE
#  D FALSE FALSE  TRUE FALSE
#  E  TRUE  TRUE  TRUE  TRUE
#  F FALSE  TRUE FALSE FALSE
#  G  TRUE FALSE FALSE FALSE
#  H FALSE  TRUE FALSE  TRUE
#  I  TRUE FALSE  TRUE  TRUE

Then crossprod() counts, for every pair of rows, the values that are present in one row but absent from the other:

# crossprod(x, y) is t(x) %*% y, so ct[i, j] is the number of distinct values
# that occur in row i of df but not in row j.
ct <- crossprod(tab, !tab)
#> ct
#   
#    1 2 3 4
#  1 0 3 2 2
#  2 3 0 2 2
#  3 2 2 0 2
#  4 1 1 1 0

So, for example, row 4 has 1 value that is not in row 1, while row 1 has 2 values that are not in row 4, and so on.

Finally, for each row, take the minimum difference over the preceding rows:

# Only earlier rows (column index < row index) are valid comparisons: set the
# diagonal and the upper triangle to Inf so they are never chosen as a minimum.
ct[upper.tri(ct, diag = TRUE)] <- Inf

# Column index of the minimum difference in each row; ties go to the first
# (lowest-numbered) column.
j_min <- max.col(-ct, ties.method = "first")

# Rows after the first. seq_len()[-1] is empty for a one-row df, whereas
# 2:nrow(df) would count down to c(2, 1) and index outside ct.
later_rows <- seq_len(nrow(df))[-1]

# First row: its number of distinct values; later rows: their minimum
# difference against any single earlier row.
c(sum(tab[, 1]),
  ct[cbind(later_rows, j_min[later_rows])])
#[1] 4 3 2 1
+1

Another option, using Reduce and mapply:

# Paste the columns together row by row, then split each string back into
# single characters. The result is stored as a list column, one character
# vector per row (this relies on every cell being a single character).
df$cols_paste <- strsplit(Reduce(paste0, df), split = "")

# V5: for the first row, the number of distinct values; for every later row,
# the smallest number of its values that are missing from any one earlier row.
# vapply() makes V5 a plain integer column instead of the list column that
# lapply() would produce.
df$V5 <- vapply(seq_along(df$cols_paste), function(i) {
  current <- df$cols_paste[[i]]
  if (i == 1L) {
    return(length(unique(current)))
  }
  earlier <- df$cols_paste[seq_len(i - 1L)]
  # list(current) has length one, so mapply() recycles it against every
  # earlier row.
  min(mapply(function(cur, prev) length(setdiff(cur, prev)),
             list(current), earlier))
}, integer(1))

df[,setdiff(names(df), "cols_paste")]
  V1 V2 V3 V4 V5
1  C  I  E  G  4
2  H  A  E  F  3
3  D  E  I  A  2
4  H  I  E  I  1
0
source

Source: https://habr.com/ru/post/1684378/


All Articles