How to compare if any of the elements in a row are the same

Is there a way to check whether any value in a row matches any value in the row above, regardless of order? The following is an arbitrary example input table.

# Example input: three character columns; NA marks a missing value.
DT <- data.table(
  A = c("a", "a", "b", "d", "e", "f", "h", "i", "j"),
  B = c("a", "b", "c", "c", "f", "g", NA, "j", NA),
  C = c("a", "b", "c", "b", "g", "h", NA, NA, NA)
)

> DT
   A  B  C
1: a  a  a
2: a  b  b
3: b  c  c
4: d  c  b
5: e  f  g
6: f  g  h
7: h NA NA
8: i  j NA
9: j NA NA

I would like to add a column D that compares each row with the row above and indicates whether the two rows share at least one value (regardless of order). Thus, the desired result:

 > DT
   A  B  C  D
1: a  a  a  0 #No row above to compare; could be either NA or 0
2: a  b  b  1 #row 2 has "a", which is in row 1; returns 1
3: b  c  c  1 #row 3 has "b", which is in row 2; returns 1
4: d  c  b  1 #row 4 has "b" and "c", which are in row 3; returns 1
5: e  f  g  0 #row 5 has nothing that is in row 4; returns 0
6: f  g  h  1 #row 6 has "f" and "g", which are in row 5; returns 1
7: h NA NA  1 #row 7 has "h", which is in row 6; returns 1
8: i  j NA  0 #row 8 has nothing that is in row 7 (NA doesn't count)
9: j NA NA  1 #row 9 has "j", which is in row 8; returns 1 (NA doesn't count)

The main idea is that I would like to compare one row (vector) with another row (vector) and treat the two rows as matching if they have any element in common, without having to compare every pair of elements explicitly.

+4
source share
7

A data.table approach:

# Tag every row with its position so it can be joined to its predecessor.
# seq_len() is used instead of 1:.N so an empty table does not yield c(1, 0).
DT[, id := seq_len(.N)]
# Long format: one record per (row id, column, value).
dt <- melt(DT, id.vars = "id")
# id2 is the id of the row directly above (kept integer, like id).
dt[, id2 := id - 1L]
# NA must never count as a shared value.
dt <- dt[!is.na(value)]
# Self-join: a record matches when the row above (id2) holds the same value.
# The ids that survive are the rows sharing a value with their predecessor.
idx <- dt[dt, on = .(id2 = id, value), nomatch = 0][, unique(id)]
# Store the 0/1 flag and drop the temporary id column.
DT[, `:=`(D = as.integer(id %in% idx), id = NULL)]

In other words, each row's values are joined against those of the row whose id is smaller by 1.

+2

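Here is another option: take the lead rows, paste each row together, compare the pasted strings with grepl via Map, then unlist the result and convert it to integer.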

# Flag each row that shares at least one value with the row above it.
# The next ("lead") row is turned into a regex alternation and matched against
# the current row's pasted string; the result is then shifted down by one row.
# Limitation: values are matched as regex text, so values containing spaces,
# "|", regex metacharacters, or the literal string "NA" are not supported.
DT[, D := {
  # Current row as a space-separated string, e.g. "a b b".
  v1 <- do.call(paste, .SD)
  # Next row as "x|y|z"; paste() renders missing values as "NA".
  v2 <- do.call(paste, c(shift(.SD, type = "lead"), sep = "|"))
  # Remove whole "NA" alternatives together with their separator so that no
  # dangling "|" (an empty alternative, which matches everything) remains.
  v2N <- gsub("(^|\\|)NA(?=\\||$)", "", v2, perl = TRUE)
  v2N <- sub("^\\|", "", v2N)
  # Anchor on token boundaries so that "a" does not match inside "ab".
  pat <- paste0("(^| )(", v2N, ")( |$)")
  # An empty alternation means the next row is all NA: never a match.
  v3 <- nzchar(v2N) & unlist(Map(grepl, pat, v1), use.names = FALSE)
  # v3[i] compares row i with row i + 1; shift so the flag lands on row i + 1.
  as.integer(head(c(FALSE, v3), -1))
}]

DT
#   A  B  C D
#1: a  a  a 0
#2: a  b  b 1
#3: b  c  c 1
#4: d  c  b 1
#5: e  f  g 0
#6: f  g  h 1
#7: h NA NA 1
#8: i  j NA 0
#9: j NA NA 1

Or using split and Map:

# For each pair of consecutive rows, report 1 when they share a non-NA value.
# The first row has no predecessor, hence the leading FALSE.
as.integer(c(FALSE, unlist(Map(
  function(x, y) {
    x1 <- na.omit(unlist(x))
    y1 <- na.omit(unlist(y))
    # Sharing a value is symmetric, so one %in% test is enough. OR-ing it with
    # the reverse test combined two vectors of different lengths and could
    # raise a recycling warning.
    any(x1 %in% y1)
  },
  # Rows 1..(n-1) paired with rows 2..n; seq_len() avoids counting backwards
  # when the table has a single row.
  split(DT[-nrow(DT)], seq_len(nrow(DT) - 1L)),
  split(DT[-1], seq_len(nrow(DT))[-1L])
), use.names = FALSE)))
+4

Another approach, using nested data.tables and row-wise grouping with by=1:nrow(DT):

# Pair every row with the row above it: c(.SD, shift(.SD)) places the lagged
# columns next to the originals; the code below refers to them as V4, V5, V6.
# For each row (by = 1:nrow(DT)), intersect the current values (A, B, C) with
# the lagged ones and count the non-NA common values; sign() then collapses
# that count to 0 or 1 before it is assigned to D.
# NOTE(review): assumes DT holds exactly the columns A, B, C at this point, so
# that the lagged columns are the 4th-6th ones -- confirm before reuse.
DT[, D:= sign(DT[, c(.SD, shift(.SD))][,
   sum(!is.na(intersect(unlist(.SD[, .(A, B, C)]), unlist(.SD[, .(V4, V5, V6)])))),
   by=1:nrow(DT)]$V1)]

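[, c(.SD, shift(.SD))] produces a table in which every row is cbind-ed with the row above it. For each row, the intersection of the two halves is computed; NA values count as 0 and non-NA values as 1, and these are summed. The result is extracted with $V1 and reduced to 0 or 1 with sign.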

DT
   A  B  C D
1: a  a  a 0
2: a  b  b 1
3: b  c  c 1
4: d  c  b 1
5: e  f  g 0
6: f  g  h 1
7: h NA NA 1
8: i  j NA 0
9: j NA NA 1
+3

A simple (function-based) approach:

#' Do row `i` and row `i + 1` share at least one non-NA value?
#'
#' @param i Row index of the upper row of the pair.
#' @param dt Table to read the rows from. Defaults to the global `DT`, which
#'   keeps existing `compare(i)` calls working.
#' @return `TRUE` when the two rows have a non-NA value in common, else `FALSE`.
compare <- function(i, dt = DT) {
  # Convert column by column: as.character() on the whole one-row table would
  # turn factor columns into their internal integer codes.
  row1 <- vapply(dt[i, ], as.character, character(1), USE.NAMES = FALSE)
  row2 <- vapply(dt[i + 1, ], as.character, character(1), USE.NAMES = FALSE)
  # NA is dropped on both sides so that it never counts as a shared value.
  length(intersect(row1[!is.na(row1)], row2[!is.na(row2)])) > 0
}

# One logical per pair of consecutive rows (length nrow(DT) - 1). seq_len()
# yields an empty sequence for a one-row table, where 1:(n - 1) would count
# backwards; vapply() guarantees a logical vector even when there are no pairs.
result <- vapply(seq_len(nrow(DT) - 1L), compare, logical(1))

If you need numeric rather than logical output, wrap the call to compare in as.numeric().

+2

A base R solution using intersect:

# res[1] is 0 (no row above). For i >= 2, res[i] is 1 when row i and row i - 1
# have at least one non-NA value in common.
# seq_len(n)[-1L] is empty for a one-row table, where 2:n would yield c(2, 1).
res <- c(0, vapply(seq_len(nrow(DT))[-1L], function(i) {
  cur <- na.omit(as.character(DT[i, ]))
  prev <- na.omit(as.character(DT[i - 1, ]))
  length(intersect(cur, prev)) > 0
}, logical(1)))

cbind(DT, D = res)
   # A  B  C D
# 1: a  a  a 0
# 2: a  b  b 1
# 3: b  c  c 1
# 4: d  c  b 1
# 5: e  f  g 0
# 6: f  g  h 1
# 7: h NA NA 1
# 8: i  j NA 0
# 9: j NA NA 1
+2

This solution compares two rows with %in% (after unlist()):

# result[1] is NA (no row above). For i >= 2 it is 1 when any non-NA value of
# row i also occurs in row i - 1, else 0.
# seq_len(n)[-1L] is empty for a one-row table, where 2:n would yield c(2, 1);
# vapply() keeps the result logical even when there is nothing to compare.
DT[, result := as.integer(c(
  NA,
  vapply(
    seq_len(nrow(DT))[-1L],
    function(i) any(na.omit(unlist(DT[i])) %in% unlist(DT[i - 1])),
    logical(1)
  )
))]
#> DT
#   A  B  C result
#1: a  a  a     NA
#2: a  b  b      1
#3: b  c  c      1
#4: d  c  b      1
#5: e  f  g      0
#6: f  g  h      1
#7: h NA NA      1
#8: i  j NA      0
#9: j NA NA      1
+1
source

Using a combination of intersect and mapply, you can do:

# List of the unique non-NA values in each row.
# asplit() always yields one list element per row, whereas apply() collapses
# to a matrix whenever every row has the same number of unique values.
tableList <- lapply(asplit(as.matrix(DT), 1), function(x) {
  x <- as.vector(x)
  unique(x[!is.na(x)])
})

# The same list lagged by one row, to be compared with the list above.
# The first row has no predecessor, so it is paired with NA (never a match,
# because NA was removed from every row's values).
tableListLag <- c(list(NA), tableList[-length(tableList)])

# Find common elements using intersect: a non-empty intersection means the two
# rows share a value, hence 1, otherwise 0.
DT$D <- as.numeric(lengths(Map(intersect, tableList, tableListLag)) > 0)


DT
#   A  B  C D
#1: a  a  a 0
#2: a  b  b 1
#3: b  c  c 1
#4: d  c  b 1
#5: e  f  g 0
#6: f  g  h 1
#7: h NA NA 1
#8: i  j NA 0
#9: j NA NA 1
+1
source

Source: https://habr.com/ru/post/1673604/


All Articles