Borrowing (possibly unrealistic) data from a fool:
library(data.table)

size <- 118000000
key1 <- sample(LETTERS, size, replace=TRUE, prob=runif(length(LETTERS), 0.0, 5.0))
key2 <- sample(LETTERS, size, replace=TRUE, prob=runif(length(LETTERS), 0.0, 5.0))
val <- runif(size, 0.0, 5.0)
dt <- data.table(key1, key2, val, stringsAsFactors=FALSE)
Here is a quick way if your data looks like this:
# eddi answer
system.time(
  res1 <- dt[dt[, .I[1], by=.(pmin(key1, key2), pmax(key1, key2))]$V1]
)
#    user  system elapsed
#  101.79    3.01  107.98

# optimized for this data
system.time({
  dt2 <- unique(dt, by=c("key1", "key2"))[key1 > key2, c("key1", "key2") := .(key2, key1)]
  res2 <- unique(dt2, by=c("key1", "key2"))
})
#    user  system elapsed
#    8.50    1.16    4.93

fsetequal(copy(res1)[key1 > key2, c("key1", "key2") := .(key2, key1)], res2)
# [1] TRUE
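To see why the optimized version works, here is a minimal sketch on a toy table (`dt_toy` below is made up for illustration, not part of the benchmark): the first unique() cheaply drops exact ordered duplicates, the := swap canonicalizes each pair so that key1 <= key2, and the second unique() removes the remaining AB/BA mirrors.

library(data.table)

# hypothetical toy input, for illustration only
dt_toy <- data.table(
  key1 = c("A", "B", "A", "C", "B"),
  key2 = c("B", "A", "B", "A", "A"),
  val  = 1:5
)

# step 1: drop exact (ordered) duplicates -- AB and BA still kept separately
dt2 <- unique(dt_toy, by=c("key1", "key2"))

# step 2: canonicalize each pair so key1 <= key2, turning BA into AB
dt2[key1 > key2, c("key1", "key2") := .(key2, key1)]

# step 3: drop the now-identical AB/BA rows
res <- unique(dt2, by=c("key1", "key2"))
res
#    key1 key2 val
# 1:    A    B   1
# 2:    A    C   4

Like the benchmarked code, this keeps the first row seen for each unordered pair.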
Such data seems unlikely if the values are covariances, though, since then each unordered pair should occur at most twice (i.e. AB together with BA), never more.
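In that covariance-style case (every off-diagonal pair present exactly once as AB and once as BA, the diagonal once as AA), a single filter already suffices, since keeping only rows with key1 <= key2 retains exactly one representative per unordered pair. A minimal sketch, assuming such symmetric input (`cov_dt` below is made up):

library(data.table)

# hypothetical symmetric input: each pair appears as AB and BA,
# the diagonal (AA) only once
cov_dt <- data.table(
  key1 = c("A", "B", "A", "C", "A"),
  key2 = c("B", "A", "C", "A", "A"),
  val  = c(0.5, 0.5, 0.2, 0.2, 1.0)
)

# keep the ordered representative of each unordered pair
res <- cov_dt[key1 <= key2]
res
#    key1 key2 val
# 1:    A    B 0.5
# 2:    A    C 0.2
# 3:    A    A 1.0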