This is an ideal problem for Rcpp . Note:
#include <Rcpp.h> using namespace Rcpp; // [[Rcpp::export]] IntegerMatrix findConnections(IntegerMatrix m) { int i = 0, j = 0, k = 1, n = m.nrow(); // initialize matrix with same dimensions as m IntegerMatrix myConnections(n, 2); while (i < n) { // Populate with "connected" row myConnections(j,_) = m(i,_); // Search for next connection while (k < n && m(i, 1) != m(k, 0)) {k++;} i = k; j++; } // Subset matrix and output result IntegerMatrix subMatrix(j, 2); for (i = 0; i < j; i++) {subMatrix(i,_) = myConnections(i,_);} return subMatrix; } findConnections(as.matrix(example3)) [,1] [,2] [1,] 1 3 [2,] 3 5 [3,] 5 6 [4,] 6 8
Below are the example3 tests provided by OP:
microbenchmark(get_path(example3), foo(example3), f(example3), findConnections(as.matrix(example3))) Unit: microseconds expr min lq mean median uq max neval cld get_path(example3) 3345.999 3519.0255 6361.76978 3714.014 3892.9930 202511.942 100 b foo(example3) 215.514 239.3230 360.81086 257.180 278.3200 10256.384 100 af(example3) 936.355 1034.4645 1175.60323 1073.668 1142.4270 9676.755 100 a findConnections(as.matrix(example3)) 52.135 60.3445 71.62075 67.528 80.4585 103.858 100 a
Here are a few tests with a larger example (not including get_graph , since it took a lot of time):
set.seed(6221) x <- as.matrix(cbind(x = rnorm(1000, 10, 3), y = rnorm(1000, 10, 3))) value = 5 d <- as.matrix(dist(x[,c("x","y")])) d[lower.tri(d)] <- 0 mtxLarge <- which(d > value, arr.ind = T) mtxLargeFoo <- data.frame(mtxLarge, row.names = NULL)
Equality Check:
a <- findConnections(as.matrix(mtxLarge)) b <- foo(mtxLargeFoo) c <- f(mtxLarge) sapply(1:2, function(x) identical(a[,x], b[,x], c[, x])) [1] TRUE TRUE
UPDATE
If Rcpp not your taste, here is a basic R-translation of the above code, which is still faster than other solutions:
findConnectionsBase <- function(m) { n <- nrow(m) myConnections <- matrix(integer(0), nrow = n, ncol = 2) i <- j <- 1L k <- 2L while (i <= n) { myConnections[j, ] <- m[i, ] while (k <= n && m[i, 2] != m[k, 1]) {k <- k + 1L} i <- k j <- j + 1L } myConnections[!is.na(myConnections[,1]), ] } microbenchmark(get_path(example3), foo(example3), f(example3), BaseR = findConnectionsBase(as.matrix(example3)), Rcpp = findConnections(as.matrix(example3))) Unit: microseconds expr min lq mean median uq max neval cld get_path(example3) 3128.844 3204.3765 6057.18995 3406.137 3849.274 188685.016 100 b foo(example3) 239.734 251.4325 399.71418 267.648 301.309 12455.441 100 af(example3) 899.409 961.3950 1145.72695 1014.555 1127.237 9583.982 100 a BaseR 79.638 89.2850 103.63571 97.905 111.657 212.230 100 a Rcpp 48.850 55.8290 64.24807 61.781 69.170 123.151 100 a
And for a larger example:
microbenchmark(foo(mtxLargeFoo), f(mtxLarge), BaseR = findConnectionsBase(as.matrix(mtxLarge)), Rcpp = findConnections(as.matrix(mtxLarge)), times = 10, unit = "relative") Unit: relative expr min lq mean median uq max neval cld foo(mtxLargeFoo) 2651.9626 2555.0515 1606.2785 1703.0256 1711.4850 671.9115 10 cf(mtxLarge) 6812.7195 6433.2009 3976.6135 4218.1703 4105.1138 1642.2768 10 d BaseR 787.9947 733.4528 440.2043 478.9412 435.4744 167.7491 10 b Rcpp 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 10 a