Save what (..., arr.ind = TRUE) the results that connect

I am trying to take the results of the function which(..., arr.ind = TRUE) and delete rows that are not the first ones that "connect" to each other.

<strong> Examples:

 #example 1 example 2 example 3 row col row col row col 1 4 2 3 1 3 2 4 2 4 2 5 4 5 3 5 3 5 3 6 2 7 4 6 4 6 3 7 5 6 3 7 4 7 6 8 4 7 5 7 9 10 # should become (trimmed.mtx) row col row col row col 1 4 2 3 1 3 4 5 3 5 3 5 5 7 5 6 6 8 

These examples can be read using:

 example1 <- structure(list(row = c(1L, 2L, 4L, 3L, 4L, 3L, 4L), col = c(4L, 4L, 5L, 6L, 6L, 7L, 7L)), .Names = c("row", "col"), class = "data.frame", row.names = c(NA, -7L)) example2 <- structure(list(row = c(2L, 2L, 3L, 2L, 3L, 4L, 5L), col = c(3L, 4L, 5L, 7L, 7L, 7L, 7L)), .Names = c("row", "col"), class = "data.frame", row.names = c(NA, -7L)) example3 <- structure(list(row = c(1L, 2L, 3L, 4L, 5L, 6L, 9L), col = c(3L, 5L, 5L, 6L, 6L, 8L, 10L)), .Names = c("row", "col"), class = "data.frame", row.names = c(NA, -7L)) 

The goal is to take a dist-table of Euclidean distances and turn it into a sequence of point-to-point distances passing distances below a certain threshold. Although there may be other ways to solve this problem, I’m very interested to figure out how to do this by filtering out rows from this matrix.

A reproducible example of my intended use:

 set.seed(81417) # Aug 14th, 2017 # Generate fake location data (temporally sequential) x <- as.matrix(cbind(x = rnorm(10, 10, 3), y = rnorm(10, 10, 3))) # Find euclidean point-to-point distances and remove distances that are less than: value = 5 # I attempted to do so by calculating an entire Euclidean distance matrix (dist()) # and then finding a path from point-to-nearest-point # using distances that are greater than the value d <- as.matrix(dist(x[,c("x","y")])) d[lower.tri(d)] <- 0 mtx <- which(d > value, arr.ind = T) mtx # Change from EVERY point-to-point distance (mtx) > value # to only the "connecting" points that exceed the skipping value trimmed.mtx <- {?} # final result cbind(x[unique(c(trimmed.mtx)),],d[trimmed.mtx]) 

Please let me know if I can provide more information.

+5
source share
5 answers

This is an ideal problem for Rcpp . Note:

 #include <Rcpp.h> using namespace Rcpp; // [[Rcpp::export]] IntegerMatrix findConnections(IntegerMatrix m) { int i = 0, j = 0, k = 1, n = m.nrow(); // initialize matrix with same dimensions as m IntegerMatrix myConnections(n, 2); while (i < n) { // Populate with "connected" row myConnections(j,_) = m(i,_); // Search for next connection while (k < n && m(i, 1) != m(k, 0)) {k++;} i = k; j++; } // Subset matrix and output result IntegerMatrix subMatrix(j, 2); for (i = 0; i < j; i++) {subMatrix(i,_) = myConnections(i,_);} return subMatrix; } findConnections(as.matrix(example3)) [,1] [,2] [1,] 1 3 [2,] 3 5 [3,] 5 6 [4,] 6 8 

Below are the example3 tests provided by OP:

 microbenchmark(get_path(example3), foo(example3), f(example3), findConnections(as.matrix(example3))) Unit: microseconds expr min lq mean median uq max neval cld get_path(example3) 3345.999 3519.0255 6361.76978 3714.014 3892.9930 202511.942 100 b foo(example3) 215.514 239.3230 360.81086 257.180 278.3200 10256.384 100 af(example3) 936.355 1034.4645 1175.60323 1073.668 1142.4270 9676.755 100 a findConnections(as.matrix(example3)) 52.135 60.3445 71.62075 67.528 80.4585 103.858 100 a 

Here are a few tests with a larger example (not including get_graph , since it took a lot of time):

 set.seed(6221) x <- as.matrix(cbind(x = rnorm(1000, 10, 3), y = rnorm(1000, 10, 3))) value = 5 d <- as.matrix(dist(x[,c("x","y")])) d[lower.tri(d)] <- 0 mtxLarge <- which(d > value, arr.ind = T) mtxLargeFoo <- data.frame(mtxLarge, row.names = NULL) ## this is for the function foo ## as we don't want to include ## the time it takes to create ## a data.frame every time. microbenchmark(foo(mtxLargeFoo), f(mtxLarge), findConnections(as.matrix(mtxLarge)), times = 10, unit = "relative") Unit: relative expr min lq mean median uq max neval cld foo(mtxLargeFoo) 3168.479 3376.909 2660.377 3424.276 2319.434 1960.161 10 bf(mtxLarge) 8307.009 8436.569 6420.919 8319.151 5184.557 4610.922 10 c findConnections(as.matrix(mtxLarge)) 1.000 1.000 1.000 1.000 1.000 1.000 10 a 

Equality Check:

 a <- findConnections(as.matrix(mtxLarge)) b <- foo(mtxLargeFoo) c <- f(mtxLarge) sapply(1:2, function(x) identical(a[,x], b[,x], c[, x])) [1] TRUE TRUE 


UPDATE
If Rcpp not your taste, here is a basic R-translation of the above code, which is still faster than other solutions:

 findConnectionsBase <- function(m) { n <- nrow(m) myConnections <- matrix(integer(0), nrow = n, ncol = 2) i <- j <- 1L k <- 2L while (i <= n) { myConnections[j, ] <- m[i, ] while (k <= n && m[i, 2] != m[k, 1]) {k <- k + 1L} i <- k j <- j + 1L } myConnections[!is.na(myConnections[,1]), ] } microbenchmark(get_path(example3), foo(example3), f(example3), BaseR = findConnectionsBase(as.matrix(example3)), Rcpp = findConnections(as.matrix(example3))) Unit: microseconds expr min lq mean median uq max neval cld get_path(example3) 3128.844 3204.3765 6057.18995 3406.137 3849.274 188685.016 100 b foo(example3) 239.734 251.4325 399.71418 267.648 301.309 12455.441 100 af(example3) 899.409 961.3950 1145.72695 1014.555 1127.237 9583.982 100 a BaseR 79.638 89.2850 103.63571 97.905 111.657 212.230 100 a Rcpp 48.850 55.8290 64.24807 61.781 69.170 123.151 100 a 

And for a larger example:

 microbenchmark(foo(mtxLargeFoo), f(mtxLarge), BaseR = findConnectionsBase(as.matrix(mtxLarge)), Rcpp = findConnections(as.matrix(mtxLarge)), times = 10, unit = "relative") Unit: relative expr min lq mean median uq max neval cld foo(mtxLargeFoo) 2651.9626 2555.0515 1606.2785 1703.0256 1711.4850 671.9115 10 cf(mtxLarge) 6812.7195 6433.2009 3976.6135 4218.1703 4105.1138 1642.2768 10 d BaseR 787.9947 733.4528 440.2043 478.9412 435.4744 167.7491 10 b Rcpp 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 10 a 
+4
source

Function

 foo = function(df){ #Initiate with a value of 1 (first row) inds = 1 while(TRUE){ # Look for the first index where the 'row' is equal to the value # in 'col' at the index specified by the last value of 'inds' temp = tail(inds, 1) ind = temp + which(df[["row"]][(temp+1):NROW(df)] == df[["col"]][temp])[1] #Append 'ind' to 'inds' inds = c(inds, ind) #Iterate until the end of the rows or when NA is encountered if (ind == NROW(df) | is.na(ind)){ #Return the subset of the df with appropirate rows return(df[inds[!is.na(inds)],]) } } } 

USING

 foo(example1) # row col #1 1 4 #3 4 5 foo(example2) # row col #1 2 3 #3 3 5 #7 5 7 foo(example3) # row col #1 1 3 #3 3 5 #5 5 6 #6 6 8 foo(data.frame(mtx, row.names = NULL)) # row col #1 1 3 #5 3 4 #11 4 7 
+3
source

Here is an idea using the igraph package with zoo ,

 get_path <- function(df){ g1 <- graph_from_data_frame(df) l1 <- all_simple_paths(g1, 1) ind1 <- as.numeric(names(l1[[which.max(lengths(l1))]])) final_df <- setNames(as.data.frame(rollapply(ind1, 2, c)), c('row', 'col')) return(final_df) } 

which gives the following:

 library(igraph) library(zoo) 
  get_path(example1) row col 1 1 4 2 4 5 get_path(example2) row col 1 2 3 2 3 5 3 5 7 get_path(example3) row col 1 1 3 2 3 5 3 5 6 4 6 8 
+3
source

This function is applicable for matrices and data.frames with two columns.

 f <- function(x){ res <- x[1, ] # first row as defined tmpCol <- x[1,2] # the target column for the "connection" while (TRUE){ # loop until breaked connectingRow <- x[which(x[, 1] == tmpCol)[1], ] # get first matching row if (any(is.na(connectingRow))) return(res) # if this row is not NA (which it would be if no connecting line is found) continue, # else return the results # append connecting matches and set new tmpCol for reiteration. res <- rbind(res, connectingRow) tmpCol <- res[nrow(res), 2] } } f(example1) # row col # 1 1 4 # 3 4 5 f(example2) # row col # 1 2 3 # 3 3 5 # 7 5 7 

Benchmarking

Comparison between @db foo() and f() suggested above

 microbenchmark(f(mtx), foo(mtx)) # Unit: microseconds # expr min lq mean median uq max neval cld # f(mtx) 18.204 19.058 22.61003 20.053 20.7640 64.851 100 a # foo(mtx) 14.506 15.075 73.97871 15.360 15.9285 5740.151 100 a 
+2
source

For all of you fans of functional programming, here is a recursive solution. R is not optimized for this, but it most accurately represents the abstract process that the OP describes.

 connected_rows <- function(df, next.row.val = NULL){ if(is.null(next.row.val)){ return( rbind( head(df,1), Recall( df = tail(df,-1), next.row.val = head(df$col,1) ) ) ) } else { next.row <- match(next.row.val,df$row) if(is.na(next.row)){ return(NULL) } else { return( rbind( df[next.row,], Recall( df = tail(df,-next.row), next.row.val = df$col[next.row] ) ) ) } } } connected_rows(example1) # row col # 1 1 4 # 3 4 5 connected_rows(example2) # row col # 1 2 3 # 3 3 5 # 7 5 7 connected_rows(example3) # row col # 1 1 3 # 3 3 5 # 5 5 6 # 6 6 8 
+2
source

Source: https://habr.com/ru/post/1270727/


All Articles