Symmetric pairs in R

I have a big data frame that looks like this:

> my_table
   track_fid start_gid end_gid
1          1       100      82
2          2        82     100
3          3       100      82
4          4       100      32
5          5        82     100
6          6        82     100
7          7        82     100
8          8       100      82
9          9        34     100
10        10        31     100

My goal is to add a column to_fromto the end and fill it with symbols yor n.

Take the first row as an example - the value is start_gid= 100 and the value is end_gid= 82. If another other row exists anywhere in the table where the values ​​are inverse, i.e. where end_gid= 100 and value at start_gid= 82, I would like to fill in the column of to_fromboth rows y. If the inversion does not exist, the first row should be filled in. The key here is to iterate over each row and look for it in the table in order track_fid. If the opposite is found, where is track_fidgreater, enter y. When the converse receives a value y, it cannot be used again.

For example, this will be an example output:

> output
   track_fid start_gid end_gid to_from
1          1       100      82       y
2          2        82     100       y
3          3       100      82       y
4          4       100      32       n
5          5        82     100       y
6          6        82     100       y
7          7        82     100       n
8          8       100      82       y
9          9        34     100       n
10        10        31     100       n

Is there any way to create such output in R?

Something along the lines of:

for(i in 2:nrow(my_table)) {
if(my_table[i-1,"start_gid"]= my_table[i,"end_gid"]) {
my_table$to_from = "y" } else { my_table$to_from = "n"}


> str(output)
'data.frame':   10 obs. of  4 variables:
 $ track_fid: int  1 2 3 4 5 6 7 8 9 10
 $ start_gid: int  100 82 100 100 82 82 82 100 34 31
 $ end_gid  : int  82 100 82 32 100 100 100 82 100 100
 $ to_from  : Factor w/ 2 levels "n","y": 2 2 2 1 2 2 1 2 1 1
+4
source share
2

R. for next break. Rcpp, .

library(Rcpp)
sourceCpp(code = "
          #include <Rcpp.h>
          // [[Rcpp::export]]
          Rcpp::LogicalVector myfun(const Rcpp::IntegerVector x, const Rcpp::IntegerVector y) {
            Rcpp::LogicalVector res(x.length());
            for (int i=0; i<(x.length()-1); i++) {
              if(res(i)) continue;
              for (int j=i+1; j<x.length(); j++) {
                if (res(j)) continue;
                if (x(i) == y(j) && x(j) == y(i)) {
                   res(i) = true;
                   res(j) = true;
                   break;
                }
              }
            }
            return res;
          }
          ")

DF$from_to <- myfun(DF$start_gid, DF$end_gid)
#   track_fid start_gid end_gid from_to
#1          1       100      82    TRUE
#2          2        82     100    TRUE
#3          3       100      82    TRUE
#4          4       100      32   FALSE
#5          5        82     100    TRUE
#6          6        82     100    TRUE
#7          7        82     100   FALSE
#8          8       100      82    TRUE
#9          9        34     100   FALSE
#10        10        31     100   FALSE
+4

, data.table .

start_gid end_gid:

pairs <- dt[, .N, by = .(start_gid, end_gid)]
pairs

#   start_gid end_gid N
#1:       100      82 3
#2:        82     100 4
#3:       100      32 1
#4:        34     100 1
#5:        31     100 1

, 3 (100, 82) (82, 100) , 4- (82, 100) . , (100, 32), (34, 100) (31, 100) .

nmatch . (100, 82) (82, 100) 82_100. , , nmatch 0.

pairs <- pairs[, .(start_gid, end_gid, nmatch = if (.N <= 1L) 0L else min(N)), 
      by = .(grp = paste(pmin(start_gid, end_gid), pmax(start_gid, end_gid), sep = "_"))]
pairs

#      grp start_gid end_gid nmatch
#1: 82_100       100      82      3
#2: 82_100        82     100      3
#3: 32_100       100      32      0
#4: 34_100        34     100      0
#5: 31_100        31     100      0

. , dt :

out <- pairs[dt, on = .(start_gid, end_gid)]
out
#       grp start_gid end_gid nmatch track_fid
# 1: 82_100       100      82      3         1
# 2: 82_100        82     100      3         2
# 3: 82_100       100      82      3         3
# 4: 32_100       100      32      0         4
# 5: 82_100        82     100      3         5
# 6: 82_100        82     100      3         6
# 7: 82_100        82     100      3         7
# 8: 82_100       100      82      3         8
# 9: 34_100        34     100      0         9
#10: 31_100        31     100      0        10

nmatch , track_fid

out <- out[, .(track_fid, to_from = seq_len(.N) <= nmatch), by = .(start_gid, end_gid)]
out[order(track_fid)]
     start_gid end_gid track_fid to_from
# 1:       100      82         1    TRUE
# 2:        82     100         2    TRUE
# 3:       100      82         3    TRUE
# 4:       100      32         4   FALSE
# 5:        82     100         5    TRUE
# 6:        82     100         6    TRUE
# 7:        82     100         7   FALSE
# 8:       100      82         8    TRUE
# 9:        34     100         9   FALSE
#10:        31     100        10   FALSE

1: (10 )

data.table Rolands Rcpp 10 :

library(microbenchmark)
microbenchmark(
  dt = {
    dt[, .N, by = .(start_gid, end_gid)][
      , .(start_gid, end_gid, nmatch = if (.N <= 1L) 0L else min(N)), 
      by = .(grp = paste(pmin(start_gid, end_gid), pmax(start_gid, end_gid), sep = "_"))][
        dt, on = .(start_gid, end_gid)][
          , .(track_fid, to_from = seq_len(.N) <= nmatch), 
          by = .(start_gid, end_gid)][
            order(track_fid)]
  },
  rcpp_source = {
    sourceCpp(code = "
          #include <Rcpp.h>
          // [[Rcpp::export]]
          Rcpp::LogicalVector myfun(const Rcpp::IntegerVector x, const Rcpp::IntegerVector y) {
            Rcpp::LogicalVector res(x.length());
            for (int i=0; i<(x.length()-1); i++) {
              if(res(i)) continue;
              for (int j=i+1; j<x.length(); j++) {
                if (res(j)) continue;
                if (x(i) == y(j) && x(j) == y(i)) {
                   res(i) = true;
                   res(j) = true;
                   break;
                }
              }
            }
            return res;
          }
          ")
    dt$from_to <- myfun(dt$start_gid, dt$end_gid)
    dt
  },
  rcpp_func = {
    dt$from_to <- myfun(dt$start_gid, dt$end_gid)
    dt
  }
)

Unit: microseconds
        expr      min       lq      mean    median       uq       max neval
          dt 2873.017 3233.418 3466.5484 3408.0495 3558.705  6345.633   100
 rcpp_source 8112.335 8537.114 8932.8953 8811.2385 9173.150 12093.931   100
   rcpp_func  101.192  121.582  142.0769  137.4405  154.620   255.246   100

, Rcpp 20 , data.table ( ). , sourceCPP, data.table.

, data.table data.table .

2:

@Roland, data.table Rcpp : enter image description here

1000 Rcpp , data.table. data.table , Rcpp. , Rcpp .

+4

Source: https://habr.com/ru/post/1665291/


All Articles