Find the closest matches for each row and sum based on a condition

Consider the following data.table of events:

library(data.table)
breaks <- data.table(id = 1:8,
                     Channel = c("NP1", "NP1", "NP2", "NP2", "NP3", "NP3", "AT4", "AT4"),
                     Time = c(1000, 1100, 975, 1075, 1010, 1080, 1000, 1050),
                     Day = c(1, 1, 1, 1, 1, 1, 1, 1),
                     ZA = c(15, 12, 4, 2, 1, 2, 23, 18),
                     stringsAsFactors = F)

breaks
   id Channel Time Day ZA
1:  1     NP1 1000   1 15
2:  2     NP1 1100   1 12
3:  3     NP2  975   1  4
4:  4     NP2 1075   1  2
5:  5     NP3 1010   1  1
6:  6     NP3 1080   1  2
7:  7     AT4 1000   1 23
8:  8     AT4 1050   1 18

For each unique event in breaks, I want to find the nearest event (by Time) in every other channel on the same Day, and then sum the ZA values of those events.

This is the result I want to achieve:

   id Channel Time Day ZA Sum
1:  1     NP1 1000   1 15  28
2:  2     NP1 1100   1 12  22
3:  3     NP2  975   1  4  39
4:  4     NP2 1075   1  2  32
5:  5     NP3 1010   1  1  42
6:  6     NP3 1080   1  2  32
7:  7     AT4 1000   1 23  20
8:  8     AT4 1050   1 18  19

So, for the first row (channel NP1, Time = 1000): the closest events in all other channels are rows 3, 5 and 7, giving 4 + 1 + 23 = 28.
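
As a quick sanity check, the nearest event per other channel for id 1 can be picked out directly (a throwaway data.table snippet, not the general solution):

breaks[Channel != "NP1", .SD[which.min(abs(Time - 1000))], by = Channel][, sum(ZA)]
# [1] 28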

I got this to work using data.table with the following code:

breaks[breaks[, c("Day", "Time", "Channel", "ZA")], on = "Day", allow.cartesian = TRUE][   # join every event to every event of the same Day
  Channel != i.Channel][                                                                   # drop pairs from the same channel
    order(id)][
      , delta := abs(Time - i.Time)][                                                      # time difference of each pair
        , .SD[delta == min(delta)], by = .(Channel, Time, Day, i.Channel)][                # keep the nearest event per other channel
          , unique(.SD, by = c("id", "i.Channel"))][                                       # one match per (event, other channel)
            , .(Sum = sum(i.ZA)), by = .(id, Channel, Time, Day, ZA)]                      # sum ZA of the matched events

However, this creates a data set with 64 rows in the first step, and I would like to do this with a data set of more than a million rows.
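
A quick check of that first step on the example data confirms the size (every row matches all 8 rows of the same Day):

nrow(breaks[breaks[, c("Day", "Time", "Channel", "ZA")], on = "Day", allow.cartesian = TRUE])
# [1] 64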

Can someone help me find a more efficient way to do this?

Edit:

I ran the answers below, the sqldf one, Eddi's (data.table) and MarkusN's (dplyr), on 1.4 million rows with 39 unique channels. The timings were:

sqldf:      54 minutes
data.table: 11 hours
dplyr:      29 hours

In the inner select, self-join breaks to itself on Day, keep only pairs from different channels, and group by each event (a.id) and each other channel (b.Channel), taking the row with the minimum absolute time difference in each group. In the outer select, sum the ZA values of those rows for each id.

This relies on a feature of the SQLite backend used by sqldf: when min appears in a select together with group by, the other selected columns are taken from the row that achieves the minimum.

If memory is a concern, pass dbname = tempfile() to sqldf so that it works against a disk-resident database instead of keeping everything in memory. See the sqldf page on github for details.

library(sqldf)

sqldf("select id, Channel, Time, Day, ZA, sum(bZA) Sum
 from (
   select a.*, b.ZA bZA, min(abs(a.Time - b.Time))
   from breaks a join breaks b on a.Day = b.Day and a.Channel != b.Channel
   group by a.id, b.Channel)
 group by id")

giving:

  id Channel Time Day ZA Sum
1  1     NP1 1000   1 15  28
2  2     NP1 1100   1 12  22
3  3     NP2  975   1  4  39
4  4     NP2 1075   1  2  32
5  5     NP3 1010   1  1  42
6  6     NP3 1080   1  2  32
7  7     AT4 1000   1 23  20
8  8     AT4 1050   1 18  19
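
As a sketch of the disk-resident variant mentioned above, the only change is the dbname argument:

# same query as above, but run against an SQLite database on disk
# to reduce memory use on large data
sqldf("select id, Channel, Time, Day, ZA, sum(bZA) Sum
 from (
   select a.*, b.ZA bZA, min(abs(a.Time - b.Time))
   from breaks a join breaks b on a.Day = b.Day and a.Channel != b.Channel
   group by a.id, b.Channel)
 group by id", dbname = tempfile())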

For comparison, here is a benchmark of this sqldf solution against the data.table solution from the question and the dplyr solution from the other answer.

, - ( ) ( ).

library(data.table)
library(dplyr)
library(sqldf)
library(rbenchmark)

benchmark(sqldf = 
sqldf("select id, Channel, Time, Day, ZA, sum(bZA) Sum
 from (
   select a.*, b.ZA bZA, min(abs(a.Time - b.Time))
   from breaks a join breaks b on a.Day = b.Day and a.Channel != b.Channel
   group by a.id, b.Channel)
 group by id"),

data.table = breaks[breaks[, c("Day", "Time", "Channel", "ZA")], on = "Day",
     allow.cartesian = TRUE][
  Channel != i.Channel][
    order(id)][
      , delta := abs(Time - i.Time)][
        , .SD[delta == min(delta)], by = .(Channel, Time, Day, i.Channel)][
          , unique(.SD, by = c("id", "i.Channel"))][
            , .(Sum = sum(i.ZA)), by = .(id, Channel, Time, Day, ZA)],

dplyr = { breaks %>% 
  inner_join(breaks, by=c("Day"), suffix=c("",".y")) %>%
  filter(Channel != Channel.y) %>%
  group_by(id, Channel, Time, Day, ZA, Channel.y) %>%
  arrange(abs(Time - Time.y)) %>%
  filter(row_number()==1) %>%
  group_by(id, Channel, Time, Day, ZA) %>%
  summarise(Sum=sum(ZA.y)) %>%                           
  ungroup() %>% 
  select(id:Sum) },

order = "elapsed")[1:4]

giving:

        test replications elapsed relative
1      sqldf          100    3.38    1.000
2 data.table          100    4.05    1.198
3      dplyr          100    9.23    2.731

Not sure how this compares speed-wise (it processes the rows one at a time), but it should be much more memory-efficient, since it avoids the large intermediate join:

# all channels present in the data
Channels = breaks[, unique(Channel)]

# for each row, roll-join to the nearest Time in every other channel on the
# same Day and sum the ZA values of the matched rows
breaks[, Sum := breaks[breaks[row,
                              .(Day, Channel = setdiff(Channels, Channel), Time)],
                       on = .(Day, Channel, Time), roll = 'nearest',
                       sum(ZA)]
       , by = .(row = 1:nrow(breaks))]

It can be made a bit faster by calling setkey(breaks, Day, Channel, Time) beforehand and dropping the on argument.
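
A sketch of that keyed variant (same logic as above, relying on the key instead of on; untested beyond the example data):

setkey(breaks, Day, Channel, Time)   # key order must match the join columns
Channels = breaks[, unique(Channel)]

breaks[, Sum := breaks[breaks[row,
                              .(Day, Channel = setdiff(Channels, Channel), Time)],
                       roll = 'nearest',
                       sum(ZA)]
       , by = .(row = 1:nrow(breaks))]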


Here is a solution using dplyr:

library(dplyr)
breaks %>% 
  inner_join(breaks, by=c("Day"), suffix=c("",".y")) %>%  # self-join
  filter(Channel != Channel.y) %>%                        # ignore events of same channel
  group_by(id, Channel, Time, Day, ZA, Channel.y) %>%     # build group for every event
  arrange(abs(Time - Time.y)) %>%                         # sort by minimal time-diff
  filter(row_number()==1) %>%                             # keep just row with minimal time-diff
  group_by(id, Channel, Time, Day, ZA) %>%                # group by all columns of original event
  summarise(Sum=sum(ZA.y)) %>%                            # sum ZA of other channels
  ungroup() %>% 
  select(id:Sum)

Maybe I should be more specific about my answer. Unlike data.table, dplyr can translate code into SQL. So if your data is stored in a database, you can connect directly to the table that contains it, and all (or most) of the dplyr code is evaluated in your DBMS. Since joining is a core task of every DBMS, you don't need to worry about performance there.
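
For illustration, a minimal sketch of that database path, assuming the dbplyr and RSQLite backends are installed (the in-memory database and table name are just for the example); show_query() prints the SQL that would run inside the DBMS:

library(DBI)
library(dplyr)

# copy the example data into an SQLite database and create a lazy reference to it
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
DBI::dbWriteTable(con, "breaks", as.data.frame(breaks))
breaks_db <- tbl(con, "breaks")

breaks_db %>%
  inner_join(breaks_db, by = "Day") %>%   # duplicated columns get .x / .y suffixes
  filter(Channel.x != Channel.y) %>%
  show_query()                            # the join and filter are translated to SQL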

However, if your data is imported into R and you are worried about RAM limitations, you need to iterate over each row of data. This can be done with dplyr:

library(dplyr)
breaks %>% 
rowwise() %>% 
do({
  row = as_data_frame(.)
  df =
    breaks %>%
    filter(Day == row$Day & Channel != row$Channel) %>% 
    mutate(time_diff = abs(Time-row$Time)) %>% 
    group_by(Channel) %>% 
    arrange(abs(Time-row$Time), .by_group=TRUE) %>% 
    filter(row_number()==1) %>% 
    ungroup() %>% summarise(sum(ZA))

  row %>% mutate(sumZA = df[[1]])
})

Source: https://habr.com/ru/post/1686019/

