Find the closest matches for each row and sum based on a condition

Consider the following data.table of events:

library(data.table)
breaks <- data.table(id = 1:8,
                     Channel = c("NP1", "NP1", "NP2", "NP2", "NP3", "NP3", "AT4", "AT4"),
                     Time = c(1000, 1100, 975, 1075, 1010, 1080, 1000, 1050),
                     Day = c(1, 1, 1, 1, 1, 1, 1, 1),
                     ZA = c(15, 12, 4, 2, 1, 2, 23, 18),
                     stringsAsFactors = F)

breaks
   id Channel Time Day ZA
1:  1     NP1 1000   1 15
2:  2     NP1 1100   1 12
3:  3     NP2  975   1  4
4:  4     NP2 1075   1  2
5:  5     NP3 1010   1  1
6:  6     NP3 1080   1  2
7:  7     AT4 1000   1 23
8:  8     AT4 1050   1 18

For each unique event in breaks, I want to find the nearest event (by Time) in every other channel on the same Day, and then sum the ZA values of those events.

This is the result I want to achieve:

   id Channel Time Day ZA Sum
1:  1     NP1 1000   1 15  28
2:  2     NP1 1100   1 12  22
3:  3     NP2  975   1  4  39
4:  4     NP2 1075   1  2  32
5:  5     NP3 1010   1  1  42
6:  6     NP3 1080   1  2  32
7:  7     AT4 1000   1 23  20
8:  8     AT4 1050   1 18  19

So, for the first row (channel NP1, Time = 1000): the closest events in all other channels are rows 3, 5 and 7, giving 4 + 1 + 23 = 28.
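
As a quick sanity check, the nearest event per other channel for id 1 can be picked out directly (a throwaway data.table snippet, not the general solution):

breaks[Channel != "NP1", .SD[which.min(abs(Time - 1000))], by = Channel][, sum(ZA)]
# [1] 28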

I got this to work using data.table with the following code:

breaks[breaks[, c("Day", "Time", "Channel", "ZA")], on = "Day", allow.cartesian = TRUE][   # join every event to every event of the same Day
  Channel != i.Channel][                                                                   # drop pairs from the same channel
    order(id)][
      , delta := abs(Time - i.Time)][                                                      # time difference of each pair
        , .SD[delta == min(delta)], by = .(Channel, Time, Day, i.Channel)][                # keep the nearest event per other channel
          , unique(.SD, by = c("id", "i.Channel"))][                                       # one match per (event, other channel)
            , .(Sum = sum(i.ZA)), by = .(id, Channel, Time, Day, ZA)]                      # sum ZA of the matched events

However, this creates a data set with 64 rows in the first step, and I would like to do this with a data set of more than a million rows.
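
A quick check of that first step on the example data confirms the size (every row matches all 8 rows of the same Day):

nrow(breaks[breaks[, c("Day", "Time", "Channel", "ZA")], on = "Day", allow.cartesian = TRUE])
# [1] 64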

Can someone help me find a more efficient way to do this?

Edit:

I ran the answers below, the sqldf one, Eddi's (data.table) and MarkusN's (dplyr), on 1.4 million rows with 39 unique channels. The timings were:

sqldf:      54 minutes
data.table: 11 hours
dplyr:      29 hours

In the inner select, self-join breaks to itself on Day, keep only pairs from different channels, and group by each event (a.id) and each other channel (b.Channel), taking the row with the minimum absolute time difference in each group. In the outer select, sum the ZA values of those rows for each id.

This relies on a feature of the SQLite backend used by sqldf: when min appears in a select together with group by, the other selected columns are taken from the row that achieves the minimum.

If memory is a concern, pass dbname = tempfile() to sqldf so that it works against a disk-resident database instead of keeping everything in memory. See the sqldf page on github for details.

library(sqldf)

sqldf("select id, Channel, Time, Day, ZA, sum(bZA) Sum
 from (
   select a.*, b.ZA bZA, min(abs(a.Time - b.Time))
   from breaks a join breaks b on a.Day = b.Day and a.Channel != b.Channel
   group by a.id, b.Channel)
 group by id")

giving:

  id Channel Time Day ZA Sum
1  1     NP1 1000   1 15  28
2  2     NP1 1100   1 12  22
3  3     NP2  975   1  4  39
4  4     NP2 1075   1  2  32
5  5     NP3 1010   1  1  42
6  6     NP3 1080   1  2  32
7  7     AT4 1000   1 23  20
8  8     AT4 1050   1 18  19
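
As a sketch of the disk-resident variant mentioned above, the only change is the dbname argument:

# same query as above, but run against an SQLite database on disk
# to reduce memory use on large data
sqldf("select id, Channel, Time, Day, ZA, sum(bZA) Sum
 from (
   select a.*, b.ZA bZA, min(abs(a.Time - b.Time))
   from breaks a join breaks b on a.Day = b.Day and a.Channel != b.Channel
   group by a.id, b.Channel)
 group by id", dbname = tempfile())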

For comparison, here is a benchmark of this sqldf solution against the data.table solution from the question and the dplyr solution from the other answer.

, - ( ) ( ).

library(data.table)
library(dplyr)
library(sqldf)
library(rbenchmark)

benchmark(sqldf = 
sqldf("select id, Channel, Time, Day, ZA, sum(bZA) Sum
 from (
   select a.*, b.ZA bZA, min(abs(a.Time - b.Time))
   from breaks a join breaks b on a.Day = b.Day and a.Channel != b.Channel
   group by a.id, b.Channel)
 group by id"),

data.table = breaks[breaks[, c("Day", "Time", "Channel", "ZA")], on = "Day",
     allow.cartesian = TRUE][
  Channel != i.Channel][
    order(id)][
      , delta := abs(Time - i.Time)][
        , .SD[delta == min(delta)], by = .(Channel, Time, Day, i.Channel)][
          , unique(.SD, by = c("id", "i.Channel"))][
            , .(Sum = sum(i.ZA)), by = .(id, Channel, Time, Day, ZA)],

dplyr = { breaks %>% 
  inner_join(breaks, by=c("Day"), suffix=c("",".y")) %>%
  filter(Channel != Channel.y) %>%
  group_by(id, Channel, Time, Day, ZA, Channel.y) %>%
  arrange(abs(Time - Time.y)) %>%
  filter(row_number()==1) %>%
  group_by(id, Channel, Time, Day, ZA) %>%
  summarise(Sum=sum(ZA.y)) %>%                           
  ungroup() %>% 
  select(id:Sum) },

order = "elapsed")[1:4]

giving:

        test replications elapsed relative
1      sqldf          100    3.38    1.000
2 data.table          100    4.05    1.198
3      dplyr          100    9.23    2.731

Not sure how this compares speed-wise (it processes the rows one at a time), but it should be much more memory-efficient, since it avoids the large intermediate join:

# all channels present in the data
Channels = breaks[, unique(Channel)]

# for each row, roll-join to the nearest Time in every other channel on the
# same Day and sum the ZA values of the matched rows
breaks[, Sum := breaks[breaks[row,
                              .(Day, Channel = setdiff(Channels, Channel), Time)],
                       on = .(Day, Channel, Time), roll = 'nearest',
                       sum(ZA)]
       , by = .(row = 1:nrow(breaks))]

It can be made a bit faster by calling setkey(breaks, Day, Channel, Time) beforehand and dropping the on argument.
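
A sketch of that keyed variant (same logic as above, relying on the key instead of on; untested beyond the example data):

setkey(breaks, Day, Channel, Time)   # key order must match the join columns
Channels = breaks[, unique(Channel)]

breaks[, Sum := breaks[breaks[row,
                              .(Day, Channel = setdiff(Channels, Channel), Time)],
                       roll = 'nearest',
                       sum(ZA)]
       , by = .(row = 1:nrow(breaks))]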


Here is a solution using dplyr:

library(dplyr)
breaks %>% 
  inner_join(breaks, by=c("Day"), suffix=c("",".y")) %>%  # self-join
  filter(Channel != Channel.y) %>%                        # ignore events of same channel
  group_by(id, Channel, Time, Day, ZA, Channel.y) %>%     # build group for every event
  arrange(abs(Time - Time.y)) %>%                         # sort by minimal time-diff
  filter(row_number()==1) %>%                             # keep just row with minimal time-diff
  group_by(id, Channel, Time, Day, ZA) %>%                # group by all columns of original event
  summarise(Sum=sum(ZA.y)) %>%                            # sum ZA of other channels
  ungroup() %>% 
  select(id:Sum)

Maybe I should be more specific about my answer. Unlike data.table, dplyr can translate code into SQL. So if your data is stored in a database, you can connect directly to the table that contains it, and all (or most) of the dplyr code is evaluated in your DBMS. Since joining is a core task of every DBMS, you don't need to worry about performance there.
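
For illustration, a minimal sketch of that database path, assuming the dbplyr and RSQLite backends are installed (the in-memory database and table name are just for the example); show_query() prints the SQL that would run inside the DBMS:

library(DBI)
library(dplyr)

# copy the example data into an SQLite database and create a lazy reference to it
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
DBI::dbWriteTable(con, "breaks", as.data.frame(breaks))
breaks_db <- tbl(con, "breaks")

breaks_db %>%
  inner_join(breaks_db, by = "Day") %>%   # duplicated columns get .x / .y suffixes
  filter(Channel.x != Channel.y) %>%
  show_query()                            # the join and filter are translated to SQL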

However, if your data is imported into R and you are worried about RAM limitations, you need to iterate over each row of data. This can be done with dplyr:

library(dplyr)
breaks %>% 
rowwise() %>% 
do({
  row = as_data_frame(.)
  df =
    breaks %>%
    filter(Day == row$Day & Channel != row$Channel) %>% 
    mutate(time_diff = abs(Time-row$Time)) %>% 
    group_by(Channel) %>% 
    arrange(abs(Time-row$Time), .by_group=TRUE) %>% 
    filter(row_number()==1) %>% 
    ungroup() %>% summarise(sum(ZA))

  row %>% mutate(sumZA = df[[1]])
})

Source: https://habr.com/ru/post/1686019/

