How to do a fuzzy join with fuzzyjoin::difference_* in R

I am working with two different datasets that I want to join on a threshold basis. Let's say the two data frames look like this:

library(dplyr)
library(fuzzyjoin)
library(lubridate)

# Example data for the left-hand side of the join.
# `tibble()` replaces the deprecated `data_frame()`; dplyr re-exports it, so
# no additional package has to be loaded.
df1 <- tibble(
  Item = 1:5,
  DateTime = c(
    "2015-01-01 11:12:14", "2015-01-02 09:15:23",
    "2015-01-02 15:46:11", "2015-04-19 22:11:33",
    "2015-06-10 07:00:00"
  ),
  Count = c(1, 6, 11, 15, 9),
  Name = "Sterling", # length-one value, recycled to every row
  Friend = c("Pam", "Cyril", "Cheryl", "Mallory", "Lana")
)
# Parse the character timestamps into date-times.
df1$DateTime <- ymd_hms(df1$DateTime)

# Example data for the right-hand side of the join. Some rows are close to
# their df1 counterparts in DateTime and Count, others are not.
df2 <- tibble(
  Item = 21:25,
  DateTime = c(
    "2015-01-01 11:12:15", "2015-01-02 19:15:23",
    "2015-01-02 15:46:11", "2015-05-19 22:11:33",
    "2015-06-10 07:00:02"
  ),
  Count = c(3, 7, 11, 15, 8),
  Name = "Sterling", # length-one value, recycled to every row
  Friend = c("Pam", "Kreger", "Woodhouse", "Gillete", "Lana")
)
# Parse the character timestamps into date-times.
df2$DateTime <- ymd_hms(df2$DateTime)

Now I would like to left join df2 to df1 based on fuzzy matching of DateTime and Count, each within two seconds/units of its respective value, while all other values except Item are identical. I thought I could get there with the following:

# Left join df1 with df2 on DateTime and Count, using a maximum distance of 2
# for the match on each column.
difference_left_join(
  df1,
  df2,
  by = c("DateTime", "Count"),
  max_dist = 2
)

But this gives me the following result:

 # A tibble: 8 × 10
  Item.x          DateTime.x Count.x   Name.x Friend.x Item.y          DateTime.y Count.y   Name.y  Friend.y
   <int>              <dttm>   <dbl>    <chr>    <chr>  <int>              <dttm>   <dbl>    <chr>     <chr>
1      1 2015-01-01 11:12:14       1 Sterling      Pam     21 2015-01-01 11:12:15       3 Sterling       Pam
2      1 2015-01-01 11:12:14       1 Sterling      Pam     21 2015-01-01 11:12:15       3 Sterling       Pam
3      2 2015-01-02 09:15:23       6 Sterling    Cyril     NA                <NA>      NA     <NA>      <NA>
4      3 2015-01-02 15:46:11      11 Sterling   Cheryl     23 2015-01-02 15:46:11      11 Sterling Woodhouse
5      3 2015-01-02 15:46:11      11 Sterling   Cheryl     23 2015-01-02 15:46:11      11 Sterling Woodhouse
6      4 2015-04-19 22:11:33      15 Sterling  Mallory     NA                <NA>      NA     <NA>      <NA>
7      5 2015-06-10 07:00:00       9 Sterling     Lana     25 2015-06-10 07:00:02       8 Sterling      Lana
8      5 2015-06-10 07:00:00       9 Sterling     Lana     25 2015-06-10 07:00:02       8 Sterling      Lana

This is close, except that row 3 should not have been joined, given that the names (Friend) are different (and I would have expected row 2 to be joined based on the threshold values, although I do not want that).

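How can I do this? Essentially, I want to left join df2 so that rows match when DateTime and Count are within the threshold and all other columns (except Item) are identical.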

desired_output
#   Item            DateTime Count     Name  Friend
# 1    3 2015-01-02 15:46:11    11 Sterling  Cheryl
# 2    2 2015-01-02 09:15:23     6 Sterling   Cyril
# 3    5 2015-06-10 07:00:00     9 Sterling    Lana
# 4   25 2015-06-10 07:00:02     8 Sterling    Lana
# 5    4 2015-04-19 22:11:33    15 Sterling Mallory
# 6    1 2015-01-01 11:12:14     1 Sterling     Pam
# 7   21 2015-01-01 11:12:15     3 Sterling     Pam
+4
1

, , , , .

, . , :

library(dplyr)
library(fuzzyjoin)
library(lubridate)

# Example data for the left-hand side of the join.
# `tibble()` replaces the deprecated `data_frame()`; dplyr re-exports it, so
# no additional package has to be loaded.
df1 <- tibble(
  Item = 1:5,
  DateTime = c(
    "2015-01-01 11:12:14", "2015-01-02 09:15:23",
    "2015-01-02 15:46:11", "2015-04-19 22:11:33",
    "2015-06-10 07:00:00"
  ),
  Count = c(1, 6, 11, 15, 9),
  Name = "Sterling", # length-one value, recycled to every row
  Friend = c("Pam", "Cyril", "Cheryl", "Mallory", "Lana")
)
# Keep the original strings in DateTime and add DateTime1 as plain numeric
# seconds since the epoch, so a distance on this column is measured in
# seconds. Converting the parsed date-time directly avoids a detour through
# a lubridate Period object.
df1$DateTime1 <- as.numeric(ymd_hms(df1$DateTime))

# Example data for the right-hand side of the join.
df2 <- tibble(
  Item = 21:25,
  DateTime = c(
    "2015-01-01 11:12:15", "2015-01-02 19:25:56",
    "2015-01-02 15:46:11", "2015-05-19 22:11:33",
    "2015-06-10 07:00:02"
  ),
  Count = c(3, 6, 11, 15, 8),
  Name = "Sterling", # length-one value, recycled to every row
  Friend = c("Pam", "Kreger", "Woodhouse", "Gillete", "Lana")
)
# Same numeric seconds-since-epoch column as in df1.
df2$DateTime1 <- as.numeric(ymd_hms(df2$DateTime))

# Left join df1 with df2 on the numeric DateTime1 column and on Count, using
# a maximum distance of 2 for the match on each column.
difference_left_join(
  df1,
  y = df2,
  by = c("DateTime1", "Count"),
  max_dist = 2
)

, :

# Restrict both tables to the rows whose Friend values agree, then run the
# same fuzzy left join on DateTime1 and Count.
# NOTE(review): the Friend comparison is positional (row i of df1 against
# row i of df2), so it presumably relies on both tables having the same
# number of rows in the same order -- confirm before reusing on other data.
difference_left_join(
  df1[df2$Friend == df1$Friend, ],
  y = df2[df2$Friend == df1$Friend, ],
  by = c("DateTime1", "Count"),
  max_dist = 2
)

Friend, , , & .

+3

Source: https://habr.com/ru/post/1655505/


All Articles