Merge for all "occurrences" of a non-key variable

I have two data sets, and what I want could loosely be called an "outer join on a non-key variable".

Here are the datasets

Dataset 1

oc  oc2 state_id    r_state 
A011    A01 1808    1.00    
A011    A01 1810    0.50    
A012    A01 1810    0.50    
A011    A01 1814    0.33    
A012    A01 1814    0.33    
A013    A01 1814    0.33    

Dataset 2

oc  r_country
A011    0.62
A012    0.14
A013    0.24

What I want as output:

oc  oc2 state_id    r_state r_country
A011    A01 1808    1.00    0.62
A012    A01 1808    NA      0.14
A013    A01 1808    NA      0.24
A011    A01 1810    0.50    0.62
A012    A01 1810    0.50    0.14
A013    A01 1810    NA      0.24
A011    A01 1814    0.33    0.62
A012    A01 1814    0.33    0.14
A013    A01 1814    0.33    0.24

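That is, rows 2, 3 and 6 are "added". I need to do this in R. If I merge by oc, merge does not add these rows. If I merge by oc2, it adds too many. Note that oc2 is a higher-level grouping of oc. I am using data.tables. In the real data there are about 50 values of oc2, and each oc2 contains from 1 to 9 values of oc. There are also 47 states.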

# Attach data.table explicitly so the example runs in a fresh R session
# (data.table() is not available otherwise).
library(data.table)

# Dataset 1: one row per (oc, state_id) combination that actually occurs.
# oc2 is the coarser code that each oc belongs to.
DT1 <- data.table(
  oc       = c("A011", "A011", "A012", "A011", "A012", "A013"),
  oc2      = rep("A01", 6),
  state_id = c(1808, 1810, 1810, 1814, 1814, 1814),
  r_state  = c(1, 0.5, 0.5, 0.33, 0.33, 0.33)
)

# Dataset 2: one row per oc with its country-level value.
DT2 <- data.table(
  oc        = c("A011", "A012", "A013"),
  r_country = c(0.62, 0.14, 0.24)
)

data.table, data.frame.

+4
2

A possible solution:

library(zoo) # for the 'na.locf'-function

# Step 1: expand DT1 to every (oc, state_id) combination. CJ() builds the
# cross product of the unique values; combinations that do not occur in DT1
# come back with NA in oc2 and r_state.
filled <- DT1[CJ(oc = oc, state_id = state_id, unique = TRUE),
              on = .(oc, state_id)]

# Step 2: fill the missing oc2 values within each oc.
#  * Grouping by oc (not state_id) keeps the fill correct when the data holds
#    more than one oc2: a missing row then never inherits the oc2 of an
#    unrelated neighbouring oc. This assumes every oc belongs to exactly one
#    oc2, as in the sample data.
#  * na.rm = FALSE is required: by default na.locf() drops leading NAs, which
#    returns a vector shorter than the group and makes `:=` fail.
#  * The second, backward pass (fromLast = TRUE) fills those leading NAs.
filled[, oc2 := na.locf(na.locf(oc2, na.rm = FALSE),
                        fromLast = TRUE, na.rm = FALSE),
       by = "oc"]

# Step 3: update join that pulls the country-level value from DT2.
# The i. prefix makes it explicit that the value comes from DT2.
filled[DT2, on = .(oc), r_country := i.r_country]

# Step 4: show the result ordered as requested (by state_id, then oc).
filled[order(state_id, oc)]

which gives:

     oc oc2 state_id r_state r_country
1: A011 A01     1808    1.00      0.62
2: A012 A01     1808      NA      0.14
3: A013 A01     1808      NA      0.24
4: A011 A01     1810    0.50      0.62
5: A012 A01     1810    0.50      0.14
6: A013 A01     1810      NA      0.24
7: A011 A01     1814    0.33      0.62
8: A012 A01     1814    0.33      0.14
9: A013 A01     1814    0.33      0.24

As @Frank pointed out, the same can be done without na.locf from the zoo package, using an update join instead:

# Step 1: expand DT1 to every (oc, state_id) combination; combinations that
# do not occur in DT1 get NA in oc2 and r_state.
res <- DT1[CJ(oc = oc, state_id = state_id, unique = TRUE),
           on = .(oc, state_id)]

# Step 2: update join that pulls the country-level value from DT2.
res[DT2, on = .(oc), r_country := i.r_country]

# Step 3: restore oc2 from an oc -> oc2 lookup built from DT1.
# Joining on oc (rather than state_id) matters once the data contains more
# than one oc2: an update join on state_id would overwrite oc2 for every row
# of a state with whichever DT1 row happens to match last.
# This assumes every oc belongs to exactly one oc2, as in the sample data.
res[unique(DT1[, .(oc, oc2)]), on = .(oc), oc2 := i.oc2]

# Step 4: show the result ordered as requested (by state_id, then oc).
res[order(state_id, oc)]
+5

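Thinking of it in SQL terms, the steps are:

  • take the distinct values of oc as oc,
  • take the distinct values of state_id as state_id,
  • cross join the two and left join DT1 to the result,
  • then left join DT2.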

In code:

library(sqldf)

# Build the full oc x state_id grid in SQL and attach both data sets to it.
#  * oc / state_id : distinct values found in DT1
#  * oc_group      : oc -> oc2 lookup, so that rows added by the cross join
#                    still get their oc2 (taking oc2 from the left join on DT1
#                    would leave it NA for exactly those rows).
#                    Assumes every oc belongs to exactly one oc2, as in the
#                    sample data.
# Explicit ON conditions and column aliases are used instead of
# `select *` / USING, so the output columns are unambiguous.
sql <- "
     with
       oc as (select distinct oc from DT1),
       state_id as (select distinct state_id from DT1),
       oc_group as (select distinct oc, oc2 from DT1)

     select oc.oc as oc,
            state_id.state_id as state_id,
            oc_group.oc2 as oc2,
            DT1.r_state as r_state,
            DT2.r_country as r_country
       from oc
       cross join state_id
       left join oc_group on oc_group.oc = oc.oc
       left join DT1 on DT1.oc = oc.oc and DT1.state_id = state_id.state_id
       left join DT2 on DT2.oc = oc.oc
       order by state_id.state_id, oc.oc"

sqldf(sql)

# Result:
#
#     oc state_id oc2 r_state r_country
# 1 A011     1808 A01    1.00      0.62
# 2 A012     1808 A01      NA      0.14
# 3 A013     1808 A01      NA      0.24
# 4 A011     1810 A01    0.50      0.62
# 5 A012     1810 A01    0.50      0.14
# 6 A013     1810 A01      NA      0.24
# 7 A011     1814 A01    0.33      0.62
# 8 A012     1814 A01    0.33      0.14
# 9 A013     1814 A01    0.33      0.24
+1

Source: https://habr.com/ru/post/1691092/


All Articles