Data.table - left outer join on multiple tables

Suppose you have data such as

# Sample tables: one row per fruit, plus colors and tastes that refer back to a
# fruit through their FruitID column.
fruits <- data.table(FruitID=c(1,2,3), Fruit=c("Apple", "Banana", "Strawberry"))
colors <- data.table(ColorID=c(1,2,3,4,5), FruitID=c(1,1,1,2,3), Color=c("Red","Yellow","Green","Yellow","Red"))
tastes <- data.table(TasteID=c(1,2,3), FruitID=c(1,1,3), Taste=c("Sweeet", "Sour", "Sweet"))

# Key each table by its own ID column; note that colors and tastes are keyed by
# ColorID / TasteID here, not by the FruitID column used for joining later.
setkey(fruits, "FruitID")
setkey(colors, "ColorID")
setkey(tastes, "TasteID")

fruits
   FruitID      Fruit
1:       1      Apple
2:       2     Banana
3:       3 Strawberry

colors
   ColorID FruitID  Color
1:       1       1    Red
2:       2       1 Yellow
3:       3       1  Green
4:       4       2 Yellow
5:       5       3    Red

tastes
   TasteID FruitID  Taste
1:       1       1 Sweeet
2:       2       1   Sour
3:       3       3  Sweet

I usually need to do a left outer join on data like this. For example, “give me all the fruits and their colors” requires me to write the following (or maybe there is a better way?)

# Temporarily key colors by the join column so that colors[fruits] can match on it.
setkey(colors, "FruitID")
# For every row of fruits, pull the matching rows of colors; allow.cartesian=TRUE
# lets one fruit match several colors without tripping data.table's size check.
result <- colors[fruits, allow.cartesian=TRUE]
# Put back the key colors had before the join.
setkey(colors, "ColorID")

Three lines of code for such a simple and frequent task seemed excessive, so I wrote a method myLeftJoin

myLeftJoin <- function(tbl1, tbl2){
  # Performs a left join using the key in tbl1 (i.e. keeps all rows from tbl1
  # and only matching rows from tbl2).
  #
  # Args:
  #   tbl1: a keyed data.table; its key columns are the join columns.
  #   tbl2: a data.table that contains the key columns of tbl1. It is re-keyed
  #         by reference for the duration of the join and its previous key is
  #         restored before the function exits.
  #
  # Returns:
  #   The result of tbl2[tbl1, allow.cartesian=TRUE].

  joinkey <- key(tbl1)
  if (is.null(joinkey)) {
    # Fail before touching tbl2: re-keying with NULL would drop its key.
    stop("tbl1 must have a key; set one with setkey() first", call. = FALSE)
  }

  oldkey <- key(tbl2)
  # setkeyv() modifies tbl2 by reference, so make sure the caller's key is
  # restored even when the join below raises an error.
  on.exit(setkeyv(tbl2, oldkey), add = TRUE)

  setkeyv(tbl2, joinkey)
  tbl2[tbl1, allow.cartesian=TRUE]
}

which I can use as

myLeftJoin(fruits, colors)
   ColorID FruitID  Color      Fruit
1:       1       1    Red      Apple
2:       2       1 Yellow      Apple
3:       3       1  Green      Apple
4:       4       2 Yellow     Banana
5:       5       3    Red Strawberry

How can I extend this method so that I can pass any number of tables to it and get the associated left outer join of all of them? Something like myLeftJoin(tbl1, ...)

For example, I would like the result of myLeftJoin(fruits, colors, tastes) to be equivalent to

# Temporarily key both lookup tables by the shared join column.
setkey(colors, "FruitID")
setkey(tastes, "FruitID")
# Inner join first (colors onto fruits), then look up tastes for each resulting row;
# allow.cartesian=TRUE permits the row multiplication from one-to-many matches.
result <- tastes[colors[fruits, allow.cartesian=TRUE], allow.cartesian=TRUE]
# Put back the keys the lookup tables had before the joins.
setkey(tastes, "TasteID")
setkey(colors, "ColorID")

result
   TasteID FruitID  Taste ColorID  Color      Fruit
1:       1       1 Sweeet       1    Red      Apple
2:       2       1   Sour       1    Red      Apple
3:       1       1 Sweeet       2 Yellow      Apple
4:       2       1   Sour       2 Yellow      Apple
5:       1       1 Sweeet       3  Green      Apple
6:       2       1   Sour       3  Green      Apple
7:      NA       2     NA       4 Yellow     Banana
8:       3       3  Sweet       5    Red Strawberry

Perhaps there is an elegant solution using the methods in the data.table package that I missed? Thanks

(EDIT: Bug fixed in my data)

+4
2

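As of data.table v1.9.5 (the development version), joins can be done directly with the `on=` argument (that is, without first setting keys with setkey()):

Using your example: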

require(data.table) # v1.9.5+
# Note: X[Y, on=...] returns one result row per row of Y, so this chain keeps
# every row of `colors` rather than every row of `fruits`. FruitID 2 has no
# row in `tastes`, so it drops out of the first join and comes back in the
# second with Fruit = NA (row 7 below).
fruits[tastes, on="FruitID"][colors, on="FruitID"] # no setkey required
#    FruitID      Fruit TasteID  Taste ColorID  Color
# 1:       1      Apple       1 Sweeet       1    Red
# 2:       1      Apple       2   Sour       1    Red
# 3:       1      Apple       1 Sweeet       2 Yellow
# 4:       1      Apple       2   Sour       2 Yellow
# 5:       1      Apple       1 Sweeet       3  Green
# 6:       1      Apple       2   Sour       3  Green
# 7:       2         NA      NA     NA       4 Yellow
# 8:       3 Strawberry       3  Sweet       5    Red
+8

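You could use base R's Reduce to left_join (from dplyr) the list of data.table objects, provided the tables share the name of the join column and you do not need keys to be set on the data.table objects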

library(data.table) # <= v1.9.4
library(dplyr) # left_join

# Fold left_join over the tables, starting from fruits so that every fruit row
# is kept. The join column is named explicitly so the result does not depend
# on which other column names the tables happen to share.
Reduce(
  function(x, y) left_join(x, y, by = "FruitID"),
  list(fruits, colors, tastes)
)

# Source: local data table [8 x 6]

#  FruitID      Fruit ColorID  Color TasteID  Taste
#1       1      Apple       1    Red       1 Sweeet
#2       1      Apple       1    Red       2   Sour
#3       1      Apple       2 Yellow       1 Sweeet
#4       1      Apple       2 Yellow       2   Sour
#5       1      Apple       3  Green       1 Sweeet
#6       1      Apple       3  Green       2   Sour
#7       2     Banana       4 Yellow      NA     NA
#8       3 Strawberry       5    Red       3  Sweet

A pure data.table alternative, as suggested by @Frank (note that this requires every data.table to be keyed by FruitID)

library(data.table) # <= v1.9.4
# Fold the keyed join y[x] over the tables, starting from fruits. Without on=,
# y[x] matches on the key of y, so each table needs to be keyed by the join
# column (FruitID) beforehand -- with the ColorID / TasteID keys set earlier on
# this page the matches would be made on the wrong column.
Reduce(function(x,y) y[x, allow.cartesian=TRUE], list(fruits,colors,tastes))
+6

Source: https://habr.com/ru/post/1598412/


All Articles