How to do a "serial join" in data.table?

I have two data.tables: an experiment data table x and a category lookup table dict.

library(data.table)
# Fix the RNG seed so the 19 random draws for `y` are reproducible.
set.seed(123)

# Experiment data: `samp` is the sample id (ids repeat), `y` the measurement.
x <- data.table(
  samp = c(1, 1, 2, 3, 3, 3, 4, 5, 5, 5, 6, 7, 7, 7, 8, 9, 9, 10, 10),
  y = rnorm(19)
)
x

     samp    y
 #1:  1 -0.56047565
 #2:  1 -0.23017749
 #3:  2  1.55870831
 #4:  3  0.07050839
 #5:  3  0.12928774
 #6:  3  1.71506499
 #7:  4  0.46091621
 #8:  5 -1.26506123
 #9:  5 -0.68685285
#10:  5 -0.44566197
#11:  6  1.22408180
#12:  7  0.35981383
#13:  7  0.40077145
#14:  7  0.11068272
#15:  8 -0.55584113
#16:  9  1.78691314
#17:  9  0.49785048
#18: 10 -1.96661716
#19: 10  0.70135590

# Category lookup: each row assigns sample `samp` to category `cat`.
# Samples 4, 5, 7 and 8 are listed under two categories each.
dict <- data.table(
  samp = c(1:5, 4:8, 7:10),
  cat = rep(c(1, 2, 3), times = c(5, 5, 4))
)

dict
#     samp cat
# 1:  1   1
# 2:  2   1
# 3:  3   1
# 4:  4   1
# 5:  5   1
# 6:  4   2
# 7:  5   2
# 8:  6   2
# 9:  7   2
# 10:  8   2
# 11:  7   3
# 12:  8   3
# 13:  9   3
# 14: 10   3

For each samp, I need to first calculate the product of all y values associated with it. Then I need to calculate the sum of these products for each sample category indicated in dict$cat. Please note that a samp can belong to more than one dict$cat.

One way to do this is to merge x and dict right away, allowing rows to be duplicated (allow.cartesian=TRUE):

# Method 1: join first, then aggregate twice.
# Keying both tables by `samp` lets `dict[x]` join on that column; setkey()
# also sorts each table by reference.
setkey(dict, samp)
setkey(x, samp)
# A `samp` listed under several categories matches several rows of `dict`, so
# the join returns more rows than `x` has; allow.cartesian = TRUE permits that.
step0 <- dict[x, allow.cartesian = TRUE]
setkey(step0, samp, cat)
# Product of `y` per (samp, cat). `cat` is already a grouping column, so it is
# not repeated in `j`.
step1 <- step0[, list(prodY = prod(y)), by = c("samp", "cat")]
# Sum of the per-sample products within each category.
resMet1 <- step1[, sum(prodY), by = "cat"]

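This works, but I am not happy with it. The merge step duplicates rows of x for every additional category (and memory along with them?). For large data, that could become a problem.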

Another way is to loop over the categories in dict$cat and subset x for each one, like this:

# Method 2: handle one category at a time, so `x` is never joined against
# all of `dict` at once.
setkey(x, samp)
setkey(dict, samp)

# One single-row result per category, collected by position. Indexing the
# list by the category value itself would only work when the categories
# happen to be 1, 2, ..., k.
pool <- lapply(unique(dict$cat), function(k) {
  # Rows of `x` whose `samp` is listed under category `k`, with `cat` attached.
  thisCat <- x[J(dict[cat == k])]
  setkey(thisCat, samp)
  # Product of `y` per sample; `cat` is constant here and is carried along.
  step1 <- thisCat[, list(prodY = prod(y), cat = cat[1]), by = "samp"]
  step1[, sum(prodY), by = "cat"]
})
resMet2 <- rbindlist(pool)

This works too, but the explicit loop feels clumsy. Is there a more idiomatic data.table way to do this, perhaps using J()?

+4
2

IIUC, the problem is: for each dict$cat, get prod(y) for each sample in that cat, and then sum them up.

Building this up step by step:

  • For each dict$cat - that is, group by cat:

    dict[, ,by=cat]
    

    With the grouping in place, all that remains is to fill in j.

  • Get prod(y) from x for each sample in this group:

    x[samp %in% .SD$samp, prod(y), by=samp]
    

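    This extracts the rows of x whose samp is in the current group (via .SD, the subset of dict for that group) and computes prod(y) on them, grouped by samp. Great!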

  • The products still need to be summed.

    sum(x[samp %in% .SD$samp, prod(y), by=samp]$V1)
    
  • That completes the j expression. Putting it all together:

    dict[, sum(x[samp %in% .SD$samp, prod(y), by=samp]$V1), by=cat]
    #    cat         V1
    # 1:   1  1.7770272
    # 2:   2  0.7578771
    # 3:   3 -1.0295633
    

, .


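Note 1: this computes prod(y) redundantly for samples that belong to more than one category, but it does not materialise much intermediate data. If the repeated work becomes too slow, e.g. with many groups, prod(y) can be computed once up front: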

# Per-sample product of `y`, computed once up front.
x_p <- x[, list(p = prod(y)), by = "samp"]

With that, j simplifies to:

# For each category, keep the rows of `x_p` whose `samp` appears in that group
# (`.SD` is the group's subset of `dict`) and sum their products `p`.
dict[, x_p[samp %in% .SD$samp, sum(p)], by=cat]

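Note 2: the %in% expression on x's samp column is optimised by data.table to use an index with binary search rather than a full vector scan, so performance should not be a concern here.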

+2

Summarise x by samp first, then join:

# Collapse `x` to one row per sample: `py` is the product of that sample's
# `y` values.
xprod <- x[, list(py = prod(y)), by = "samp"]

# Attach the per-sample product to every (samp, cat) row of `dict`, then add
# the products up within each category.
res2 <- xprod[dict, on = "samp"][, sum(py), by = "cat"]

identical(res2, resMet2) # test passed

If samp is the row number of xprod (as it is here), you can subset by row instead of joining:

# `dict$samp` is used here as row numbers into `xprod` (not as a join key),
# and `dict$cat` supplies the group label for each selected row.
res3 <- xprod[(dict$samp), sum(py), by=.(cat=dict$cat)]

identical(res3, resMet2) # test passed

, .

+2

Source: https://habr.com/ru/post/1611915/


All Articles