: , , , . , , OP ( ), . .
OP, 30 , ( : 250 * 2000 * 5 = 2,5 ))
, . data.table .
NA.
data.table :
rleid()
library(data.table)
# coerce to data.table
setDT(Returns)
# convert character dates
Returns[, Date := as.IDate(Date, "%d.%m.%y")][]
# make sure data is ordered
setorder(Returns, Company, Date)[]
Returns[, Company %like% "DEAD" & r == 0 & rleid(r == 0) == max(rleid(r == 0)),
by = Company]
:
Company V1
1: ORKLA FALSE
2: ORKLA FALSE
3: ORKLA FALSE
4: VISMA DEAD 04.09.17 FALSE
5: VISMA DEAD 04.09.17 FALSE
6: VISMA DEAD 04.09.17 FALSE
7: VISMA DEAD 04.09.17 TRUE
8: VISMA DEAD 04.09.17 TRUE
9: VISMA DEAD 04.09.17 TRUE
10: XNEWCO FALSE
11: XNEWCO FALSE
12: XNEWCO FALSE
V1 DT :
Returns[Returns[, Company %like% "DEAD" & r == 0 & rleid(r == 0) == max(rleid(r == 0)),
by = Company]$V1, r := NA_real_][]
Date Company r
1: 2017-09-04 ORKLA 0.04
2: 2017-09-05 ORKLA 0.00
3: 2017-09-06 ORKLA -0.02
4: 2017-09-01 VISMA DEAD 04.09.17 0.01
5: 2017-09-02 VISMA DEAD 04.09.17 0.00
6: 2017-09-03 VISMA DEAD 04.09.17 -0.03
7: 2017-09-04 VISMA DEAD 04.09.17 NA
8: 2017-09-05 VISMA DEAD 04.09.17 NA
9: 2017-09-06 VISMA DEAD 04.09.17 NA
10: 2017-09-04 XNEWCO 0.01
11: 2017-09-05 XNEWCO 0.00
12: 2017-09-06 XNEWCO 0.00
Returns[, {tmp <- last(which(r != 0))
if (Company %like% "DEAD" & tmp < .N) .I[seq.int(tmp + 1L, .N)]}, by = Company]
, . .I .N data.table. if (Company %like% "DEAD" & tmp < .N) .
Company V1
1: VISMA DEAD 04.09.17 7
2: VISMA DEAD 04.09.17 8
3: VISMA DEAD 04.09.17 9
, V1 Returns :
Returns[Returns[, {tmp <- last(which(r != 0))
if (Company %like% "DEAD" & tmp < .N) .I[seq.int(tmp + 1L, .N)]},
by = Company]$V1, r := NA_real_][]
Date Company r
1: 2017-09-04 ORKLA 0.04
2: 2017-09-05 ORKLA 0.00
3: 2017-09-06 ORKLA -0.02
4: 2017-09-01 VISMA DEAD 04.09.17 0.01
5: 2017-09-02 VISMA DEAD 04.09.17 0.00
6: 2017-09-03 VISMA DEAD 04.09.17 -0.03
7: 2017-09-04 VISMA DEAD 04.09.17 NA
8: 2017-09-05 VISMA DEAD 04.09.17 NA
9: 2017-09-06 VISMA DEAD 04.09.17 NA
10: 2017-09-04 XNEWCO 0.01
11: 2017-09-05 XNEWCO 0.00
12: 2017-09-06 XNEWCO 0.00
Benchmark
Hack-R , 1 . , .
library(data.table)
# create benchmark data
n_days <- 100L
n_comp <- 100L
n_dead <- round(0.1 * n_comp) # 10 per cent of companies are dead
Date <- seq(from = as.IDate("2015-01-01"), length.out = n_days, by = "1 day")
# company "names" consist of 4 digits at least
Company <- sprintf("%04i", seq_len(n_comp))
# cross join to create all combinations
Returns <- CJ(Date = Date, Company = Company)
set.seed(1L) # reuired for reproducible result
Returns[, r := round(rnorm(.N)/10.0, 2L)][]
# dead companies
dead <- data.table(Company = sample(Company, n_dead),
dead.date = sample(Date, n_dead))
# modify Returns
Returns[dead, on = .(Company, Date >= dead.date), r := 0]
# modify compay names
Returns[dead, on = "Company", Company := paste(Company, "DEAD", dead.date)]
# IMPORTANT: set order
setorder(Returns, Company, Date)
# keep original version
R0 <- copy(Returns)
microbenchmark::microbenchmark(
copy = Returns <- copy(R0),
hackr1 = {
mydat <- setDF(copy(R0))
for(i in 1:nrow(mydat)){
if(i==nrow(mydat) & mydat$r[i]==0) {mydat$r[i] <- NA
} else if(!is.na(mydat$r[i]) & mydat$r[i]==0 & mydat$r[i+1]==0) mydat$r[i] <- NA
}
res_hackr1 <- mydat
},
hackr2 = {
mydat <- copy(R0)
tmp0 <- mydat[0,]
for(c in unique(mydat$Company)){
tmp <- mydat[mydat$Company==c,]
for(i in 1:nrow(tmp)){
if(!is.na(tmp$r[i]) & tmp$r[i]==0 & tmp$r[i+1]==0){
tmp$r[i:nrow(tmp)] <- NA
}
}
tmp0 <- rbind(tmp0, tmp)
}
res_hackr2 <- tmp0
},
dt_rleid1 = {
Returns <- copy(R0)
Returns[Returns[, Company %like% "DEAD" & r == 0 & rleid(r == 0) == max(rleid(r == 0)),
by = Company]$V1, r := NA_real_]
res_dt_rleid1 <- copy(Returns)
},
dt_rleid2 = {
Returns <- copy(R0)
Returns[Company %like% "DEAD" & Returns[, r == 0 & rleid(r == 0) == max(rleid(r == 0)),
by = Company]$V1, r := NA_real_]
res_dt_rleid2 <- copy(Returns)
},
dt_last = {
Returns <- copy(R0)
Returns[Returns[, {
tmp <- last(which(r != 0))
if (Company %like% "DEAD") .I[tmp + seq_len(.N - tmp)]
},
by = Company]$V1, r := NA_real_]
res_dt_last <- copy(Returns)
},
dt_last2 = {
Returns <- copy(R0)
Returns[Returns[, {
tmp <- last(which(r != 0))
if (Company %like% "DEAD" & tmp < .N) .I[seq.int(tmp + 1L, .N)]
},
by = Company]$V1, r := NA_real_]
res_dt_last2 <- copy(Returns)
},
times = 11L
)
, copy() "" , . , copy().
dt_rleid1 dt_rleid2, dt_last dt_last2 .
, hackr2 :
if (! is.na(tmp $r [i]) tmp $r [i] == 0 tmp $r [i + 1] == 0) {:
TRUE/FALSE
:
Unit: microseconds
expr min lq mean median uq max neval cld
copy 46.065 48.331 53.75427 52.485 58.1475 66.077 11 a
hackr1 267515.143 269559.179 277240.15827 271093.857 275196.8435 329919.874 11 b
dt_rleid1 2203.942 2404.060 3130.73218 2690.267 3728.9925 4813.783 11 a
dt_rleid2 2577.370 2665.346 5750.63073 2700.839 2741.0510 36395.429 11 a
dt_last 1605.098 1627.564 1718.85318 1654.561 1724.6030 2036.296 11 a
dt_last2 1665.134 1718.372 1945.67645 1764.438 1769.5350 3909.476 11 a
data.table , Hack-R 100 100 = 10 . Hack-R 1000 x 1000 = 1 M , .
1 M , , rleid() 5-6 .
Unit: milliseconds
expr min lq mean median uq max neval cld
copy 6.602008 6.843094 21.23383 7.297889 13.61614 141.5794 11 a
dt_rleid1 63.282609 70.239165 142.21568 193.972143 199.32077 224.5657 11 b
dt_rleid2 157.939571 281.185658 266.62148 288.184692 291.61445 309.5796 11 c
dt_last 35.826792 39.198781 101.66298 48.387030 172.40187 182.2354 11 b
dt_last2 36.507194 43.754676 103.95414 48.879018 173.66035 183.1639 11 b