Averaging data.table column values selected by a matrix of row indices

I have a data.table containing the x, y, z coordinates of 10,000 points (for this example) in a unit cube, and each point has a corresponding attribute (called P). I used nn2 from the RANN package to find the indices of the nearest neighbours (up to k = 50) of each point within a radius of 0.075 units, searching within the original data.frame; the result is returned as a matrix.

library(RANN)
library(data.table)

# Simulate 10,000 points in the unit cube, each carrying an attribute P drawn
# uniformly from [10, 30]. Columns are named at construction, which avoids the
# copying `colnames<-` replacement form on a data.table.
set.seed(1L) # for reproducible data
DATA <- data.table(
  x = runif(10000, 0, 1),
  y = runif(10000, 0, 1),
  z = runif(10000, 0, 1),
  P = runif(10000, 10, 30)
)

# For every point, look up at most k = 50 neighbours within a radius of 0.075.
# `nn.idx` is a matrix with one row per point whose entries are row indices
# into DATA. Per the RANN documentation, slots without a neighbour inside the
# radius are filled with 0, which R silently drops when used as a subscript.
nn.idx <- nn2(DATA[, 1:3], DATA[, 1:3],
  k = 50,
  treetype = "kd", searchtype = "radius",
  radius = 0.075
)$nn.idx

The following for loop does the job, but I was wondering whether there is a way to speed it up by vectorizing it, because it will not scale when applied to millions of points. Simply put, I want to use nn.idx to get the corresponding values of P from DATA and calculate the average P, which is then assigned to a new column in DATA called mean.P.

# Naive baseline: for each point, average P over the rows listed in that
# point's row of nn.idx and store the result in DATA$mean.P.
# seq_len() is used instead of 1:nrow() so an empty table yields zero
# iterations rather than iterating over c(1, 0).
for (index in seq_len(nrow(DATA))) {
  DATA$mean.P[index] <- mean(DATA[nn.idx[index, ], P])
}

For illustrative purposes, the following code shows what I'm trying to calculate: for a given point (red dot), compute the average of P over all points inside the sphere around it (orange + red dots) and assign that average to the point (red dot). I want to repeat this for every point (gray dots), but in an efficient way that will scale to large datasets.

library(rgl)
rgl.open()
# The query point (row 1500 of DATA) in red.
rgl.points(DATA[1500, 1], DATA[1500, 2], DATA[1500, 3], color = "red")
# The rows listed in nn.idx for point 1500, in orange.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
rgl.points(DATA[nn.idx[1500, ], 1:3], color = "orange", add = TRUE)
# All points, faintly, for context.
rgl.points(DATA[, 1:3], color = "lightgray", alpha = 0.1, add = TRUE)

enter image description here

, ! , , ++ Rcpp, , , R, . !

+4
2

, 100- . , , , , . .

library(RANN)
library(data.table)

# Simulate 10,000 points in the unit cube, each carrying an attribute P drawn
# uniformly from [10, 30]. Columns are named at construction, which avoids the
# copying `colnames<-` replacement form on a data.table.
set.seed(1L) # for reproducible data
DATA <- data.table(
  x = runif(10000, 0, 1),
  y = runif(10000, 0, 1),
  z = runif(10000, 0, 1),
  P = runif(10000, 10, 30)
)

# For every point, look up at most k = 50 neighbours within a radius of 0.075.
# `nn.idx` is a matrix with one row per point whose entries are row indices
# into DATA. Per the RANN documentation, slots without a neighbour inside the
# radius are filled with 0, which R silently drops when used as a subscript.
nn.idx <- nn2(DATA[, 1:3], DATA[, 1:3],
  k = 50,
  treetype = "kd", searchtype = "radius",
  radius = 0.075
)$nn.idx

# (1)
# Timing for original loop: `$<-` / `[<-` on the data.table in every
# iteration. seq_len() replaces 1:nrow() so an empty table yields zero
# iterations.
system.time(for (index in seq_len(nrow(DATA))) {
  DATA$mean.P[index] <- mean(DATA[nn.idx[index, ], P])
})
#    user  system elapsed 
#   7.830   0.850   8.684 

# (2)
# Use `set()` instead of `$<-` and `[<-`: same per-row computation, but the
# result is written into column "mean_P_2" by reference.
system.time({
  for (index in seq_len(nrow(DATA))) {
    set(DATA, i = index, j = "mean_P_2", value = mean(DATA[nn.idx[index, ], P]))
  }
})
#    user  system elapsed 
#   3.405   0.008   3.417 

, 2- , set(), data.table, .

, data.table( data.table []). P , , data.frames data.tables.

# (3)
# Add row index (seq_len() is safe even for a zero-row table).
DATA[, row_idx := seq_len(nrow(DATA))]

# Isolate P values in a vector, because vector access is cheaper
# than data.table or data.frame access.
P_vec <- DATA$P

system.time({
  # Create a list column where each element is the integer index vector taken
  # from the matching row of nn.idx.
  DATA[, nn_idx := lapply(row_idx, function(i) nn.idx[i, ])]
  # Use `:=` and `by=` to internalize the loop within `[.data.table`.
  # Each group is a single row, so nn_idx[[1]] is that row's index vector.
  DATA[, mean_P_3 := mean(P_vec[nn_idx[[1]]]), by = row_idx]
})
#    user  system elapsed 
#   0.092   0.002   0.095 

# All results are identical.
all.equal(DATA$mean.P, DATA$mean_P_2)
# [1] TRUE
all.equal(DATA$mean.P, DATA$mean_P_3)
# [1] TRUE

100- .

, 1 :

# Try with 1 million data points.
# Columns are named at construction, which avoids the copying `colnames<-`
# replacement form on a data.table.
set.seed(1L) # for reproducible data
DATA2 <- data.table(
  x = runif(1e6, 0, 1),
  y = runif(1e6, 0, 1),
  z = runif(1e6, 0, 1),
  P = runif(1e6, 10, 30)
)

# Neighbour search: up to k = 50 neighbours per point within radius 0.075.
system.time({
  nn.idx2 <- nn2(DATA2[, 1:3], DATA2[, 1:3],
    k = 50,
    treetype = "kd", searchtype = "radius",
    radius = 0.075
  )$nn.idx
})
#    user  system elapsed 
# 346.603   1.883 349.708 


# Row index used as the grouping key (seq_len() is safe for zero rows).
DATA2[, row_idx := seq_len(nrow(DATA2))]
# Plain vector of P values for cheap subsetting inside the grouped call.
P_vec <- DATA2$P

system.time({
  # List column: each element is the matching row of nn.idx2.
  DATA2[, nn_idx := lapply(row_idx, function(i) nn.idx2[i, ])]
  # One group per row, so nn_idx[[1]] is that row's index vector.
  DATA2[, mean_P := mean(P_vec[nn_idx[[1]]]), by = row_idx]
})
#    user  system elapsed 
#  15.685   0.587  16.297 

macbook pro 2011 (Sandy Bridge 2.2Ghz). 1,5 .

+2

, melt() , :

# Reshape the neighbour-index matrix to long format. `pt` is the row number of
# each point in nn.idx; after melt() there is one row per (pt, neighbour slot)
# and `value` holds the neighbour's row index.
long <- melt(as.data.table(nn.idx)[, pt := .I], measure.vars = patterns("V"))
# `DATA[, pt := .I]` adds a row-number column `pt` to DATA by reference.
# The join matches long$value (neighbour index) to DATA$pt, bringing in each
# neighbour's P; P is then averaged per `pt` and the means (column V1) are
# extracted in `pt` order.
tmp <- long[DATA[, pt := .I], on = .(value = pt)][, mean(P), by = .(pt)][order(pt), V1]
# Store the means as mean.P, drop the helper column `pt`; the trailing `[]`
# makes the updated table print.
DATA[, mean.P := tmp][, pt := NULL][]

nn.idx pt, . .

tmp - . , DATA long, ( value) , DATA .

- DATA.

2

:

# Reshape the neighbour-index matrix to long format. `pt` is the row number of
# each point in nn.idx; after melt() there is one row per (pt, neighbour slot)
# and `value` holds the neighbour's row index.
long <- melt(as.data.table(nn.idx)[, pt := .I], measure.vars = patterns("V"))
    # `DATA[, pt := .I]` adds a row-number column `pt` to DATA by reference.
    # Match long$value to DATA$pt to pick up each neighbour's P, average P per
    # `pt`, then join the per-point means back to DATA on `pt`.
    long[DATA[, pt := .I], on = .(value = pt)][, mean(P), by = .(pt)][DATA, on = "pt"]
0

Source: https://habr.com/ru/post/1686276/


All Articles