Calculation of distances on large vectors [performance]

Hi guys I have the following problem: I have cluster centers in some dimmentions (4-6 clusters) and a very large dataset that I need to assign to each row to the nearest cluster. So this is not a matter of distance, but of performance, my code is as follows:

distances <- matrix(NA, nrow = nrow(ClusterCenters), ncol = nrow(data))
calcData <- data[, colnames(ClusterCenters), drop=FALSE]
for(i in 1:nrow(ClusterCenters)) {
    distances[i,] <- (rowSums((matrix(unlist(apply(calcData, 1, function(x) {x - ClusterCenters[i,]})), ncol = ncol(calcData), byrow = TRUE))^2))^0.5
}
ClusterMemberships <- vector(mode="numeric", length=nrow(calcData))
for(i in 1: nrow(calcData)) {
  ClusterMemberships[i] <- which.min(distances[,i])
}
return(ClusterMemberships)

Is there any way to speed it up? I am working on a windows server.

+2
source share
2 answers

For a matrix of data rows of 50 x 1 million in size, matched with six clusters with 50 values ​​each, I get the result after about 3 seconds:

vals <- 50
clusts <- 6
clusters <- matrix(runif(vals * clusts), nrow=clusts)

data.count <- 1e6  # large number
data <- matrix(runif(data.count * vals), nrow=data.count)

system.time({
  dists <- apply(clusters, 1, function(x) rowSums((data - x) ^ 2) ^ .5)
  min.dist <- max.col(-dists, ties.method="first")
})
# user  system elapsed 
# 2.96    0.47    3.49 

, R, . , apply ( ) , . , ( data , , , ).

@user20650 max.col.

+2

R, BrodieG.

, , , .

[Michael McCool .., - ].

. KNN.

1.

system.time (1e6), , , , 95% . , .

# Original code
vals <- 50
clusts <- 6
ClusterCenters <- matrix(runif(vals * clusts), nrow=clusts)

data.count <- 1e6  # large number
calcData <- matrix(runif(data.count * vals), nrow=data.count)

system.time({
   for(i in 1:nrow(ClusterCenters)) {
     dists[i,] <- (rowSums((matrix(unlist(apply(calcData, 1, function(x) {x      ClusterCenters[i,]})), ncol = ncol(calcData), byrow = TRUE))^2))^0.5
   }
})
user  system elapsed 
71.62    1.13   73.13  

system.time({
   for(i in 1: nrow(calcData)) {
     ClusterMemberships[i] <- which.min(dists[,i])
   }
})
user  system elapsed 
5.29    0.00    5.31 

2.

- R, , @BrodieG. BTW, , , 3-5X, .

#Vectorization:  From BrodieG
dists1 <-matrix(NA, nrow = nrow(ClusterCenters), ncol = nrow(calcData))  system.time({
  dists1 <- apply(ClusterCenters, 1, function(x) rowSums(sweep(calcData, 2,x, '-') ^ 2) ^ .5)
  min.dist.vec <- max.col(-dists1, ties.method="first")
})
user  system elapsed 
16.13    1.42   17.61  
all.equal(ClusterMemberships, min.dist.vec)
[1] TRUE

3.

, , , (calcData [i,] - ClusterCenters [j,]) ^ 2.

, , , :

calcData [i,] ^ 2 - 2 * calcData [i,] * ClusterCenters [j,] + ClusterCenters [J,] ^ 2

, ,

calcData * calcData​​p >

,

ClusterCenters% *% t (calcData)

, , :

# Pattern Representation 1: Matrix 
dists2 <-matrix(NA, nrow = nrow(ClusterCenters), ncol = nrow(calcData))
system.time({
  data2     <- rowSums(calcData*calcData)
  clusters2 <- rowSums(ClusterCenters*ClusterCenters)
  # NVIDIA GPU: nvBLAS can speedup this step
  # Futher Info on ParallelR.com
  ClustersXdata <- calcData %*% t(ClusterCenters)
  # compute distance 
  dists2  <- sweep(data2 - 2 * ClustersXdata, 2, clusters2, '+') ^0.5
  min.dist.matrix <- max.col(-dists2, ties.method="first")
})
user  system elapsed 
1.17    0.09    1.28
all.equal(ClusterMemberships, min.dist.matrix)
[1] TRUE

, . 10 ^ 3 10 ^ 7, 50X , 16 calcData. Compare three algorithm

4.

1- , KNN k = 1. knn , C. , . , :

# Pattern Representation 2: KNN 
library("class")
system.time(
  min.dist.knn <- knn(ClusterCenters, calcData, cl = 1:nrow(ClusterCenters), k = 1)
)
user  system elapsed 
1.21    0.12    1.35 

all.equal(ClusterMemberships, as.integer(min.dist.knn))
[1] TRUE

KNN 1e6, , , , 2X , KNN (15.9.vs. 29.1).

,, , , , c/++ . KNN NVIDIA, ParallelR

+2

Source: https://habr.com/ru/post/1620034/


All Articles