I have the following data:
# Preview the first six rows of the input data.
head(df.num1, n = 6L)
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
Then I run kmeans as shown below:
# Fix the RNG seed so the randomly chosen starting centers are reproducible.
set.seed(111)
# Partition the rows of df.num1 into k = 3 clusters.
km_out <- kmeans(x = df.num1, centers = 3)
Next, we compute the distance between each observation and the center of its assigned cluster, and then flag the five largest distances as outliers (the choice of five is arbitrary).
# Expand the centers so row i holds the center of the cluster row i belongs to.
centers <- km_out$centers[km_out$cluster, ]
# Euclidean distance from each observation to its own cluster center.
distances <- sqrt(rowSums((df.num1 - centers)^2))
# Indices of the (up to) five most distant observations; the outer parentheses
# print the result. head() avoids the NA indices `[1:5]` yields with < 5 rows.
(outliers <- head(order(distances, decreasing = TRUE), 5L))
This gives you a data frame with the distance and the cluster assignment added:
# Attach each row's distance to its cluster center and its cluster label.
df.num1[["distance"]] <- distances
df.num1[["cluster"]] <- km_out[["cluster"]]
Print the outlier rows (those with the five largest distances):
# Display the full rows of df.num1 at the outlier indices.
df.num1[outliers, ]
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
#
But these are just the data points with the greatest distance from their cluster centers...
, , - -, , , z ( > 2sd, outlier) , ()........
, :

:


, / , ...