Criteria evaluation for cluster generation in R

Question

The code below finds the "optimal" number of clusters. In the example below, the result was 2 clusters.

Quickly explaining the code: first, the Ideal Point is calculated, which has the minimum breadth of coverage and the maximum waste production value. Then, the Final Solution is selected as the closest Sk solution to the Ideal Point, and k is selected as the best number of clusters. For my purpose this code works fine. However, I would like to know if there is any approach to evaluating the criteria that help to select the number of clusters for this example? In this case, the criteria are breadth of coverage and waste generation potential. Maybe use some multicriteria method? I'm open to suggestions.

library(rdist)
library(geosphere)
library(dplyr)


df1<-structure(c(3315.15739850453, 2589.99900391847, 8869.03953528711,7708.11156943467, 
                 7708.11156943467, 6015.73943344633, 6577.67722745424,5805.83051830159, 
                 4791.95901175901, 4791.95901175901, 4791.95901175901, 
                 4617.00232604443, 4430.08754078434, 4430.08754078434, 4430.08754078434, 
                 4483.18948278638, 4483.18948278638, 3302.09189638597, 
                 1635156.04305,474707.64025, 170773.40775, 64708.312, 64708.312, 
                 64708.312, 949.72635, 949.72635, 949.72635, 949.72635, 949.72635, 949.72635, 
                 949.72635, 949.72635, 949.72635, 949.72635, 949.72635, 949.72635), .Dim = c(18L, 2L
                 ), .Dimnames = list(NULL, c("Breadth of Coverage", "Waste")))


df2<-structure(c(14833.1911512297, 11518.0337527251, 10088.9627146591,8928.03474880667, 
                 8928.03474880667, 7235.66261281833, 7235.66261281833,6463.81590366569, 
                 5449.94439712311, 5449.94439712311, 5449.94439712311, 
                 5274.98771140853, 5088.07292614843, 5088.07292614843, 5088.07292614843, 
                 5088.07292614843,5088.07292614843,3906.975,3315.15739850453, 
                 2589.99900391847, 8869.03953528711, 7708.11156943467, 7708.11156943467, 
                 6015.73943344633, 6577.67722745424, 5805.83051830159, 4791.95901175901, 
                 4791.95901175901, 4791.95901175901, 4617.00232604443, 4430.08754078434, 
                 4430.08754078434, 4430.08754078434, 4483.18948278638,4483.189,4483.189,
                 1635156.04305, 474707.64025, 170773.40775,64708.312, 64708.312, 64708.312, 
                 949.72635, 949.72635, 949.72635,949.72635, 949.72635, 949.72635, 949.72635, 
                 949.72635, 949.72635,949.72635,949.7264,949.7264),
                .Dim = c(18L, 3L),.Dimnames = list(NULL, c("Coverage","Breadth of Coverage", "Waste")))

#Ideal Point is considered the minimum breadth of coverage and maximum production of Waste
IdealPoint<-as.matrix(t(c(min(df1[,1]),max(df1[,2]))))
distance_df1_Ideal<-as.matrix(dist(rbind(df1,IdealPoint)))

#calculating the distance of the cluster solutions to the ideal point
distance_cluster_ideal<-min(distance_df1_Ideal[as.matrix(dim(df1))[1,1]+1,1:as.matrix(dim(df1))[1,1]])
a<-which(distance_df1_Ideal[dim(df1)[1]+1,]==distance_cluster_ideal)
FinalSolution<-df1[a[1],]

f1 <- function(mat, vec, dim = 1, tol = 1e-7, fun = all)
  which(apply(mat, dim, function(x) fun(dist(x - vec) < tol)))

b<-as.matrix(f1(df2[,2:3],FinalSolution,fun=any))# optimal value of number of clusters

k<-b[1]+1 #number of clusters
> k
[1] 2

Is the question about the algorithms or metrics to decide on clusters, or the code to implement it? If the former, it should be moved to [stats.se] — camille, Feb 03 '22 at 18:59

score 0 · Answer 1 · answered Feb 05 '22 at 13:51

Maybe you can use the Silhouette Coefficient or also called Silhouette Index. The silhouette value is a measure of how similar an object is to its own cluster (cohesion) compared to other clusters (separation).

The Silhouette Index can be calculated with any clustering technique like kmeans, pam(kmedoids),hcut,clara,funny... and any distance metric, such as: Euclidean distance, Manhattan distance, Mahalanobis distance ...

Here I give you an example using your code and a comparison to the silhouette index for best k found:

library(rdist)
library(geosphere)
library(dplyr)


df1<-structure(c(3315.15739850453, 2589.99900391847, 8869.03953528711,7708.11156943467, 
                 7708.11156943467, 6015.73943344633, 6577.67722745424,5805.83051830159, 
                 4791.95901175901, 4791.95901175901, 4791.95901175901, 
                 4617.00232604443, 4430.08754078434, 4430.08754078434, 4430.08754078434, 
                 4483.18948278638, 4483.18948278638, 3302.09189638597, 
                 1635156.04305,474707.64025, 170773.40775, 64708.312, 64708.312, 
                 64708.312, 949.72635, 949.72635, 949.72635, 949.72635, 949.72635, 949.72635, 
                 949.72635, 949.72635, 949.72635, 949.72635, 949.72635, 949.72635), .Dim = c(18L, 2L
                 ), .Dimnames = list(NULL, c("Breadth of Coverage", "Waste")))


df2<-structure(c(14833.1911512297, 11518.0337527251, 10088.9627146591,8928.03474880667, 
                 8928.03474880667, 7235.66261281833, 7235.66261281833,6463.81590366569, 
                 5449.94439712311, 5449.94439712311, 5449.94439712311, 
                 5274.98771140853, 5088.07292614843, 5088.07292614843, 5088.07292614843, 
                 5088.07292614843,5088.07292614843,3906.975,3315.15739850453, 
                 2589.99900391847, 8869.03953528711, 7708.11156943467, 7708.11156943467, 
                 6015.73943344633, 6577.67722745424, 5805.83051830159, 4791.95901175901, 
                 4791.95901175901, 4791.95901175901, 4617.00232604443, 4430.08754078434, 
                 4430.08754078434, 4430.08754078434, 4483.18948278638,4483.189,4483.189,
                 1635156.04305, 474707.64025, 170773.40775,64708.312, 64708.312, 64708.312, 
                 949.72635, 949.72635, 949.72635,949.72635, 949.72635, 949.72635, 949.72635, 
                 949.72635, 949.72635,949.72635,949.7264,949.7264),
                .Dim = c(18L, 3L),.Dimnames = list(NULL, c("Coverage","Breadth of Coverage", "Waste")))

#Ideal Point is considered the minimum breadth of coverage and maximum production of Waste
IdealPoint<-as.matrix(t(c(min(df1[,1]),max(df1[,2]))))

distance_df1_Ideal<-as.matrix(dist(rbind(df1,IdealPoint)))

#calculating the distance of the cluster solutions to the ideal point
distance_cluster_ideal<-min(distance_df1_Ideal[as.matrix(dim(df1))[1,1]+1,1:as.matrix(dim(df1))[1,1]])

a<-which(distance_df1_Ideal[dim(df1)[1]+1,]==distance_cluster_ideal)

FinalSolution<-df1[a[1],]

f1 <- function(mat, vec, dim = 1, tol = 1e-7, fun = all)
  which(apply(mat, dim, function(x) fun(dist(x - vec) < tol)))

b<-as.matrix(f1(df2[,2:3],FinalSolution,fun=any))# optimal value of number of clusters

k<-b[1]+1 #number of clusters

k

## Silhouette analysis 

library(factoextra)

rf <- fviz_nbclust(df2, pam, method = "silhouette")

rf$data$clusters[which.max(rf$data$y)]

rf$data$clusters[which.max(rf$data$y)] == k

plot(rf)

rf <- fviz_nbclust(df1, kmeans, method = "silhouette")

rf$data$clusters[which.max(rf$data$y)]

rf$data$clusters[which.max(rf$data$y)] == k

plot(rf)

Here you have the output for the last part of the code:

> library(factoextra)
> 
> rf <- fviz_nbclust(df2, pam, method = "silhouette")
> 
> rf$data$clusters[which.max(rf$data$y)]
[1] 2
Levels: 1 2 3 4 5 6 7 8 9 10
> 
> rf$data$clusters[which.max(rf$data$y)] == k
[1] TRUE
> 
> plot(rf)
> 
> rf <- fviz_nbclust(df1, kmeans, method = "silhouette")
> 
> rf$data$clusters[which.max(rf$data$y)]
[1] 2
Levels: 1 2 3 4 5 6 7 8 9 10
> 
> rf$data$clusters[which.max(rf$data$y)] == k
[1] TRUE
> 
> plot(rf)
>

As you can see, k=2 seems to be the best fit !

Criteria evaluation for cluster generation in R

1 Answers1