I am asking this question for a homework on vector quantization.
I have implemented a rather classic algorithm to detect the center of a cluster of points. However in the input data there are several clusters (number of clusters and total inputs are known) and I need to find the center of each cluster but I don't know which points make a cluster. So if I manage to initialize my future center points inside or somewhere near the cluster (closer than any other initialized centers), my algorithm can iterate and go to the correct center.
However, I don't know how to initialize the centers properly. I am currently initializing them randomly and rejecting the draw if two centers are too close to each other or if a center is too far from every input point, but this method isn't easy to parametrize: it either takes too much computing time or does not find the right centers.
My idea is simple: initialize randomly and check whether the point is inside a cluster. Does anyone know how I can do that? I cannot construct a polygon since I don't know the limits of the cluster. I would prefer an implementation in C, but I will take just the ideas as well!
Edit: An example of the input data:
My Code:
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <float.h>
/* Tunable parameters for centroid initialization and training. */
#define TRAINING_CYCLE 10000 //number of full passes over the inputs made by the training loop
#define LEARNING_RATE 0.001 //fraction of the point-to-centroid distance the centroid moves per update
#define CENTROID_DISTANCE_SCALE 0.7 //scale on the data diagonal meant for setting a minimal distance between centroids
#define CENTROID_POINT_SCALE 0.1 //scale on the data diagonal for the centroid-to-nearest-input distance threshold
#define CLUSTER_SIZE_PERCENTAGE 0.3 //fraction of dataSize/nbOfClusters that isCountNearPoints requires near each centroid
//User: REMOVED
//Password: REMOVED
/*
 * Boolean type used by the predicate helpers below.
 * The standard header is used instead of a hand-rolled enum: `bool`, `true`
 * and `false` are keywords in C23, so redefining them does not compile there.
 */
#include <stdbool.h>
/* A position in the 2-D plane; used for both input samples and centroids. */
struct point {
    double x; /* first coordinate */
    double y; /* second coordinate */
};
typedef struct point point;
/* Number of clusters (and therefore of centroids actually used in centers[]). */
int nbOfClusters;

/* Input samples: in1[i] holds the x value and in2[i] the y value of sample i. */
double in1[1000];
double in2[1000];

/* Centroid storage; only the first nbOfClusters entries are used. */
point centers[100];

/* Number of valid entries in in1[] / in2[]. */
int dataSize = 0;

/* Extents of the input data, filled in by main (all start at zero). */
double maxX1 = 0;
double maxY1 = 0;
double maxX2 = 0;
double maxY2 = 0;

/* Error tolerance along each axis. */
double deltaX = 0;
double deltaY = 0;
/* Returns the absolute value of n: n itself when n >= 0, otherwise n negated. */
double getAbs(double n){
    return (n >= 0) ? n : (-1) * n;
}
/*
 * Returns the index in centers[] of the centroid closest to p1.
 *
 * Squared Euclidean distances are compared, so no square root is taken.
 * On ties the lowest index wins; 0 is returned when no centroid beats DBL_MAX
 * (including when nbOfClusters is not positive).
 */
int findNearestCentroid(point p1){
    int bestIndex = 0;
    double bestSquared = DBL_MAX;
    int k = 0;

    while (k < nbOfClusters) {
        const double squared = (p1.x - centers[k].x) * (p1.x - centers[k].x)
                             + (p1.y - centers[k].y) * (p1.y - centers[k].y);
        if (squared < bestSquared) {
            bestSquared = squared;
            bestIndex = k;
        }
        k++;
    }
    return bestIndex;
}
/* Returns the Euclidean distance between p1 and p2. */
double getDistance(point p1, point p2){
    const double squaredX = (p1.x - p2.x) * (p1.x - p2.x);
    const double squaredY = (p1.y - p2.y) * (p1.y - p2.y);
    return sqrt(squaredX + squaredY);
}
/*
 * Reports whether any two distinct centroids are closer than minDistance.
 *
 * Returns true as soon as one such pair is found, false when every pair of
 * the first nbOfClusters centroids is at least minDistance apart.
 */
bool isCentroidsNear(double minDistance){
    for (int first = 0; first < nbOfClusters; first++) {
        for (int second = 0; second < nbOfClusters; second++) {
            if (first == second) {
                continue; /* a centroid is never compared with itself */
            }
            if (getDistance(centers[first], centers[second]) < minDistance) {
                return true;
            }
        }
    }
    return false;
}
point findNearestInput(int centroid){ //returns the location in the table of the nearest centroid to the argument point
double distance=DBL_MAX;
point returnPoint;
int nearest=0;
for (int i=0; i<nbOfClusters; i++){
double distance_temp = (in1[i]-centers[centroid].x)*(in1[i]-centers[centroid].x)+(in2[i]-centers[centroid].y)*(in2[i]-centers[centroid].y);
if ( distance_temp < distance){
distance=distance_temp;
nearest=i;
}
}
returnPoint.x=in1[nearest];
returnPoint.y=in2[nearest];
return returnPoint;
}
bool isPointNear(double minDistance){
for(int i=0; i<nbOfClusters; i++){
double distance=getDistance(findNearestInput(i),centers[i]); //the distance to the nearest point
if(distance>minDistance){
return true;
}
}
return false;
}
/*
 * Reports whether some centroid has too few input samples around it.
 *
 * For each centroid, counts the samples strictly closer than `distance` and
 * compares that count with dataSize/nbOfClusters*CLUSTER_SIZE_PERCENTAGE.
 * Returns true as soon as one centroid falls below the threshold, false
 * otherwise (including when nbOfClusters is not positive).
 *
 * The count is restarted for every centroid; previously it accumulated across
 * centroids, so later centroids were credited with earlier centroids' points.
 */
bool isCountNearPoints(double distance){
    if (nbOfClusters <= 0) {
        return false; /* nothing to check, and avoids dividing by zero below */
    }

    /* this is the number of points that each centroid should be near to */
    const int minNearPoints = dataSize/nbOfClusters*CLUSTER_SIZE_PERCENTAGE;

    for (int i = 0; i < nbOfClusters; i++) {
        int counter = 0;
        for (int j = 0; j < dataSize; j++) {
            point p;
            p.x = in1[j];
            p.y = in2[j];
            if (getDistance(p, centers[i]) < distance) {
                counter++;
            }
        }
        if (counter < minNearPoints) {
            return true;
        }
    }
    return false;
}
/*
 * Reads the cluster count and the samples from standard input, picks initial
 * centroids among the samples, refines them with an online nearest-centroid
 * update and prints one "x,y" line per centroid.
 *
 * Input format: a single digit '1'..'9' (number of clusters) followed by
 * "x,y" pairs. Reading stops at end of input, at the first malformed pair, or
 * when the sample arrays are full.
 *
 * Returns 0 on success, 1 when the cluster count is invalid or there are
 * fewer samples than clusters.
 */
int main(void)
{
    enum {
        MAX_INPUTS = sizeof in1 / sizeof in1[0],
        MAX_INIT_ATTEMPTS = 10000 /* cap on re-draws of the initial centroids */
    };

    char countChar;
    if (scanf("%c", &countChar) != 1 || countChar < '1' || countChar > '9') {
        fprintf(stderr, "expected a cluster count between 1 and 9\n");
        return 1;
    }
    nbOfClusters = countChar - '0';

    /* Require both fields to convert: comparing against EOF alone would spin
       forever on a malformed line because scanf then returns 0. */
    dataSize = 0;
    while (dataSize < MAX_INPUTS
           && scanf("%lf,%lf", &in1[dataSize], &in2[dataSize]) == 2) {
        dataSize++;
    }
    if (dataSize < nbOfClusters) {
        /* distinct sample indices could never be drawn for every centroid */
        fprintf(stderr, "need at least %d input points, got %d\n", nbOfClusters, dataSize);
        return 1;
    }

    /* Extents of the data: largest positive and most negative value per axis
       (0 when there is none), used to scale the separation threshold. */
    maxX1 = maxY1 = maxX2 = maxY2 = 0;
    for (int i = 0; i < dataSize; i++) {
        if (in1[i] > maxX1) {
            maxX1 = in1[i];
        }
        if (in2[i] > maxY1) {
            maxY1 = in2[i];
        }
        if (in1[i] < maxX2) {
            maxX2 = in1[i];
        }
        if (in2[i] < maxY2) {
            maxY2 = in2[i];
        }
    }
    const double diagonal = sqrt((maxX1-maxX2)*(maxX1-maxX2)+(maxY1-maxY2)*(maxY1-maxY2));
    /* floating-point division: 1/nbOfClusters in int arithmetic is 0 for
       every count above 1, which silently disabled the separation check */
    const double minDistance = diagonal / nbOfClusters;

    /* Initialization: use nbOfClusters distinct input samples as centroids and
       re-draw while two of them are closer than minDistance (i.e. probably in
       the same cluster). The attempt cap keeps the program from hanging when
       no draw can satisfy the threshold; the last draw is then used as is. */
    int randomInputs[50];
    int attempts = 0;
    do {
        bool isSame;
        do {
            for (int i = 0; i < nbOfClusters; i++) {
                /* dividing by RAND_MAX + 1 keeps the index strictly below dataSize */
                randomInputs[i] = (int)((double)rand() / ((double)RAND_MAX + 1.0) * dataSize);
            }
            isSame = false;
            for (int i = 0; i < nbOfClusters - 1 && !isSame; i++) {
                for (int j = i + 1; j < nbOfClusters; j++) {
                    if (randomInputs[i] == randomInputs[j]) {
                        isSame = true;
                        break;
                    }
                }
            }
        } while (isSame);

        for (int i = 0; i < nbOfClusters; i++) {
            centers[i].x = in1[randomInputs[i]];
            centers[i].y = in2[randomInputs[i]];
        }
        attempts++;
    } while (attempts < MAX_INIT_ATTEMPTS && isCentroidsNear(minDistance));

    /* Learning: move the nearest centroid a fraction LEARNING_RATE of the way
       towards each sample. Updating the components directly is equivalent to
       the slope-based step but has no division, so it stays finite when the
       centroid shares x with the sample or sits exactly on it. */
    for (int cycle = 0; cycle < TRAINING_CYCLE; cycle++) {
        for (int i = 0; i < dataSize; i++) {
            point p1;
            p1.x = in1[i];
            p1.y = in2[i];
            const int nearPt = findNearestCentroid(p1);
            centers[nearPt].x += LEARNING_RATE * (p1.x - centers[nearPt].x);
            centers[nearPt].y += LEARNING_RATE * (p1.y - centers[nearPt].y);
        }
    }

    /* printing the results */
    for (int i = 0; i < nbOfClusters; i++) {
        printf("%lf,%lf\n", centers[i].x, centers[i].y);
    }
    return 0;
}