-1

I am asking this question for a homework on vector quantization.

I have implemented a rather classic algorithm to detect the center of a cluster of points. However, the input data contains several clusters (the number of clusters and the total number of inputs are known), and I need to find the center of each cluster without knowing which points belong to which cluster. So if I manage to initialize each future center point inside or near its cluster (closer to it than any other initialized center), my algorithm can iterate and converge to the correct center.

However, I don't know how to initialize properly. I am initializing randomly and checking whether two centers are too close to each other and whether a center is too far from every input point, but this method isn't easy to parametrize: it either takes too much computing time or doesn't find the right centers.

My idea is simple: initialize randomly and check whether the point is inside a cluster. Does anyone know how I can do that? I cannot construct a polygon since I don't know the boundaries of the cluster. I would prefer an implementation in C, but I'd welcome just the ideas as well!

Edit: An example of the input data:

Example data: the red points are what I should get as a result.

My Code:

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <float.h>

// Number of full passes over the input samples made by the training loop.
#define TRAINING_CYCLE 10000
// Fraction of the sample-to-centroid distance that a centroid moves on each update.
#define LEARNING_RATE  0.001
// NOTE(review): not referenced by any active code in this file.
#define CENTROID_DISTANCE_SCALE 0.7  //used for setting a minimal distance between centroids
// Multiplier of the data-extent diagonal meant to bound the distance between a
// centroid and its nearest input sample (the threshold passed to isPointNear).
#define CENTROID_POINT_SCALE 0.1
// Fraction of the average cluster size (dataSize/nbOfClusters) that
// isCountNearPoints requires to lie close to each centroid.
#define CLUSTER_SIZE_PERCENTAGE 0.3

//User:      REMOVED
//Password:  REMOVED

// Hand-rolled boolean type (false == 0, true == 1) used instead of <stdbool.h>.
typedef enum { false, true } bool;

// A 2-D point; used both for input samples and for cluster centers.
typedef struct point{
    double x; // horizontal coordinate
    double y; // vertical coordinate
} point;

// Number of clusters (and therefore of centroids in use); set by main() from the input.
int nbOfClusters = 0;

// Input samples kept as parallel arrays: sample i is the point (in1[i], in2[i]).
double in1[1000] = {0};
double in2[1000] = {0};

// Centroid storage; only the first nbOfClusters entries are used.
point centers[100] = {{0, 0}};

// Number of valid samples currently stored in in1/in2.
int dataSize = 0;

// Extremes of the input coordinates, filled in by main():
// maxX1/maxY1 are the upper extremes, maxX2/maxY2 the lower extremes.
double maxX1 = 0;
double maxY1 = 0;
double maxX2 = 0;
double maxY2 = 0;

// Error toleration of each axis.
double deltaX = 0;
double deltaY = 0;

// Returns the absolute value of n: n itself when n >= 0, otherwise n negated.
double getAbs(double n){
    return (n>=0) ? n : (-1)*n;
}

// Returns the index in centers[] of the centroid closest to p1.
// Squared Euclidean distances are compared, which ranks the centroids the same
// way as true distances without needing sqrt. On a tie the lowest index wins.
// Returns 0 when nbOfClusters is not positive.
int findNearestCentroid(point p1){
    int bestIndex=0;
    double bestSquared=DBL_MAX;
    for (int k=0; k<nbOfClusters; k++){
        double dx=p1.x-centers[k].x;
        double dy=p1.y-centers[k].y;
        double squared=dx*dx+dy*dy;
        if (squared<bestSquared){
            bestSquared=squared;
            bestIndex=k;
        }
    }
    return bestIndex;
}

// Returns the Euclidean distance between p1 and p2.
double getDistance(point p1, point p2){
    double dx=p1.x-p2.x;
    double dy=p1.y-p2.y;
    return sqrt(dx*dx+dy*dy);
}

// Reports whether any two distinct centroids among the first nbOfClusters
// entries of centers[] are closer to each other than minDistance.
// Returns true as soon as such a pair is found, false when every pair is at
// least minDistance apart.
bool isCentroidsNear(double minDistance){
    for (int a=0; a<nbOfClusters; a++){
        for (int b=0; b<nbOfClusters; b++){
            if (a == b){
                continue; //a centroid is never compared with itself
            }
            if (getDistance(centers[a],centers[b])<minDistance){
                return true; //this pair is too close
            }
        }
    }
    return false;
}
point findNearestInput(int centroid){ //returns the location in the table of the nearest centroid to the argument point
    double distance=DBL_MAX;
    point returnPoint;
    int nearest=0;
    for (int i=0; i<nbOfClusters; i++){
        double distance_temp = (in1[i]-centers[centroid].x)*(in1[i]-centers[centroid].x)+(in2[i]-centers[centroid].y)*(in2[i]-centers[centroid].y);
        if ( distance_temp < distance){
            distance=distance_temp;
            nearest=i;
        }
    }
    returnPoint.x=in1[nearest];
    returnPoint.y=in2[nearest];
    return returnPoint;
}

bool isPointNear(double minDistance){
    for(int i=0; i<nbOfClusters; i++){
        double distance=getDistance(findNearestInput(i),centers[i]); //the distance to the nearest point
        if(distance>minDistance){
            return true;
        }
    }
    return false;
}

// Returns true when at least one centroid has too few input samples within
// `distance` of it, and false when every centroid has enough.
// "Enough" is CLUSTER_SIZE_PERCENTAGE of the average cluster size
// (dataSize/nbOfClusters, integer division), truncated to an int.
// The neighbour count is restarted for every centroid; it used to accumulate
// across centroids, so later centroids passed the check trivially.
bool isCountNearPoints(double distance){
    if (nbOfClusters<=0){
        return false; //no centroids to check (also avoids dividing by zero below)
    }
    //this is the number of points that each centroid should be near to
    int minNearPoints = (int)(dataSize/nbOfClusters*CLUSTER_SIZE_PERCENTAGE);
    for(int i=0;i<nbOfClusters;i++){
        int counter=0; //samples close to centroid i only
        for(int j=0; j<dataSize; j++){
            point p;
            p.x=in1[j];
            p.y=in2[j];
            if (getDistance(p,centers[i])<distance){
                counter++;
            }
        }
        if (counter<minNearPoints){
            return true;
        }
    }
    return false;
}

// Fills indices[0..nbOfClusters-1] with distinct random sample indices in
// [0, dataSize). The caller must guarantee nbOfClusters <= dataSize, otherwise
// that many distinct indices do not exist and the loop could not finish.
static void pickDistinctSamples(int indices[]){
    for (int i=0; i<nbOfClusters; i++){
        bool isSame;
        do {
            //dividing by RAND_MAX+1.0 keeps the result strictly below dataSize
            indices[i]=(int)((double)rand()/((double)RAND_MAX+1.0)*dataSize);
            isSame=false;
            for (int j=0; j<i; j++){
                if (indices[j]==indices[i]){
                    isSame=true; //already used, draw again
                    break;
                }
            }
        } while (isSame);
    }
}

// Places the centroids on randomly chosen, distinct input samples and repeats
// the draw while two centroids are closer than minDistance (i.e. probably in
// the same cluster). The number of draws is capped so that data for which no
// selection satisfies minDistance cannot hang the program; in that case the
// last draw is kept and a warning is printed to stderr.
static void chooseInitialCenters(double minDistance){
    enum { MAX_ATTEMPTS = 10000 };
    int randomInputs[50];
    int attempts=0;
    bool tooClose;
    do {
        pickDistinctSamples(randomInputs);
        for (int i=0; i<nbOfClusters; i++){
            centers[i].x=in1[randomInputs[i]];
            centers[i].y=in2[randomInputs[i]];
        }
        tooClose=isCentroidsNear(minDistance);
        attempts++;
    } while (tooClose && attempts<MAX_ATTEMPTS);
    if (tooClose){
        fprintf(stderr,"warning: could not separate the initial centroids by %f; continuing with the last selection\n",minDistance);
    }
}

// Runs TRAINING_CYCLE passes over the samples. Each sample pulls its nearest
// centroid towards itself by LEARNING_RATE times their separation.
// The move is applied per component, which is the same displacement as moving
// LEARNING_RATE*distance along the connecting line, but needs no slope: the
// slope form divides by zero when the sample shares the centroid's x and
// evaluates 0/0 when the sample coincides with the centroid, turning the
// centroid into NaN.
static void trainCenters(void){
    for (int cycle=0; cycle<TRAINING_CYCLE; cycle++){
        for (int i=0; i<dataSize; i++){
            point p1;
            p1.x=in1[i];
            p1.y=in2[i];
            int nearPt=findNearestCentroid(p1);
            centers[nearPt].x += LEARNING_RATE*(p1.x-centers[nearPt].x);
            centers[nearPt].y += LEARNING_RATE*(p1.y-centers[nearPt].y);
        }
    }
}

// Reads the cluster count (one digit, 1-9) followed by "x,y" samples from
// stdin, initialises one centroid per cluster on the data, trains the
// centroids and prints them as "x,y" lines.
// Returns 0 on success and 1 when the input is unusable.
int main()
{
    //seed the generator so that different runs try different initial centroids
    srand((unsigned)time(NULL));

    char digit;
    if (scanf("%c",&digit) != 1 || digit<'1' || digit>'9'){
        fprintf(stderr,"error: the first character must be the number of clusters (1-9)\n");
        return 1;
    }
    nbOfClusters=digit-'0';

    //read samples until the input ends, a line does not parse, or the arrays are full
    const int capacity=(int)(sizeof in1/sizeof in1[0]);
    while (dataSize<capacity && scanf("%lf,%lf", &in1[dataSize], &in2[dataSize]) == 2){
        dataSize++;
    }
    if (dataSize<nbOfClusters){
        fprintf(stderr,"error: %d samples are not enough for %d clusters\n",dataSize,nbOfClusters);
        return 1;
    }

    //bounding box of the data: maxX1/maxY1 are the largest, maxX2/maxY2 the smallest coordinates
    maxX1=maxX2=in1[0];
    maxY1=maxY2=in2[0];
    for (int i=1; i<dataSize; i++){
        if (in1[i]>maxX1){
            maxX1=in1[i];
        }
        if (in1[i]<maxX2){
            maxX2=in1[i];
        }
        if (in2[i]>maxY1){
            maxY1=in2[i];
        }
        if (in2[i]<maxY2){
            maxY2=in2[i];
        }
    }

    //minimal centroid separation: the bounding-box diagonal shared between the clusters
    //(floating-point division; 1/nbOfClusters in integers would be 0)
    double diagonal=sqrt((maxX1-maxX2)*(maxX1-maxX2)+(maxY1-maxY2)*(maxY1-maxY2));
    double minDistance=diagonal/nbOfClusters;

    chooseInitialCenters(minDistance);
    trainCenters();

    //printing the results
    for (int i=0; i<nbOfClusters; i++){
        printf("%lf,%lf\n",centers[i].x,centers[i].y);
    }

    return 0;
}
Has QUIT--Anony-Mousse
  • 76,138
  • 12
  • 138
  • 194
Ege Korkan
  • 134
  • 1
  • 10

1 Answers1

0

The usual approach is to choose points from the existing data, rather than uniform random.

Since in your data model, every point belongs to a cluster, choosing existing points solves your (vague) problem, doesn't it?

Has QUIT--Anony-Mousse
  • 76,138
  • 12
  • 138
  • 194