
I am trying to implement GMM clustering for both a 24-dimensional and a 32-dimensional feature vector, where the initial parameters are assigned by the k-means algorithm (k-means clustering provides the cluster centers, i.e. MU, only). I am following this link, where it is implemented only for a 2D feature vector with predefined mu and sigma.

If anyone has code for GMM clustering, kindly post it.

A predefined GMM implementation also exists in sklearn, but it does not give me the likelihood for each iteration. sklearn GMM
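
One possible workaround (a sketch only, not tested on my data): sklearn's GaussianMixture accepts warm_start=True, so it can be advanced one EM step per fit() call while recording lower_bound_ (the per-sample average log-likelihood) after each step. Here X is assumed to be an (n, 24) NumPy array:

from sklearn.mixture import GaussianMixture
import numpy as np

X = np.random.rand(500, 24)  # placeholder for the real 24-D feature matrix

# warm_start=True makes every fit() call resume from the previous
# parameters, so max_iter=1 performs exactly one EM step per call.
gmm = GaussianMixture(n_components=8, init_params='kmeans',
                      warm_start=True, max_iter=1)

LLhoods = []
for _ in range(150):
    gmm.fit(X)                        # one EM step (emits a ConvergenceWarning)
    LLhoods.append(gmm.lower_bound_)  # average log-likelihood per sample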

Debashis Sahoo
  • I am working on this; I will post my own answer soon. – Debashis Sahoo Oct 25 '18 at 09:40
  • Hi, I was wondering if you could have a look at a similar problem in my [post](https://stackoverflow.com/questions/63414169/how-can-implement-em-gmm-in-python). Thanks in advance. – Mario Aug 14 '20 at 13:57

1 Answer

import numpy as np
from sklearn.metrics import pairwise_distances_argmin


def kmeans(dataSet, k, c):
    # 1. Randomly choose k initial cluster centers (c seeds the RNG)
    rng = np.random.RandomState(c)
    p = rng.permutation(dataSet.shape[0])[:k]
    centers = dataSet[p]

    while True:
        # Assign each point to its nearest center
        labels = pairwise_distances_argmin(dataSet, centers)
        # Recompute each center as the mean of its assigned points
        new_centers = np.array([dataSet[labels == i].mean(0) for i in range(k)])
        if np.all(centers == new_centers):
            break
        centers = new_centers

    # Per-cluster data, mixture weights, and covariance matrices
    cluster_data = [dataSet[labels == i] for i in range(k)]
    l = []
    covs = []
    for i in range(k):
        l.append(len(cluster_data[i]) * 1.0 / len(dataSet))
        covs.append(np.cov(np.array(cluster_data[i]).T))
    return centers, l, covs, cluster_data


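A quick usage sketch for the k-means initializer above (the data here is made up purely for illustration):

X = np.random.rand(500, 24)      # hypothetical 24-D feature matrix
centers, weights, covs, cluster_data = kmeans(X, k=8, c=0)
print(centers.shape)             # (8, 24)
print(round(sum(weights), 6))    # 1.0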


class gaussian_Mix_Model:

    def __init__(self, k=8, eps=0.0000001):
        self.k = k      ## number of clusters
        self.eps = eps  ## threshold to stop `epsilon`

    def fit_EM(self, X, max_iters=1000):

        # n = number of data points, d = dimension of data points
        n, d = X.shape

        # Initialize the means and covariances from k-means
        # (the third argument seeds the k-means RNG)
        mu, _, Cov, _ = kmeans(X, self.k, 0)

        # Initialize the weights uniformly
        w = [1. / self.k] * self.k

        # R[i, k] = responsibility of cluster k for data point i
        R = np.zeros((n, self.k))

        ### Log-likelihood after each iteration
        LLhoods = []

        # Multivariate normal density N(x | mu, s), evaluated row-wise on X
        P = lambda mu, s: np.linalg.det(s) ** -.5 * (2 * np.pi) ** (-d / 2.) \
            * np.exp(-.5 * np.einsum('ij, ij -> i',
                     X - mu, np.dot(np.linalg.inv(s), (X - mu).T).T))

        # Iterate until convergence, or at most max_iters iterations
        while len(LLhoods) < max_iters:

            # Expectation step:
            # membership weights for each of the k clusters
            for k in range(self.k):
                R[:, k] = w[k] * P(mu[k], Cov[k])

            # Find the log-likelihood of the data under the current parameters
            LLhood = np.sum(np.log(np.sum(R, axis=1)))

            # Now store the log-likelihood in the list
            LLhoods.append(LLhood)

            # Normalize responsibilities; N_ks = effective number of
            # data points assigned to each cluster
            R = (R.T / np.sum(R, axis=1)).T
            N_ks = np.sum(R, axis=0)

            # Maximization step: calculate the new parameters
            for k in range(self.k):

                # Calculate the new mean
                mu[k] = 1. / N_ks[k] * np.sum(R[:, k] * X.T, axis=1).T
                x_mu = np.matrix(X - mu[k])

                # Calculate the new covariance
                Cov[k] = np.array(1. / N_ks[k] * np.dot(np.multiply(x_mu.T, R[:, k]), x_mu))

                # Calculate the new mixing weight pi_k
                w[k] = 1. / n * N_ks[k]

            # Check for convergence (needs at least two iterations)
            if len(LLhoods) > 1 and np.abs(LLhood - LLhoods[-2]) < self.eps:
                break

        from collections import namedtuple
        Params = namedtuple('Params', ['mu', 'Cov', 'w', 'LLhoods', 'num_iters'])
        self.params = Params(mu=mu, Cov=Cov, w=w, LLhoods=LLhoods,
                             num_iters=len(LLhoods))

        return self.params

import matplotlib.pyplot as plt

# Call the GMM to fit the model
gmm = gaussian_Mix_Model(3, 0.000001)
params = gmm.fit_EM(X, max_iters=150)

# Plotting of log-likelihood vs. iterations for this run; repeat with each
# class's data to produce the remaining per-class figures.
plt.plot(params.LLhoods)
plt.savefig('Dataset_2A_GMM_Class_1_K_16.png')
plt.clf()
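
As a sanity check on the density expression used in P above (my addition, not part of the original answer), the same formula can be compared against scipy.stats.multivariate_normal, which should agree to floating-point precision:

import numpy as np
from scipy.stats import multivariate_normal

d = 24                      # hypothetical dimensionality
X = np.random.rand(500, d)  # hypothetical data
mu = X.mean(axis=0)
s = np.cov(X.T)

# Hand-rolled density, same expression as the lambda P above
manual = np.linalg.det(s) ** -.5 * (2 * np.pi) ** (-d / 2.) \
    * np.exp(-.5 * np.einsum('ij, ij -> i',
             X - mu, np.dot(np.linalg.inv(s), (X - mu).T).T))

# scipy reference
ref = multivariate_normal.pdf(X, mean=mu, cov=s)
print(np.allclose(manual, ref))  # expect True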
Debashis Sahoo