1

I want to add a kind of "spheres" to my data cluster.

This is my current cluster plot, which does not have any "spheres".

enter image description here

And this is my code

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import style
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

style.use('ggplot')

MY_FILE = 'total_watt.csv'

# Load the readings with the first column parsed as a DatetimeIndex, then
# total them per calendar day.  The `how=` keyword of resample() has been
# removed from pandas; chaining the aggregation is the supported spelling.
# min_count=1 keeps days without any reading as NaN so that dropna() discards
# them instead of clustering them as zero-consumption days.
df = pd.read_csv(MY_FILE, parse_dates=[0], index_col=[0])
df = df.resample('1D').sum(min_count=1)
df = df.dropna()

# Encode each day ('YYYY-MM-DD') as an integer so it can serve as the x
# coordinate; the encoder is kept to map tick positions back to date strings.
date = [x.strftime('%Y-%m-%d') for x in df.index]

encoder = LabelEncoder()
date_numeric = encoder.fit_transform(date)
consumption = df[df.columns[0]].values

# One row per day: column 0 is the encoded date, column 1 the daily total.
X = np.array([date_numeric, consumption]).T

kmeans = KMeans(n_clusters=3)
kmeans.fit(X)

centroids = kmeans.cluster_centers_
labels = kmeans.labels_

print(centroids)
print(labels)

# inverse_transform expects a 1-D array, so decode every date in one call
# rather than passing a scalar per point inside the loop.
date_labels = encoder.inverse_transform(date_numeric)
for day, value, label in zip(date_labels, X[:, 1], labels):
    print("coordinate:", day, value, "label:", label)

fig, ax = plt.subplots(figsize=(10, 8))
fig.patch.set_facecolor('#2D2B2B')

# One plot call per cluster (instead of one per point) with the same
# colour/marker format strings as before.
colors = ["b.", "r.", "g."]
for cluster, fmt in enumerate(colors):
    members = X[labels == cluster]
    ax.plot(members[:, 0], members[:, 1], fmt, markersize=10)

# Centroids were previously drawn twice (once via `ax`, once via `plt`);
# a single black cross on top of the points is what remained visible.
ax.scatter(centroids[:, 0], centroids[:, 1], marker="x", s=100, c="black",
           linewidths=5, zorder=10)

# Label every fifth day on the x axis with its date string.
a = np.arange(0, len(X), 5)
ax.set_xticks(a)
ax.set_xticklabels(encoder.inverse_transform(a))
ax.tick_params(axis='x', colors='lightseagreen')
ax.tick_params(axis='y', colors='lightseagreen')

ax.set_title('Energy consumptions Clusters (high/medium/low)', color='gold')
# x carries the encoded dates and y the daily consumption, so label them
# accordingly (the original had the date label on the y axis).
ax.set_xlabel('date(year 2011)', color='gold')
ax.set_ylabel('consumption', color='gold')

plt.show()

By "spheres" I mean the shaded areas that surround each cluster of points, as in this picture.

enter image description here

I tried to google it.

But when I searched for "matplotlib spheres", I could not find any relevant results.

Jianxun Li
  • 24,004
  • 10
  • 58
  • 76
Suzuki Soma
  • 519
  • 1
  • 8
  • 16

1 Answers1

1

The sample graph in your post looks like the result of a Gaussian Mixture Model (GMM), where each sphere is a 2-D Gaussian density.

I'll write up a sample code shortly to demonstrate how to use GMM on your dataset and do this kind of plotting.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import style
from matplotlib.patches import Ellipse
# code changes here
# ===========================================
# sklearn.mixture.GMM has been removed from scikit-learn; GaussianMixture is
# its replacement.
from sklearn.mixture import GaussianMixture
# ===========================================
from sklearn.preprocessing import LabelEncoder

style.use('ggplot')

# replace it with your file path
MY_FILE = '/home/Jian/Downloads/total_watt.csv'

# Daily totals.  The `how=` keyword of resample() has been removed from
# pandas; min_count=1 keeps days without readings as NaN so dropna() removes
# them instead of treating them as zero consumption.
df = pd.read_csv(MY_FILE, parse_dates=[0], index_col=[0])
df = df.resample('1D').sum(min_count=1)
df = df.dropna()

date = [x.strftime('%Y-%m-%d') for x in df.index]

encoder = LabelEncoder()
date_numeric = encoder.fit_transform(date)
consumption = df[df.columns[0]].values

# One row per day: column 0 is the encoded date, column 1 the daily total.
X = np.array([date_numeric, consumption]).T


# code changes here
# ===========================================
# covariance_type='diag' matches the default of the old GMM class and gives
# one variance per axis, which is exactly what an axis-aligned ellipse needs.
gmm = GaussianMixture(n_components=3, covariance_type='diag', random_state=0)
gmm.fit(X)
y_pred = gmm.predict(X)

# the center is given by mean
print(gmm.means_)

# ===========================================

fig, ax = plt.subplots(figsize=(10, 8))

for i, color in enumerate('rgb'):
    # sphere background: with a diagonal covariance, covariances_[i] holds the
    # per-axis variances.  +/-1.96 standard deviations is the two-sided 95%
    # interval of a Gaussian; the factor 2 turns that half-width into the
    # full width/height the Ellipse patch expects.
    width, height = 2 * 1.96 * np.sqrt(gmm.covariances_[i])
    ell = Ellipse(xy=gmm.means_[i], width=width, height=height, color=color)
    ell.set_alpha(0.1)
    ax.add_artist(ell)
    # data points
    X_data = X[y_pred == i]
    ax.scatter(X_data[:, 0], X_data[:, 1], color=color)
    # center
    ax.scatter(gmm.means_[i][0], gmm.means_[i][1], marker='x', s=100, c=color)


ax.set_title('Energy consumptions Clusters (high/medium/low)', color='gold')
# x carries the encoded dates and y the daily consumption, so label them
# accordingly (the original had the date label on the y axis).
ax.set_xlabel('date(year 2011)', color='gold')
ax.set_ylabel('consumption', color='gold')
# Label every fifth day on the x axis with its date string.
a = np.arange(0, len(X), 5)
ax.set_xticks(a)
ax.set_xticklabels(encoder.inverse_transform(a))
ax.tick_params(axis='x', colors='lightseagreen')
ax.tick_params(axis='y', colors='lightseagreen')

# Needed to display the figure when this is run as a script rather than in
# an interactive/notebook session.
plt.show()

enter image description here

Jianxun Li
  • 24,004
  • 10
  • 58
  • 76
  • @SuzukiSoma Just updated my post. Please have a look. :-) – Jianxun Li Jul 10 '15 at 14:03
  • @SuzukiSoma You are most welcome. Glad that it helped. If you wish to spend some time on learning `sci-kit`, you can start with its online user guide http://scikit-learn.org/stable/user_guide.html Many useful concepts and example codes are available there. – Jianxun Li Jul 10 '15 at 14:38
  • By the way, why did you set 2 * 1.96? – Suzuki Soma Jul 10 '15 at 15:04
  • @SuzukiSoma Both are fine. The goal is to draw a two-sided confidence interval: ±1.96 standard deviations (roughly 2) covers 95% of a Gaussian distribution. Multiplying by a further 2 makes it two-sided, i.e. turns that half-width into the full width of the ellipse. – Jianxun Li Jul 10 '15 at 15:06
  • You could do that also with `Kmeans`, calculating the standard deviations in `x` and `y` directions between points in every cluster and their centre of mass, then plotting corresponding ellipses. – rth Jul 10 '15 at 15:29