I am trying to run K-Means in Python on aggregated data files. For example, instead of a data frame in which 3 records are represented by 3 rows, a single row represents all 3, with a column such as cnt (arbitrarily named) holding the number 3 to indicate how many instances that row stands for.
Below is some basic starter code that does NOT use the aggregated representation of the rows. Please let me know if you would like me to post the .csv too, but it should be pretty basic:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
# Load the input CSV (the path is relative to the current working directory).
data = pd.read_csv('../Data/wholesale_data.csv')
data.head()  # preview of the raw frame; only displayed in an interactive session

# Column groups: the categorical ones are one-hot encoded below.
# continuous_features is not referenced in the visible code; it is kept in
# case later code relies on it.
categorical_features = ['Channel', 'Region']
continuous_features = ['Fresh', 'Milk', 'Grocery', 'Frozen',
                       'Detergents_Paper', 'Delicassen']

# One-hot encode every categorical column in a single call. The original
# columns are dropped and the indicator columns (named "<column>_<value>")
# are appended after the remaining columns, in the order listed above.
data = pd.get_dummies(data, columns=categorical_features)
data.head()  # preview of the encoded frame; only displayed interactively
# Min-max scale every column of `data`; the fitted scaler stays available as
# `mms`. With the scaler's default settings each column is mapped onto [0, 1].
mms = MinMaxScaler()
data_transformed = mms.fit_transform(data)
# Elbow method: fit K-Means once per candidate k and record each model's
# inertia (within-cluster sum of squared distances).
sum_of_squared_distances = []
K = range(1, 15)  # candidate cluster counts 1..14
for k in K:
    # random_state pins the centroid initialisation so reruns draw the same
    # curve; n_init is spelled out because its default differs between
    # scikit-learn versions.
    km = KMeans(n_clusters=k, n_init=10, random_state=0)
    km.fit(data_transformed)
    sum_of_squared_distances.append(km.inertia_)

# Plot inertia against k; the "elbow" in the curve suggests a suitable k.
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()