so i have this code :
class distKmeans(beam.DoFn):
#i will do an init function to add the kmeans parameters
def __init__(self, n_clusters,rseed=2):
self.n_clusters = n_clusters
self.rseed = rseed
self.centers = None
#The function "process" implements the main functionality of the K-means algorithm
def process(self,element):
if self.centers is None:
rng = np.random.RandomState(self.rseed)
#we use len instead of shape because element is a PCOLLECTION
i = rng.permutation(element.shape[0])[:self.n_clusters]
self.centers = element[i]
# b1. Calculate the closest center μ to xi
labels = pairwise_distances_argmin(element, self.centers)
# b2. Update the center
new_centers = np.array([element[labels == i].mean(0)
for i in range(self.n_clusters)])
# c.
if np.all(self.centers == new_centers):
return
self.centers = new_centers
yield self.centers, labels
with beam.Pipeline() as pipeline:
mydata = pipeline | beam.Create(X)
mydata = mydata |beam.ParDo(distKmeans(3))
mydata |"write" >> beam.io.WriteToText("sample_data/output.txt")
as i'm trying to create a distributed kmeans with apache beam, my data was generated using this code :
n_samples=200
n_features=2
X, y = make_blobs(n_samples=n_samples,centers=3, n_features=n_features)
data = np.c_[X,y]
plt.scatter(data[:, 0], data[:, 1], s=50);
and then X is :
X = data[['X1','X2']].to_numpy()
X = X[1:]
it shape is (200, 2 )
The code seems correct but i always get the fellowing error even tho my data is a 2d array:
Expected 2D array, got 1D array instead:
array=[-6.03120913 11.30181549].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample. [while running '[54]: ParDo(distKmeans)']
and this error comes in this line :
labels = pairwise_distances_argmin(element, self.centers)