I'm playing around with scikit-learn's DBSCAN, and I don't understand why execution time decreases as I increase the number of features (see plot below). I would have expected it to grow with the number of features...
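As a first sanity check (a minimal sketch, separate from the benchmark below, assuming the same uniform random data), this counts how many points fall within eps = 0.5 of each point as the number of features grows:

import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

# For uniform random data in [0, 1]^d, count the average number of
# neighbours within a fixed eps as the dimensionality d grows.
eps = 0.5
for num_features in [2, 4, 8, 10]:
    X = np.random.rand(1000, num_features)
    D = euclidean_distances(X, X)
    # average count of *other* points within eps (subtract 1 for the
    # self-distance of 0 on the diagonal)
    avg_neighbors = (D < eps).sum(axis=1).mean() - 1
    print(num_features, avg_neighbors)

Typical pairwise distances between uniform points in [0, 1]^d grow like sqrt(d/6), so a fixed eps = 0.5 should capture fewer and fewer neighbours as d increases; if that's what this prints, DBSCAN's region queries return fewer points per query in higher dimensions.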
import timeit
import functools
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import DBSCAN

features = [2, 4, 8, 10]
training_examples = [100, 500, 1000, 2000]
n_iterations = 10
x = np.asarray(training_examples)

for num_features in features:
    average_execution_time = []
    for num_training_examples in training_examples:
        # generate a matrix of random training examples
        X = np.random.rand(num_training_examples, num_features)
        # generate a symmetric distance matrix
        D = euclidean_distances(X, X)
        # DBSCAN parameters
        eps = 0.5
        min_samples = 5
        kmedian_thresh = 0.005  # unused here
        db = DBSCAN(eps=eps,
                    min_samples=min_samples,
                    metric='precomputed')
        # time db.fit over n_iterations runs and record the average
        t = timeit.Timer(functools.partial(db.fit, D))
        average_execution_time.append(t.timeit(n_iterations) / n_iterations)
    y = np.asarray(average_execution_time)
    plt.plot(x, y, label='{} features'.format(num_features))

plt.xlabel('No. of Training Examples')
plt.ylabel('DBSCAN.fit() avg time to cluster (s)')
plt.title('DBSCAN.fit() avg time to cluster')
plt.legend()
plt.grid()
plt.show()
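For comparison (again just a sketch, not part of the plot above), timing fit() on the raw data with metric='euclidean' puts the neighbour search itself inside the timed region, so any cost of computing distances in higher dimensions is included in the measurement:

import timeit
import functools
import numpy as np
from sklearn.cluster import DBSCAN

# Same style of benchmark, but DBSCAN builds its own neighbour index
# from the raw points instead of using a precomputed distance matrix.
for num_features in [2, 4, 8, 10]:
    X = np.random.rand(1000, num_features)
    db = DBSCAN(eps=0.5, min_samples=5, metric='euclidean')
    t = timeit.Timer(functools.partial(db.fit, X))
    print(num_features, t.timeit(10) / 10)

If the times still fall as the feature count grows, the effect is in the clustering/expansion step rather than in the distance computation (which, with metric='precomputed', happens outside the timed call anyway).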