I would like to cluster a dataset into 2 parts which are fraud and non-fraud. To do that I used DBSCAN however I received following error. "labels_true and labels_pred must have same size, got 7200 and 28789 "
I would very pleased if you could help me. Lines in the below are written for csv reading.
import pandas as pd
import datetime
df=pd.read_csv('C:\\Users\\canberk.cinar\\Desktop\\banksim2.csv')
labels=df.fraud.values
labels=labels.reshape(-1,)
print(labels)
df.drop(['fraud'],axis=1,inplace=True)
type(df)
df.head()
# Import the scaler
from sklearn.preprocessing import MinMaxScaler
# Transform df into a numpy array
X = np.array(df).astype(np.float)
np.all(np.isfinite(X))
np.any(np.isnan(X))
print(np.any(np.isinf(X)))
# Define the scaler and apply to the data
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
# Import DBSCAN
from sklearn.cluster import DBSCAN
from sklearn.metrics.cluster import homogeneity_score
from sklearn.metrics.cluster import silhouette_score
print(np.any(np.isinf(X_scaled)))
print(np.any(np.isnan(X_scaled)))
print(type(X_scaled))
data = X_scaled[np.logical_not(np.isnan(X_scaled))]
print(np.any(np.isnan(data)))
data=data.reshape(-1,1)
# Initialize and fit the DBscan model
db = DBSCAN(eps=0.9, min_samples=1, n_jobs=-1).fit(data)
print(len(X_scaled))
print(len(labels))
print(len(pred_labels))
print(labels.shape)
print(pred_labels.shape)
# Obtain the predicted labels and calculate number of clusters
pred_labels = db.labels_
print(db.labels_)
n_clusters = len(set(pred_labels)) - (1 if -1 in labels else 0)
# Print performance metrics for DBscan
print('Estimated number of clusters: %d' % n_clusters)
print("Homogeneity: %0.3f" % homogeneity_score(labels, pred_labels))
print("Silhouette Coefficient: %0.3f" % silhouette_score(data, pred_labels))
Then as I said, I got the following error ;
labels_true and labels_pred must have same size, got 7200 and 28789