I'm having trouble using TensorFlow to solve a classification problem.
I have a dataset with 259,514 rows, classified into the following groups:
Groups:
[0,1) - 171,646 rows
[1,55) - 17,262 rows
[55,105) - 17,320 rows
[105,170) - 17,392 rows
[170,285) - 18,092 rows
[285, Inf] - 17,802 rows
I tried to balance the data by keeping only about 18,000 rows for each group.
The data were then split into 70,869 training rows and 34,907 test rows.
The network was trained for 20 epochs and reached an accuracy of 0.54831. When I generated the confusion matrix on the test set (true labels vs. predictions) I obtained the following result:
Label order: ["[0,1)", "[1,55)", "[55,105)", "[105,170)", "[170,285)", "[285, Inf]"] (rows = true labels, columns = predictions)
array([[5019,  215,   78,  182,  189,  136],
       [ 224, 3260,  807,  522,  331,  546],
       [ 258, 1076, 2236,  926,  504,  744],
       [ 273,  492,  592, 2534,  965,  997],
       [ 238,  375,  394,  819, 2555, 1549],
       [ 134,  249,  347,  418,  821, 3902]])
It seems OK to me.
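For reference, per-class recall and overall accuracy can be read straight off that matrix (diagonal over row sums, and trace over total). A minimal sketch using the array above:

import numpy as np

cm = np.array([[5019,  215,   78,  182,  189,  136],
               [ 224, 3260,  807,  522,  331,  546],
               [ 258, 1076, 2236,  926,  504,  744],
               [ 273,  492,  592, 2534,  965,  997],
               [ 238,  375,  394,  819, 2555, 1549],
               [ 134,  249,  347,  418,  821, 3902]])

# recall per class = correctly predicted rows (diagonal) / true rows of that class (row sum)
recall_per_class = cm.diagonal() / cm.sum(axis=1)
# overall accuracy = correctly predicted rows (trace) / all test rows
overall_accuracy = cm.trace() / cm.sum()
print(recall_per_class, overall_accuracy)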
But when I finally try to predict on my real data, almost all rows end up in a single group:
                  ix
Y_pred
[1,55)             3
[105,170)          3
[170,285)          8
[285, Inf]    788286
I've already tried:
Training the network with the entire dataset
Not normalizing the data
Normalizing the data with StandardScaler, MinMaxScaler and RobustScaler (see the sketch after this list)
Training the model for 500 epochs
Adding or removing layers and neurons from the network
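For reference, the scaler swap above was just a change of the scaler object; a minimal sketch, assuming the rest of the pipeline stays exactly as in the code below:

from sklearn import preprocessing

# any of the three scalers mentioned above, fitted on the numeric training columns
scaler = preprocessing.StandardScaler()   # or preprocessing.MinMaxScaler() / preprocessing.RobustScaler()
x_scaled = scaler.fit_transform(x)        # x = numeric columns of the training dataset
# the same fitted object is then reused with scaler.transform(...) on the data I want to predict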
I really do not know what might be causing this. Can anyone help me?
from google.colab import drive
drive.mount('/content/gdrive')
# Libraries
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
import numpy as np
import pandas as pd
import os  # needed below for os.path.dirname on the checkpoint path
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
# Balance: downsample each group to at most 18,000 rows
dataset = dataset.groupby('vl_baixa_group', group_keys=False).apply(lambda x: x.sample(min(len(x), 18000)))
dataset.groupby(['vl_baixa_group']).count()
##Normalize
from sklearn import preprocessing
x = dataset.select_dtypes(include='int64')
StandardScaler = preprocessing.StandardScaler()
x_scaled = StandardScaler.fit_transform(x)
x_new = pd.DataFrame(x_scaled, columns=x.columns, index=x.index)
dataset_norm = pd.concat([x_new,dataset.select_dtypes(include='category')], join="inner", axis=1)
# Dummies: one-hot encode categorical columns (except the target vl_baixa_group)
dataset_norm = pd.get_dummies(dataset_norm, columns= (dataset_norm.select_dtypes(include='category')).columns.difference(['vl_baixa_group',]))
#DataFrame Numpy
dataframe = dataset_norm.to_numpy()
#Y
Y = dataframe[:,dataset_norm.columns.get_loc("vl_baixa_group")]
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
Y = np_utils.to_categorical(encoded_Y)
# X: the 45 feature columns (index 28 is skipped)
X = dataframe[:, [i for i in range(46) if i != 28]]
#Z
Z = (dataset_norm.index).to_numpy()
# seed for reproducing same results
seed = 9
np.random.seed(seed)
# split the data into training (67%) and testing (33%)
(X_train, X_test, Y_train, Y_test, Z_train, Z_test) = train_test_split(X, Y, Z, test_size=0.33, random_state=seed)
#Z_test.shape
unique_elements, counts_elements = np.unique(encoder.inverse_transform(np.argmax( Y_test ,axis=1)) , return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))
# Network
from keras.callbacks import *
import tensorflow as tf
from tensorflow import keras
# Checkpoint
checkpoint_path = "/content/gdrive/My Drive/Colab Notebooks/Models/2019_05_07_keras_tensor_deacts/training_cp/cp_epochs2:{epoch:03d}-val_acc:{val_acc:.3f}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
# Create checkpoint callback
cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 save_best_only=True,
                                                 monitor='val_acc',
                                                 mode='max',
                                                 verbose=1)
# create the model
model = Sequential()
model.add(Dense(64, input_dim=45, kernel_initializer='random_normal', activation='relu'))
model.add(Dense(32, kernel_initializer='random_normal', activation='relu'))
model.add(Dense(6, kernel_initializer='random_normal', activation='softmax'))
# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
history = model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=20, batch_size=5, verbose=1, callbacks=[cp_callback])
# evaluate the model
scores = model.evaluate(X_test, Y_test)
print("Accuracy: %.2f%%" % (scores[1]*100))
# Save the model
model.save('/content/gdrive/My Drive/Colab Notebooks/Models/2019_05_07_keras_tensor_deacts/model.h5')
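# Note: the saved .h5 can be loaded back later with keras.models.load_model, e.g.
#   model = keras.models.load_model('/content/gdrive/My Drive/Colab Notebooks/Models/2019_05_07_keras_tensor_deacts/model.h5')
# (just a side note; the reload is not part of the run shown here)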
#Confusion Matrix
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(encoder.inverse_transform(np.argmax(Y_test, axis=1)),
                      encoder.inverse_transform(np.argmax(y_pred, axis=1)),
                      labels=["[0,1)", "[1,55)", "[55,105)", "[105,170)", "[170,285)", "[285, Inf]"])
cm
#------ Predict real Data
#Normalize
from sklearn import preprocessing
x_pred = dataset_pred.select_dtypes(include='int64')
x_scaled_pred = StandardScaler.transform(x_pred)
x_new_pred = pd.DataFrame(x_scaled_pred, columns=x_pred.columns, index=x_pred.index)
dataset_norm_pred = pd.concat([x_new_pred,dataset_pred.select_dtypes(include='category')], join="inner", axis=1)
dataset_norm_pred.head()
# Dummies: one-hot encode categorical columns (except the target vl_baixa_group)
dataset_norm_pred = pd.get_dummies(dataset_norm_pred, columns= (dataset_norm_pred.select_dtypes(include='category')).columns.difference(['vl_baixa_group',]))
dataset_norm_pred.describe()
dataset_norm.describe()
# DataFrame Numpy
dataframe_pred = dataset_norm_pred.to_numpy()
# X: the 45 feature columns (indices 0 to 44)
X_Pred = dataframe_pred[:, list(range(45))]
# Z
Z_Pred = (dataset_norm_pred.index).to_numpy()
pred = model.predict(X_Pred)
pred
pred_t = np.column_stack((Z_Pred,
                          encoder.inverse_transform(np.argmax(pred, axis=1))))
other_pred = pd.DataFrame(data=pred_t[1:, 0:],   # values (note: this skips the first row)
                          index=pred_t[1:, 0],   # 1st column as the index
                          columns=['ix', 'Y_pred'])
#other_pred.head()
other_pred.groupby(['Y_pred']).count()