I'm confused about why I'm getting a bad input shape error. The line that fails is
test_datapoint_encoded[i] = int(label_encoder[count].transform(test_datapoint[i]))
I've read that the transform function apparently doesn't work with lists, but I used a different example and it works without errors. The shapes of test_datapoint
and test_datapoint_encoded
are the same. I have also tried using NumPy arrays, but I got the same error.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, mean_absolute_error
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesRegressor

# Load input data: one comma-separated record per line.
input_file = 'traffic_data.txt'
data = []
with open(input_file, 'r') as f:
    for line in f.readlines():
        items = line[:-1].split(',')  # drop the trailing newline, split fields
        data.append(items)
data = np.array(data)

# Convert string columns to numerical values. Purely numeric columns are
# copied as-is; each categorical column gets its own LabelEncoder (stored in
# order of appearance) so single values can be encoded the same way later.
label_encoder = []
X_encoded = np.empty(data.shape)
for i, item in enumerate(data[0]):
    if item.isdigit():
        X_encoded[:, i] = data[:, i]
    else:
        label_encoder.append(preprocessing.LabelEncoder())
        X_encoded[:, i] = label_encoder[-1].fit_transform(data[:, i])

# Everything but the last column is the feature matrix ...
X = X_encoded[:, :-1].astype(int)
# ... and the last column is the target.
y = X_encoded[:, -1].astype(int)

# Split into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5)

# Create and fit an extremely randomized trees regressor.
params = {'n_estimators': 100, 'max_depth': 4, 'random_state': 0}
ext_regressor = ExtraTreesRegressor(**params)
ext_regressor.fit(X_train, y_train)

# Compute the regressor performance on test data.
y_pred = ext_regressor.predict(X_test)
print("Mean absolute error:", round(mean_absolute_error(y_test, y_pred), 2))

# Test the encoding on a single data instance.
test_datapoint = ['Saturday', '10:20', 'Atlanta', 'no']
test_datapoint_encoded = [0, 0, 0, 0]
count = 0  # index of the next LabelEncoder to use (categorical columns only)
for i, item in enumerate(test_datapoint):
    if item.isdigit():
        test_datapoint_encoded[i] = int(test_datapoint[i])
    else:
        # BUG FIX: LabelEncoder.transform expects an array-like of samples,
        # not a bare scalar — passing the string directly raises a bad input
        # shape error. Wrap the value in a list and take element [0].
        test_datapoint_encoded[i] = int(
            label_encoder[count].transform([test_datapoint[i]])[0])
        count = count + 1
test_datapoint_encoded = np.array(test_datapoint_encoded)

# Predict the output for the test datapoint (predict wants a 2-D input:
# one row = one sample).
print("Predicted traffic:",
      int(ext_regressor.predict([test_datapoint_encoded])[0]))
I used a simpler LabelEncoder example earlier and it ran without any errors.
import numpy as np
from sklearn import preprocessing

# Sample labels to train the encoder on.
input_labels = ['red', 'black', 'red', 'green', 'black', 'yellow', 'white']

# Build a label encoder and fit it so each colour is paired with an integer.
encoder = preprocessing.LabelEncoder()
encoder.fit(input_labels)

# Display the learned word -> number mapping.
print("\nLabel mapping:")
for index, label in enumerate(encoder.classes_):
    print(label, '--->', index)

# Encode a batch of test labels and show the resulting integers.
test_labels = ['green', 'red', 'black']
encoded_values = encoder.transform(test_labels)
print("\nLabels =", test_labels)
print("Encoded values =", list(encoded_values))

# Decode a batch of integers back into colour names.
encoded_test_values = [3, 0, 4, 1]
decoded_list = encoder.inverse_transform(encoded_test_values)
print("\nEncoded values =", encoded_test_values)
print("Decoded labels =", list(decoded_list))