I am trying to execute a KNN algorithm from scratch, but I am getting a really strange error saying "KeyError: 0"
I assume this implying I have an empty dictionary somewhere, but I don't understand how that can be. I might just add for the sake of clarity that the data works fine in the black box KNN algorithm, so it definitely has to be something in the code...
This is my code:
import numpy as np
import pandas as pd
import csv
import scipy.stats as stats
import math
from collections import Counter
import operator
from operator import itemgetter
"""Training features dataset"""
filenametrain_data = 'training_data.csv'
training_feature_set = pd.read_csv(filenametrain_data, header=None, usecols=range(1,13627))
"""Training labels dataset"""
filenametrain_label = 'training_labels.csv'
training_feature_label = pd.read_csv(filenametrain_label, header=None, usecols=[1], names=['Category'])
"""Split into training and testing datasets 90%/10%"""
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(training_feature_set, training_feature_label, test_size = 0.1, random_state=42)
"""KNN Model"""
def distance(X_train, y_train):
dist = 0.0
for i in range(len(X_train)):
dist += pow((X_train[i] - y_train[i]), 2)
return math.sqrt(dist)
def getNeighbors(X_train, y_train, X_test, k):
distances = []
for i in range(len(X_train)):
dist = distance(X_test, X_train[i])
distances.append((X_train[i], dist, y_train[i]))
distances.sort(key=operator.itemgetter(1))
neighbor = []
for elem in range(k):
neighbor.append((distances[elem][0], distances[elem][2]))
return neighbor
def getResponse(neighbors):
classVotes = {}
for x in range(len(neighbors)):
response = int(neighbors[x][-1])
if response in classVotes:
classVotes[response] += 1
else:
classVotes[response] = 1
sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse = True)
return sortedVotes[0][0]
"""Prediction"""
predictions = []
k = 4
for x in range(len(X_test)):
neighbors = getNeighbors(X_train, y_train, y_test[x], k)
result = getResponse(neighbors)
predictions.append(result)
The error returned is:
Traceback (most recent call last):
File "", line 2, in neighbors = getNeighbors(X_train, y_train, y_test[x], k)
File "C:\ANACONDA\lib\site-packages\pandas\core\frame.py", line 1797, in getitem return self._getitem_column(key)
File "C:\ANACONDA\lib\site-packages\pandas\core\frame.py", line 1804, in _getitem_column return self._get_item_cache(key)
File "C:\ANACONDA\lib\site-packages\pandas\core\generic.py", line 1084, in _get_item_cache values = self._data.get(item)
File "C:\ANACONDA\lib\site-packages\pandas\core\internals.py", line 2851, in get loc = self.items.get_loc(item)
File "C:\ANACONDA\lib\site-packages\pandas\core\index.py", line 1572, in get_loc return self._engine.get_loc(_values_from_object(key))
File "pandas\index.pyx", line 134, in pandas.index.IndexEngine.get_loc (pandas\index.c:3824)
File "pandas\index.pyx", line 154, in pandas.index.IndexEngine.get_loc (pandas\index.c:3704)
File "pandas\hashtable.pyx", line 686, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12280)
File "pandas\hashtable.pyx", line 694, in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12231)
KeyError: 0
The datasets can be accessed here