I am writing a function that calculates distances between training and testing instances. The distance is a modified Manhattan distance. I have working code, but it becomes too slow as the number of features (columns) increases. Any idea how I could speed it up?
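Concretely, reading it off the inner loop of custom_distance below, the distance between a test vector x and a training row y is

d(x, y) = sum_j min( |x_j - y_j| / (5 * std_j), 1.0 )

where std_j is the standard deviation of feature j over the training set.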
import pandas as pd
import numpy as np
import operator
import time
import datetime
def make_random_distance():
    """Generates randomly populated pandas DataFrames for a training dataset and a
    test dataset, calculates the distances between them, and reports the 5 nearest
    neighbors for each test instance."""
    # Training dataset
    df = pd.DataFrame(np.random.randint(0, 50, size=(10000, 1024)))
    print(df.shape)
    # Test dataset
    test = pd.DataFrame(np.random.randint(0, 50, size=(1, 1024)))
    Calculated_Distances = []
    # For each test instance
    for ind, roll in test.iterrows():
        print("Working on test instance {}".format(ind))
        Test_inst = np.array(roll.values)  # Features of the test instance
        Dist = custom_distance(Test_inst, df)
        print("Done calculating distances")
        print("Now sorting dictionary")
        sorted_d = sorted(Dist.items(), key=operator.itemgetter(1))
        # Now we examine the 5NN
        for j in range(5):
            index_com = sorted_d[j][0]
            calc_dist = sorted_d[j][1]
            Calculated_Distances.append([ind, index_com, calc_dist])
    # Writes out the results
    Calc_Dist = pd.DataFrame(Calculated_Distances, columns=['Test_indx', 'Training_indx', 'Distance'])
    # Calc_Dist.to_csv("/home/Code/testing_distances.csv", sep=',', index=False)
    print(Calc_Dist)
    return
def custom_distance(i, df):
    """
    :param i: test instance vector
    :param df: training instances pandas DataFrame
    :return: dict mapping training index -> modified Manhattan distance to i
    """
    # First calculate the standard deviation of each column (feature)
    count_ind = 0
    stad_dev = {}
    for column in df:
        stad_dev[count_ind] = df[column].std()
        count_ind += 1
    Dist = {}
    for index, row in df.iterrows():
        temp_dist = 0
        for j in range(len(row)):
            # Scaled absolute difference per feature, each term capped at 1.0
            dist = float(abs(row[j] - i[j]) / (5 * stad_dev[j]))
            temp_dist += min(dist, 1.0)
        Dist[index] = round(temp_dist, 3)
    return Dist
if __name__ == "__main__":
    T1 = time.time()
    make_random_distance()
    T2 = time.time()
    t = T2 - T1
    print("Took {} seconds".format(t))
    print("Took {}".format(str(datetime.timedelta(seconds=t))))
On my machine, the current code calculates the distances of a single test instance against 10000 training instances with 1024 features/columns and retrieves the 5 nearest neighbors:
Took 128.5559959411621 seconds
Took 0:02:08.555996
Any idea how I could speed this up? I will need to run thousands of these calculations on my test sets.
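My current thought is that the nested Python loops in custom_distance are the bottleneck, and that the same distance could be computed with NumPy broadcasting instead. Here is a rough, untested sketch of what I mean (custom_distance_vec is just a name I made up, and I have not verified that it gives identical results to the loop version):

import numpy as np

def custom_distance_vec(i, df):
    """Sketch: same modified Manhattan distance, vectorized with NumPy."""
    X = df.values.astype(float)       # (n_rows, n_features) training matrix
    stad_dev = X.std(axis=0, ddof=1)  # per-column std; ddof=1 matches pandas .std()
    # |row - test| / (5 * std) per feature, each term capped at 1.0, summed per row
    terms = np.minimum(np.abs(X - i) / (5 * stad_dev), 1.0)
    return terms.sum(axis=1).round(3)  # one distance per training row

# The 5 nearest neighbors could then come from np.argpartition instead of
# sorting a dictionary (these are row positions, which match df's default index):
# dists = custom_distance_vec(Test_inst, df)
# nearest5 = np.argpartition(dists, 5)[:5]
# nearest5 = nearest5[np.argsort(dists[nearest5])]  # ordered by distance

Would something along these lines be the right direction, or is there a better approach?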