I'm trying to create a pandas DataFrame from a list of School objects, each of which holds one row of information. The problem is that it takes hours to complete; I'm running this in a Jupyter notebook, and after about an hour it crashes. I have a list of School objects ordered by their distance to an origin point. The class looks like this:
class School:
    def __init__(self, distance, row):
        self.distance_to_origin = distance
        self.row = row
        self.name = row['name']
        self.lat = row['lat']
        self.lon = row['lon']

    def get_distance(self):
        return self.distance_to_origin

    def get_lat_lon(self):
        return [self.lat, self.lon]

    def get_name(self):
        return self.name

    def get_row(self):
        return self.row

    def __str__(self):
        return str(self.distance_to_origin)

    def __repr__(self):
        return str(self.distance_to_origin)
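For context, the list is built roughly along these lines. This is only a minimal sketch; the origin coordinates and schools_df below are made-up placeholders:

from geopy.distance import vincenty
import pandas as pd

# Hypothetical origin point and source data, just to illustrate the setup.
origin = (40.7128, -74.0060)
schools_df = pd.DataFrame({
    'name': ['Oakwood School', 'Oakwood Schools', 'Ridgeview School'],
    'lat':  [40.7100, 40.7120, 40.8000],
    'lon':  [-74.0000, -74.0010, -74.1000],
})

ordered_list = [School(vincenty(origin, (row['lat'], row['lon'])).meters, row)
                for _, row in schools_df.iterrows()]
ordered_list.sort(key=lambda s: s.get_distance())  # sorted by distance to origin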
I'm then trying to create a pandas DataFrame from this list. The overall goal is to remove duplicate schools, where a duplicate is a school within 1600 meters of another that also has a similar name. The code that removes duplicates is the following:
import pandas as pd
from tqdm import trange
from geopy.distance import vincenty

def get_duplicates(ordered_list):
    total_dups = 0
    newDataFrame = pd.DataFrame()
    for i in trange(len(ordered_list) - 1):
        newDataFrame = newDataFrame.append(ordered_list[i].get_row())
        ite = i + 1
        # The list is sorted by distance to the origin, so stop scanning
        # forward once the distance-to-origin gap reaches 1600 m.
        while ite < len(ordered_list) and abs(ordered_list[i].get_distance() - ordered_list[ite].get_distance()) < 1600:
            if vincenty(ordered_list[i].get_lat_lon(), ordered_list[ite].get_lat_lon()).meters < 1600:
                if fuzzy_match(ordered_list[i].get_name(), ordered_list[ite].get_name()):
                    total_dups += 1  # it's a match, don't add
                else:
                    # within distance, but the name doesn't match
                    newDataFrame = newDataFrame.append(ordered_list[ite].get_row())
            else:
                # not within distance
                newDataFrame = newDataFrame.append(ordered_list[ite].get_row())
            ite += 1
    print(total_dups)
    return newDataFrame
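I suspect the repeated newDataFrame.append calls are part of the problem, since as far as I understand each append copies the whole frame. Would collecting the rows in a plain Python list and building the DataFrame once at the end help? A sketch of what I mean (untested):

def get_duplicates_list(ordered_list):
    total_dups = 0
    rows = []  # accumulate rows here; build the DataFrame once at the end
    for i in trange(len(ordered_list) - 1):
        rows.append(ordered_list[i].get_row())
        ite = i + 1
        while ite < len(ordered_list) and abs(ordered_list[i].get_distance() - ordered_list[ite].get_distance()) < 1600:
            if vincenty(ordered_list[i].get_lat_lon(), ordered_list[ite].get_lat_lon()).meters < 1600:
                if fuzzy_match(ordered_list[i].get_name(), ordered_list[ite].get_name()):
                    total_dups += 1
                else:
                    rows.append(ordered_list[ite].get_row())
            else:
                rows.append(ordered_list[ite].get_row())
            ite += 1
    print(total_dups)
    return pd.DataFrame(rows)  # one construction instead of repeated copies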
vincenty is from geopy.distance
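For reference, it's called with two (lat, lon) pairs and the result exposes a .meters attribute (note this is an older geopy; vincenty was removed in geopy 2.0 in favor of geodesic):

from geopy.distance import vincenty

d = vincenty((40.7128, -74.0060), (40.7060, -73.9969)).meters  # distance between the two points, in meters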
fuzzy_match is:

from nltk import stem, tokenize
from nltk.metrics import edit_distance

stemmer = stem.PorterStemmer()

def normalize(s):
    words = tokenize.wordpunct_tokenize(s.lower().strip())
    return ' '.join([stemmer.stem(w) for w in words])

def fuzzy_match(s1, s2, max_dist=3):
    return edit_distance(normalize(s1), normalize(s2)) <= max_dist
edit_distance is from nltk.metrics
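To illustrate the matching behavior (Porter stemming collapses simple suffix differences before the edit distance of up to 3 is applied):

fuzzy_match('Oakwood School', 'Oakwood Schools')   # True: both normalize to 'oakwood school'
fuzzy_match('Oakwood School', 'Ridgeview School')  # False: well over 3 edits apart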
What am I doing wrong that is causing this to take hours? Is there a way to optimize this? Thanks!