I have the following code:
import pandas as pd
import os
import jellyfish
import numpy as np
# Toy dataset: a single column 'c' of short names (note 'france' appears twice).
a = {
    'c': [
        'dog', 'cat', 'tree', 'slow', 'fast', 'hurry',
        'hello', 'world', 'germany', 'france', 'rahul', 'india',
        'pakisthan', 'bangla', 'australia', 'newzealand',
        'united kingdom', 'france', 'spain', 'belgium',
        'bangladesh', 'west indies', 'USA', 'canada',
        'afghanisthan', 'columbia', 'tamilnadu', 'telangana', 'hyderabad',
        'khanapur', 'warangal',
    ],
}
# One-column frame consumed by Distance.dist_matrix further down.
df = pd.DataFrame(data=a)
The following code is organized as a class:
class Distance:
    """Build a pairwise Jaro-Winkler distance matrix block by block.

    The unique names are split into partitions; the similarity of every
    pair of partitions (upper triangle only) is computed and saved to its
    own ``.npy`` file in the current working directory, and the blocks are
    finally stitched together into one matrix.

    NOTE(review): cache files are keyed only by partition indices, so
    files left over from a run with different data or a different
    partition size are reused as-is. Clear them when the inputs change.
    """

    def __init__(self, partitions):
        """Store the partition size.

        Args:
            partitions: Number of names per partition (it is passed to
                ``partitionlist`` as the chunk length).
        """
        self.partitions = partitions

    @staticmethod
    def _partition_filename(row, col):
        """Return the cache file name for the block (row, col)."""
        return 'result_for_partition' + str(row) + '_' + str(col) + '.npy'

    def partitionlist(self, list_ofelements, num_of_divisions):
        """Yield consecutive chunks of ``list_ofelements``.

        Args:
            list_ofelements: One-dimensional sequence (list or NumPy array).
            num_of_divisions: Length of each chunk; the last chunk may be
                shorter.

        Yields:
            Slices of ``list_ofelements`` of at most ``num_of_divisions``
            items.
        """
        # len() instead of .size so plain lists work as well as arrays.
        for start in range(0, len(list_ofelements), num_of_divisions):
            yield list_ofelements[start:start + num_of_divisions]

    def fuzzy_match(self, terms):
        """Return the Jaro-Winkler similarity of the strings in ``terms``.

        Args:
            terms: Iterable holding the two strings to compare (a
                one-element set is accepted for a string paired with
                itself).

        Returns:
            ``1`` when all terms are identical, otherwise the similarity
            reported by jellyfish.
        """
        # Deduplicate and sort so the argument order is deterministic and
        # (a, b) scores exactly like (b, a); set iteration order alone
        # depends on string hashing.
        pair = sorted(set(terms))
        if len(pair) == 1:
            return 1
        # Newer jellyfish releases expose the function as
        # jaro_winkler_similarity; older ones only have jaro_winkler.
        similarity = getattr(jellyfish, 'jaro_winkler_similarity', None)
        if similarity is None:
            similarity = jellyfish.jaro_winkler
        return similarity(*pair)

    def distance_measure(self, x, y):
        """Compute the similarity block between two partitions.

        Args:
            x: Sequence of strings (rows of the block).
            y: Sequence of strings (columns of the block).

        Returns:
            ``float16`` array of shape ``(len(x), len(y))`` rounded to two
            decimals; entry ``[r, c]`` is the similarity of ``x[r]`` and
            ``y[c]``.
        """
        # Iterate the pairs lazily in row-major order instead of
        # materialising repeated/tiled copies of both partitions.
        scores = [self.fuzzy_match((left, right)) for left in x for right in y]
        grid = np.array(scores, dtype=float).reshape(len(x), len(y))
        return np.around(grid.astype('float16'), 2)

    def dist_calculation(self, list_of_companies):
        """Compute and cache every upper-triangle block that is missing.

        Args:
            list_of_companies: List of partitions (sequences of strings).

        Returns:
            ``self``, so calls can be chained.
        """
        total = len(list_of_companies)
        for index, row_part in enumerate(list_of_companies):
            # Only blocks with index <= i are stored; the lower triangle
            # is recovered by transposition in read_distances.
            for i in range(index, total):
                filename = self._partition_filename(index, i)
                if os.path.exists(filename):
                    continue
                print('for partition: ' + str(index) + str('_') + str(i))
                partition_result = self.distance_measure(
                    row_part, list_of_companies[i])
                np.save(filename, partition_result)
        return self

    def read_distances(self, list_of_companies):
        """Assemble the cached blocks into the full distance matrix.

        The assembled similarity matrix is saved as ``jaro_distance.npy``
        and reused on later calls if that file already exists.

        Args:
            list_of_companies: List of partitions, as given to
                ``dist_calculation``.

        Returns:
            Array of distances, computed as ``1 - similarity``.
        """
        fullfilename = 'jaro_distance' + '.npy'
        if not os.path.exists(fullfilename):
            total = len(list_of_companies)
            if total == 0:
                # Nothing to assemble; do not write an empty cache file.
                return np.empty((0, 0), dtype='float16')
            block_rows = []
            for index in range(total):
                blocks = []
                for i in range(total):
                    if index <= i:
                        block = np.load(self._partition_filename(index, i))
                    else:
                        # Lower-triangle block: transpose of its mirror.
                        block = np.load(self._partition_filename(i, index)).T
                    blocks.append(block)
                block_rows.append(np.hstack(blocks))
            # Stack once at the end rather than re-copying all previous
            # rows on every iteration.
            matrix = np.vstack(block_rows)
            np.save(fullfilename, matrix)
            print("saved in:")
            print(fullfilename)
        return 1 - np.load(fullfilename)

    def dist_matrix(self, dataset):
        """Compute the distance matrix for the unique values of column 'c'.

        Args:
            dataset: Object whose ``dataset['c'].unique()`` returns the
                names to compare (e.g. a pandas DataFrame).

        Returns:
            Square array of pairwise distances between the unique names.
        """
        company_names = dataset['c'].unique()
        print('length of unique company name:', company_names.size)
        elements_list = list(
            self.partitionlist(company_names, self.partitions))
        self.dist_calculation(elements_list)
        return self.read_distances(elements_list)
# Split the unique names into partitions of six and build the full
# pairwise distance matrix (blocks are cached as .npy files in the
# current working directory).
obj = Distance(partitions=6)
matrix = obj.dist_matrix(dataset=df)
I am trying to compute the similarity between texts. Here I use a toy dataset, but in my real case I have a huge amount of text data for which I have to find the pairwise similarity. I could not fit the entire data in RAM, so I decided to divide the data into partitions and construct the distance matrix block by block. However, this takes too much time, and I want to make it run faster using multiprocessing. I tried a few methods, but I keep running into a deadlock. I also found the Ray library, but I could not integrate it into this code (I tried, but failed to reduce the running time). Is there any way to optimize this in terms of processing time?