Multiprocessing for partitions of data

Question

I have the following code:

import pandas as pd
import os
import jellyfish
import numpy as np
# Toy sample: one column 'c' holding the strings to compare pairwise
# (note that 'france' appears twice).
a = {
    'c': [
        'dog', 'cat', 'tree', 'slow', 'fast', 'hurry',
        'hello', 'world', 'germany', 'france', 'rahul', 'india',
        'pakisthan', 'bangla', 'australia', 'newzealand',
        'united kingdom', 'france', 'spain', 'belgium',
        'bangladesh', 'west indies', 'USA', 'canada', 'afghanisthan',
        'columbia', 'tamilnadu', 'telangana', 'hyderabad',
        'khanapur', 'warangal',
    ],
}
df = pd.DataFrame(data=a)

The processing logic is organized in the following class:

class Distance:
    """Build a pairwise Jaro-Winkler distance matrix block by block.

    The unique values of a DataFrame column are cut into chunks of
    ``partitions`` elements.  For every chunk pair ``(p, q)`` with ``p <= q``
    the similarity block is written to ``result_for_partition<p>_<q>.npy`` in
    the current working directory; the blocks are afterwards stitched into
    ``jaro_distance.npy``.  Files that already exist are reused rather than
    recomputed.
    """

    def __init__(self, partitions):
        """Store the chunk size (number of elements per partition)."""
        self.partitions = partitions

    @staticmethod
    def _block_filename(row, col):
        """Return the .npy file name of the block for partition pair (row, col)."""
        return 'result_for_partition{}_{}.npy'.format(row, col)

    def partitionlist(self, list_ofelements, num_of_divisions):
        """Yield consecutive slices of ``num_of_divisions`` elements.

        ``list_ofelements`` must expose ``.size`` and slicing (e.g. a NumPy
        array); the last slice may be shorter than the others.
        """
        for start in range(0, list_ofelements.size, num_of_divisions):
            yield list_ofelements[start:start + num_of_divisions]

    def fuzzy_match(self, terms):
        """Return the Jaro-Winkler similarity of the two strings in ``terms``.

        A collection holding a single term (both strings were identical and
        collapsed into one) scores 1 without calling jellyfish.
        """
        if len(terms) == 1:
            return 1
        # NOTE(review): newer jellyfish releases expose this metric as
        # ``jaro_winkler_similarity``; fall back to the legacy name otherwise.
        similarity = getattr(jellyfish, 'jaro_winkler_similarity', None)
        if similarity is None:
            similarity = jellyfish.jaro_winkler
        return similarity(*terms)

    def distance_measure(self, x, y):
        """Return the ``len(x)`` x ``len(y)`` similarity block of ``x`` vs ``y``.

        Entry ``[i, j]`` is the similarity of ``x[i]`` and ``y[j]``, stored as
        float16 and rounded to two decimals.
        """
        # Ordered tuples (instead of a frozenset) keep the argument order
        # passed to jellyfish deterministic across runs.
        rows = [
            [self.fuzzy_match((a,) if a == b else (a, b)) for b in y]
            for a in x
        ]
        block = np.array(rows, dtype=float).reshape(len(x), len(y))
        return np.around(block.astype('float16'), 2)

    def dist_calculation(self, list_of_companies):
        """Compute and save every missing upper-triangle block; return ``self``.

        Only pairs ``(index, i)`` with ``index <= i`` are computed, because
        ``read_distances`` obtains the lower triangle by transposition.
        """
        # Every block has a unique file name, so one directory listing taken
        # up front is enough to know which blocks are already on disk.
        existing = set(os.listdir())
        count = len(list_of_companies)
        for index in range(count):
            for i in range(index, count):
                filename = self._block_filename(index, i)
                if filename in existing:
                    continue
                print('for partition: ' + str(index) + str('_') + str(i))
                partition_result = self.distance_measure(
                    list_of_companies[index], list_of_companies[i])
                np.save(filename, partition_result)
        return self

    def read_distances(self, list_of_companies):
        """Assemble the saved blocks and return the distance matrix.

        The stitched similarity matrix is cached in ``jaro_distance.npy``;
        when that file already exists it is loaded instead of rebuilt.  The
        returned array is ``1 - similarity``.

        Raises:
            ValueError: if ``list_of_companies`` is empty and no cached
                matrix exists.
        """
        fullfilename = 'jaro_distance' + '.npy'
        if os.path.exists(fullfilename):
            return 1 - np.load(fullfilename)

        count = len(list_of_companies)
        if count == 0:
            raise ValueError('cannot build a distance matrix from zero partitions')

        block_rows = []
        for index in range(count):
            blocks = []
            for i in range(count):
                if index <= i:
                    blocks.append(np.load(self._block_filename(index, i)))
                else:
                    # Lower triangle: reuse the mirrored block, transposed.
                    blocks.append(np.load(self._block_filename(i, index)).T)
            block_rows.append(np.hstack(blocks))

        # Stack once at the end instead of re-concatenating a growing matrix.
        matrix = np.vstack(block_rows)
        np.save(fullfilename, matrix)
        print("saved in:")
        print(fullfilename)
        return 1 - matrix

    def dist_matrix(self, dataset, column='c'):
        """Return the distance matrix over the unique values of ``dataset[column]``.

        Args:
            dataset: object whose ``dataset[column]`` offers ``.unique()``
                (e.g. a pandas DataFrame).
            column: name of the column holding the strings; defaults to 'c'.
        """
        company_names = dataset[column].unique()
        print('length of unique company name:', company_names.size)
        elements_list = list(self.partitionlist(company_names, self.partitions))
        print(elements_list)
        self.dist_calculation(elements_list)
        distance_mat = self.read_distances(elements_list)
        return distance_mat


# Cut the unique values into chunks of 6 and build the full distance matrix.
obj = Distance(partitions=6)
matrix = obj.dist_matrix(dataset=df)

I am trying to identify the similarity between texts. I used a toy dataset here, but in my real case I have a huge amount of text data for which I have to compute the similarity. I could not fit the entire data in RAM, so I decided to divide the data into partitions and construct the distance matrix block by block. However, this takes too much time, and I want to make it run faster using multiprocessing. I tried a few methods, but I keep running into a deadlock. I also found the Ray library, but could not integrate it into this code (I tried, but it did not reduce the run time). Is there any way to optimize the processing?

have you considered taking your approach to the cloud? eventually we all reach limitations on our local machines. — gold_cy, Jun 18 '19 at 19:23
no not yet but wanted to see how fast it takes if i use multiprocessing methods — Vas, Jun 18 '19 at 19:25

Multiprocessing for partitions of data

0 Answers