I have a problem where a need to calculate cosine similarities
between a numpy array of shape (1, 300) and a matrix of shape (5000000, 300). I have tried multiple different flavors of codes and now I am wondering if there is a way to reduce the run time substantially :
Version 1 : I divide my big matrix into 5 smaller matrix of size 1Mil each:
from scipy import spatial
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
def cos_matrix_multiplication(vector,matrix_1):
v = vector.reshape(1, -1)
scores1=spatial.distance.cdist(matrix_1, v, 'cosine')
return((scores1[:1]))
pool = ThreadPoolExecutor(8)
URLS=[mat_small1,mat_small2,mat_small3,mat_small4,mat_small5]
neighbors=[]
with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
# Start the load operations and mark each future with its URL
future_to_url = {executor.submit(cos_matrix_multiplication,vec,mat_col): mat_col for mat_col in URLS}
for future in concurrent.futures.as_completed(future_to_url):
url = future_to_url[future]
data = future.result()
neighbors.append(data)
Runtime : 2.48secs
Version 2: Using Numba jit : inspired by this SO answer
@numba.jit('void(f4, f4)',nogil=True)
def cosine_sim(A,B):
scores = np.zeros(A.shape[0])
for i in range(A.shape[0]):
v = A[i]
m = B.shape[1]
udotv = 0
u_norm = 0
v_norm = 0
for j in range(m):
udotv += B[0][j] * v[j]
u_norm += B[0][j] * B[0][j]
v_norm += v[j] * v[j]
ratio = udotv/((u_norm*v_norm)**0.5)
scores[i] = ratio
i += 1
return scores
cosine_sim(matrix,vec)
Runtime 2.34 secs
Version 3: Using Cuda jit (can't really get to work in a reproducible manner each time)
@cuda.jit
def cosine_sim(A,B,C):
#scores = np.zeros(A.shape[0])
for i in range(A.shape[0]):
v = A[i]
m = B.shape[1]
udotv = 0
u_norm = 0
v_norm = 0
for j in range(m):
udotv += B[0][j] * v[j]
u_norm += B[0][j] * B[0][j]
v_norm += v[j] * v[j]
u_norm = math.sqrt(u_norm)
v_norm = math.sqrt(v_norm)
if (u_norm == 0) or (v_norm == 0):
ratio = 1.0
else:
ratio = udotv / (u_norm * v_norm)
C[i,1] = ratio
i += 1
matrix = mat_small1
A_global_mem = cuda.to_device(matrix)
B_global_mem = cuda.to_device(vec)
C_global_mem = cuda.device_array((matrix.shape[0], 1))
threadsperblock = (16, 16)
blockspergrid_x = int(math.ceil(A_global_mem.shape[0] / threadsperblock[0]))
blockspergrid_y = int(math.ceil(B_global_mem.shape[1] / threadsperblock[1]))
blockspergrid = (blockspergrid_x, blockspergrid_y)
cosine_sim[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)
C = C_global_mem.copy_to_host()
results in :
CudaAPIError: [702] Call to cuMemcpyDtoH results in CUDA_ERROR_LAUNCH_TIMEOUT
The matrix is dense, and the My GPU is 8gb ram, and total size of the matrix is about 4.7gb. Can a GPU expedite this at all?