I want to calculate the overall proximity of two matrices. The matrices are very large. If we do a pairwise calculation, aren't we just calculating the distance for each number in each position individually? Would summing the values make sense? Would something in between, like KNN, make sense here?
I have Euclidean distance and cosine similarity, but they are pairwise. I want to compare all the vectors at the same time.
def euclidean_distance_of_two_matrices(x, y):
    """Print and return the pairwise Euclidean distances between x and y.

    For each argument, the values of its ``'topics'`` and ``'tags'``
    mappings are stacked (topics first, then tags) into one 2-D array,
    one vector per row.

    Args:
        x: Mapping with ``'topics'`` and ``'tags'`` entries, each itself a
            mapping whose values are vectors.
            NOTE(review): assumes all vectors have the same length so they
            stack into a 2-D array -- confirm against the caller.
        y: Same structure as ``x``; supplies the second matrix.

    Returns:
        The matrix produced by ``scipy.spatial.distance_matrix(A, B)``:
        entry ``[i, j]`` is the distance between row ``i`` of ``x``'s
        matrix and row ``j`` of ``y``'s matrix.
    """
    A = np.concatenate(
        [np.array(list(x[key].values())) for key in ('topics', 'tags')]
    )
    print(A)
    # Build B from y. Previously this read from x, so the function always
    # compared x with itself (identical A/B and a zero diagonal).
    B = np.concatenate(
        [np.array(list(y[key].values())) for key in ('topics', 'tags')]
    )
    print(B)
    distances = scipy.spatial.distance_matrix(A, B)
    print("similarity pairwise here")
    print(distances)
    return distances
Returns:
[[-0.66103 0.27502 -0.4007 ... -1.2427 0.2829 -0.79741 ]
[-0.27628 0.13999 0.098519 ... -0.15686 -0.14187 -0.26488 ]
[-0.11585 -0.05561 0.32372 ... -0.22155 -0.30258 -0.26258 ]
[-0.68621 -0.21032 0.30084 ... -0.038338 -0.44363 0.17988 ]
[ 0.067032 -0.10813 0.44981 ... -0.15073 -0.25662 0.08055 ]
[-0.16147 0.040132 0.66291 ... -0.41689 0.0051422 0.6892 ]]
[[-0.66103 0.27502 -0.4007 ... -1.2427 0.2829 -0.79741 ]
[-0.27628 0.13999 0.098519 ... -0.15686 -0.14187 -0.26488 ]
[-0.11585 -0.05561 0.32372 ... -0.22155 -0.30258 -0.26258 ]
[-0.68621 -0.21032 0.30084 ... -0.038338 -0.44363 0.17988 ]
[ 0.067032 -0.10813 0.44981 ... -0.15073 -0.25662 0.08055 ]
[-0.16147 0.040132 0.66291 ... -0.41689 0.0051422 0.6892 ]]
0.0
similarity pairwise here
[[ 0. 9.25525143 9.31467432 10.85744589 8.96540006 10.48973378]
[ 9.25525143 0. 7.67091186 9.85484951 7.510602 7.80745963]
[ 9.31467432 7.67091186 0. 9.47533533 7.48946392 8.78024467]
[10.85744589 9.85484951 9.47533533 0. 9.18870369 9.19037631]
[ 8.96540006 7.510602 7.48946392 9.18870369 0. 7.85130008]
[10.48973378 7.80745963 8.78024467 9.19037631 7.85130008 0. ]]
def cosine_similarity_distance_of_two_matrices(x, y):
    """Print and return the pairwise cosine similarities between x and y.

    For each argument, the values of its ``'topics'`` and ``'tags'``
    mappings are stacked (topics first, then tags) into one 2-D array,
    one vector per row.

    Args:
        x: Mapping with ``'topics'`` and ``'tags'`` entries, each itself a
            mapping whose values are vectors.
            NOTE(review): assumes all vectors have the same length so they
            stack into a 2-D array -- confirm against the caller.
        y: Same structure as ``x``; supplies the second matrix.

    Returns:
        The result of ``cosine_similarity(A, B)``.
        NOTE(review): presumably sklearn's pairwise ``cosine_similarity``,
        where entry ``[i, j]`` compares row ``i`` of A with row ``j`` of B
        -- confirm against the file's imports.
    """
    A = np.concatenate(
        [np.array(list(x[key].values())) for key in ('topics', 'tags')]
    )
    print(A)
    # Build B from y. Previously this read from x, so the function always
    # compared x with itself (identical A/B and a diagonal of ~1.0).
    B = np.concatenate(
        [np.array(list(y[key].values())) for key in ('topics', 'tags')]
    )
    print(B)
    similarity = cosine_similarity(A, B)
    print("similarity pairwise here")
    print(similarity)
    return similarity
Returns:
[[-0.66103 0.27502 -0.4007 ... -1.2427 0.2829 -0.79741 ]
[-0.27628 0.13999 0.098519 ... -0.15686 -0.14187 -0.26488 ]
[-0.11585 -0.05561 0.32372 ... -0.22155 -0.30258 -0.26258 ]
[-0.68621 -0.21032 0.30084 ... -0.038338 -0.44363 0.17988 ]
[ 0.067032 -0.10813 0.44981 ... -0.15073 -0.25662 0.08055 ]
[-0.16147 0.040132 0.66291 ... -0.41689 0.0051422 0.6892 ]]
[[-0.66103 0.27502 -0.4007 ... -1.2427 0.2829 -0.79741 ]
[-0.27628 0.13999 0.098519 ... -0.15686 -0.14187 -0.26488 ]
[-0.11585 -0.05561 0.32372 ... -0.22155 -0.30258 -0.26258 ]
[-0.68621 -0.21032 0.30084 ... -0.038338 -0.44363 0.17988 ]
[ 0.067032 -0.10813 0.44981 ... -0.15073 -0.25662 0.08055 ]
[-0.16147 0.040132 0.66291 ... -0.41689 0.0051422 0.6892 ]]
similarity pairwise here
[[ 0.99999964 0.2620098 0.20013984 0.0369565 0.16275422 -0.0098459 ]
[ 0.2620098 1.0000001 0.38454026 0.11589497 0.32478386 0.36860135]
[ 0.20013984 0.38454026 0.99999976 0.12082676 0.24987505 0.12946124]
[ 0.0369565 0.11589497 0.12082676 0.9999998 0.0517699 0.18130727]
[ 0.16275422 0.32478386 0.24987505 0.0517699 0.99999976 0.18552886]
[-0.0098459 0.36860135 0.12946124 0.18130727 0.18552886 0.9999999 ]]