
I want to calculate the overall proximity of two matrices. The matrices are very large. If we do a pairwise calculation, aren't we just calculating distances individually for each number in each position? Would summing the values make sense? Would something in between, like KNN, make sense here?

I have Euclidean distance and cosine similarity, but they are pairwise. I want to compare all the vectors at the same time.

import numpy as np
import scipy.spatial

def euclidean_distance_of_two_matrices(x, y):
    # Stack the topic and tag vectors into one matrix per input.
    A = np.concatenate([np.array(list(x['topics'].values())), np.array(list(x['tags'].values()))])
    print(A)
    B = np.concatenate([np.array(list(x['topics'].values())), np.array(list(x['tags'].values()))])
    print(B)
    print("similarity pairwise here")
    print(scipy.spatial.distance_matrix(A, B))

Returns:
[[-0.66103    0.27502   -0.4007    ... -1.2427     0.2829    -0.79741  ]
 [-0.27628    0.13999    0.098519  ... -0.15686   -0.14187   -0.26488  ]
 [-0.11585   -0.05561    0.32372   ... -0.22155   -0.30258   -0.26258  ]
 [-0.68621   -0.21032    0.30084   ... -0.038338  -0.44363    0.17988  ]
 [ 0.067032  -0.10813    0.44981   ... -0.15073   -0.25662    0.08055  ]
 [-0.16147    0.040132   0.66291   ... -0.41689    0.0051422  0.6892   ]]
[[-0.66103    0.27502   -0.4007    ... -1.2427     0.2829    -0.79741  ]
 [-0.27628    0.13999    0.098519  ... -0.15686   -0.14187   -0.26488  ]
 [-0.11585   -0.05561    0.32372   ... -0.22155   -0.30258   -0.26258  ]
 [-0.68621   -0.21032    0.30084   ... -0.038338  -0.44363    0.17988  ]
 [ 0.067032  -0.10813    0.44981   ... -0.15073   -0.25662    0.08055  ]
 [-0.16147    0.040132   0.66291   ... -0.41689    0.0051422  0.6892   ]]
0.0
similarity pairwise here
[[ 0.          9.25525143  9.31467432 10.85744589  8.96540006 10.48973378]
 [ 9.25525143  0.          7.67091186  9.85484951  7.510602    7.80745963]
 [ 9.31467432  7.67091186  0.          9.47533533  7.48946392  8.78024467]
 [10.85744589  9.85484951  9.47533533  0.          9.18870369  9.19037631]
 [ 8.96540006  7.510602    7.48946392  9.18870369  0.          7.85130008]
 [10.48973378  7.80745963  8.78024467  9.19037631  7.85130008  0.        ]]
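
For reference, a minimal sketch of two ways to reduce this to a single number, assuming "overall proximity" means one scalar per pair of matrices (the data below is a hypothetical stand-in for the 6-row matrices printed above):

import numpy as np
from scipy.spatial import distance_matrix

# Hypothetical stand-ins for the two 6 x d matrices printed above.
rng = np.random.default_rng(0)
A = rng.normal(size=(6, 50))
B = rng.normal(size=(6, 50))

# Option 1: reduce the pairwise distance matrix to one scalar by averaging.
pairwise = distance_matrix(A, B)
mean_pairwise = pairwise.mean()

# Option 2: skip the pairwise step and treat each matrix as one long vector
# (Frobenius norm of the difference); this assumes the rows are aligned.
frobenius = np.linalg.norm(A - B)

print(mean_pairwise, frobenius)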

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_distance_of_two_matrices(x, y):
    # Stack the topic and tag vectors into one matrix per input.
    A = np.concatenate([np.array(list(x['topics'].values())), np.array(list(x['tags'].values()))])
    print(A)
    B = np.concatenate([np.array(list(x['topics'].values())), np.array(list(x['tags'].values()))])
    print(B)
    similarity = cosine_similarity(A, B)
    print("similarity pairwise here")
    print(similarity)

Returns:
[[-0.66103    0.27502   -0.4007    ... -1.2427     0.2829    -0.79741  ]
 [-0.27628    0.13999    0.098519  ... -0.15686   -0.14187   -0.26488  ]
 [-0.11585   -0.05561    0.32372   ... -0.22155   -0.30258   -0.26258  ]
 [-0.68621   -0.21032    0.30084   ... -0.038338  -0.44363    0.17988  ]
 [ 0.067032  -0.10813    0.44981   ... -0.15073   -0.25662    0.08055  ]
 [-0.16147    0.040132   0.66291   ... -0.41689    0.0051422  0.6892   ]]
[[-0.66103    0.27502   -0.4007    ... -1.2427     0.2829    -0.79741  ]
 [-0.27628    0.13999    0.098519  ... -0.15686   -0.14187   -0.26488  ]
 [-0.11585   -0.05561    0.32372   ... -0.22155   -0.30258   -0.26258  ]
 [-0.68621   -0.21032    0.30084   ... -0.038338  -0.44363    0.17988  ]
 [ 0.067032  -0.10813    0.44981   ... -0.15073   -0.25662    0.08055  ]
 [-0.16147    0.040132   0.66291   ... -0.41689    0.0051422  0.6892   ]]
similarity pairwise here
[[ 0.99999964  0.2620098   0.20013984  0.0369565   0.16275422 -0.0098459 ]
 [ 0.2620098   1.0000001   0.38454026  0.11589497  0.32478386  0.36860135]
 [ 0.20013984  0.38454026  0.99999976  0.12082676  0.24987505  0.12946124]
 [ 0.0369565   0.11589497  0.12082676  0.9999998   0.0517699   0.18130727]
 [ 0.16275422  0.32478386  0.24987505  0.0517699   0.99999976  0.18552886]
 [-0.0098459   0.36860135  0.12946124  0.18130727  0.18552886  0.9999999 ]]
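
Similarly, if a single similarity score is wanted from the cosine matrix, one option (my assumption, not an established metric) is to average the matrix, or to flatten both matrices and take the cosine of the two flattened vectors (hypothetical data again):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical stand-ins for the two matrices printed above.
rng = np.random.default_rng(0)
A = rng.normal(size=(6, 50))
B = rng.normal(size=(6, 50))

# Average of the full pairwise cosine-similarity matrix.
mean_cosine = cosine_similarity(A, B).mean()

# Cosine similarity of the two matrices flattened into single vectors
# (only meaningful if the rows of A and B correspond to each other).
flat_cosine = cosine_similarity(A.reshape(1, -1), B.reshape(1, -1))[0, 0]

print(mean_cosine, flat_cosine)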


1 Answer

import numpy as np

def euclidean_distance_of_two_matrices(x, y):
    A = np.concatenate([np.array(list(x['topics'].values())), np.array(list(x['tags'].values()))])
    print("Matrix 1")
    print(A)
    B = np.concatenate([np.array(list(y['topics'].values())), np.array(list(y['tags'].values()))])
    print("Matrix 2")
    print(B)
    print("The Euclidean Distance Is")
    # Sum of squared element-wise differences (the squared Frobenius distance);
    # take np.sqrt of this value to get the actual Euclidean/Frobenius distance.
    dist_squared = np.sum(np.square(A - B))
    print(dist_squared)

Returns:
Matrix 1
[[-0.66103    0.27502   -0.4007    ... -1.2427     0.2829    -0.79741  ]
 [-0.27628    0.13999    0.098519  ... -0.15686   -0.14187   -0.26488  ]
 [-0.11585   -0.05561    0.32372   ... -0.22155   -0.30258   -0.26258  ]
 [-0.68621   -0.21032    0.30084   ... -0.038338  -0.44363    0.17988  ]
 [ 0.067032  -0.10813    0.44981   ... -0.15073   -0.25662    0.08055  ]
 [-0.16147    0.040132   0.66291   ... -0.41689    0.0051422  0.6892   ]]
Matrix 2
[[-0.68621   -0.21032    0.30084   ... -0.038338  -0.44363    0.17988  ]
 [-0.36459    0.11409    0.060372  ... -0.29483    0.11534   -0.25252  ]
 [-0.68088   -0.3137     0.27078   ...  0.24844   -0.043204  -0.10115  ]
 [-0.68621   -0.21032    0.30084   ... -0.038338  -0.44363    0.17988  ]
 [ 0.19509    0.23105   -0.25786   ... -0.19888   -0.060843  -0.081697 ]
 [-0.16147    0.040132   0.66291   ... -0.41689    0.0051422  0.6892   ]]
The Euclidean Distance Is
212.88983

I'm not sure what the difference is between the pairwise scipy.spatial.distance_matrix(A, B) calculation and what I used here, but this returned the correct result for me.
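
For reference, a small sketch of the difference, using hypothetical data: scipy.spatial.distance_matrix(A, B) returns an n x n matrix of distances between every pair of rows, while the sum of squared element-wise differences above is the squared Frobenius distance, whose square root equals np.linalg.norm(A - B):

import numpy as np
from scipy.spatial import distance_matrix

rng = np.random.default_rng(1)
A = rng.normal(size=(6, 50))  # hypothetical data in place of the real vectors
B = rng.normal(size=(6, 50))

# Pairwise: a 6 x 6 matrix where entry (i, j) is the distance between
# row i of A and row j of B.
pairwise = distance_matrix(A, B)
print(pairwise.shape)  # (6, 6)

# Whole-matrix: one number treating each matrix as a single flattened vector.
dist_squared = np.sum(np.square(A - B))  # what the answer above computes
frobenius = np.linalg.norm(A - B)        # Euclidean (Frobenius) distance
print(np.isclose(frobenius, np.sqrt(dist_squared)))                    # True
print(np.isclose(frobenius, np.sqrt(np.sum(np.diag(pairwise) ** 2))))  # True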

  • I believe the real solution I was searching for is adding the vectors together, like so: https://stackoverflow.com/q/45651644/7314286 and https://stackoverflow.com/q/29760935/7314286 – Stackaccount1 Feb 24 '21 at 05:58
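
A rough sketch of that approach, under the assumption that the rows above are word embeddings: sum or average the vectors to get one vector per document, then compare the two document vectors directly.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.default_rng(2)
doc1 = rng.normal(size=(6, 50))  # hypothetical word vectors for document 1
doc2 = rng.normal(size=(6, 50))  # hypothetical word vectors for document 2

# Collapse each set of vectors into one document vector by averaging
# (summing behaves identically under cosine similarity, since scale cancels).
v1 = doc1.mean(axis=0)
v2 = doc2.mean(axis=0)

similarity = cosine_similarity(v1.reshape(1, -1), v2.reshape(1, -1))[0, 0]
print(similarity)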