I have two methods for calculating the unit vector of an array, both of which handle sparse arrays. One of them is very much a 'manual' computation, whereas the other is more 'formal' (from the gensim.matutils source code).
This is the manual method:
def manual_unitvec(vec):
    """Scale a vector to unit (L2) length.

    Parameters
    ----------
    vec : {scipy.sparse matrix, numpy.ndarray, array-like}
        Input vector. Sparse input is converted to CSR; anything else is
        treated as a dense array.

    Returns
    -------
    {scipy.sparse.csr_matrix, numpy.ndarray}
        A new, scaled object. The input is never modified in place.

    Notes
    -----
    A zero vector is returned unchanged instead of being divided by zero.
    """
    if sparse.issparse(vec):
        # Only sparse matrices have ``tocsr``; convert after the type check
        # so dense input can reach the dense branch below.
        vec = vec.tocsr()
        length = np.sqrt(vec.multiply(vec).sum())
        if length == 0:
            return vec
        return vec.multiply(1. / length)

    dense = np.asarray(vec)
    length = np.sqrt(np.sum(dense ** 2))
    if length == 0:
        return dense
    # Out-of-place division: leaves the caller's array untouched and also
    # works for integer dtypes, where in-place true division raises.
    return dense / length
This is the modified gensim method; the function that explicitly computes the unit vector is unitvec:
import numpy as np
from scipy import sparse
from gensim.matutils import ret_normalized_vec, blas
import scipy.sparse
# BLAS routines looked up once at import time through gensim's `blas` helper.
# The empty float array presumably selects the double-precision variants
# (TODO confirm against gensim.matutils.blas).
# `unitvec` uses nrm2 for the dense l2 norm and scal to scale dense vectors.
blas_nrm2 = blas('nrm2', np.array([], dtype=float))
blas_scal = blas('scal', np.array([], dtype=float))
def unitvec(vec, norm='l2'):
    """Scale a vector to unit length.

    Parameters
    ----------
    vec : {numpy.ndarray, scipy.sparse, list of (int, float)}
        Input vector in any format
    norm : {'l1', 'l2'}, optional
        Normalization that will be used.

    Returns
    -------
    {numpy.ndarray, scipy.sparse, list of (int, float)}
        Normalized vector in same format as `vec`.

    Raises
    ------
    ValueError
        If `norm` is not 'l1' or 'l2', or if `vec` is of an unsupported type.

    Notes
    -----
    Zero-vector will be unchanged.
    Sparse input is not modified; a scaled copy is returned.

    """
    # Local import: `math` is needed for the gensim-format l2 norm below and
    # is not among the module-level imports.
    import math

    if norm not in ('l1', 'l2'):
        raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm)

    if scipy.sparse.issparse(vec):
        print("INSIDE SPARSE HANDLING")
        vec = vec.tocsr()
        if norm == 'l1':
            veclen = np.sum(np.abs(vec.data))
        else:
            veclen = np.sqrt(np.sum(vec.data ** 2))
        if veclen > 0.0:
            # `np.int` / `np.float` are deprecated aliases that newer NumPy
            # removed; `np.integer` also covers unsigned integer dtypes.
            if np.issubdtype(vec.dtype, np.integer):
                scaled = vec.astype(float)
            else:
                # Copy first so the caller's matrix is not scaled in place;
                # dividing `.data` keeps the original floating dtype.
                scaled = vec.copy()
            scaled.data /= veclen
            return scaled
        return vec

    if isinstance(vec, np.ndarray):
        print("INSIDE NORMAL VEC HANDLING")
        vec = np.asarray(vec, dtype=vec.dtype)
        if norm == 'l1':
            veclen = np.sum(np.abs(vec))
        else:
            veclen = blas_nrm2(vec)
        if veclen > 0.0:
            # Integer vectors must be promoted before scaling; both branches
            # of the original then returned the same expression.
            if np.issubdtype(vec.dtype, np.integer):
                vec = vec.astype(float)
            return blas_scal(1.0 / veclen, vec).astype(vec.dtype)
        return vec

    try:
        first = next(iter(vec))  # is there at least one element?
    except StopIteration:
        return vec

    if isinstance(first, (tuple, list)) and len(first) == 2:  # gensim sparse format
        print("INSIDE GENSIM SPARSE FORMAT HANDLING")
        if norm == 'l1':
            length = float(sum(abs(val) for _, val in vec))
        else:
            length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
        assert length > 0.0, "sparse documents must not contain any explicit zero entries"
        return ret_normalized_vec(vec, length)

    raise ValueError("unknown input type")
When running tests, I want to check that the output from each of these methods is the same. Below is a snippet of example code:
# Build a 3x3 float32 CSR matrix and normalise it with both implementations.
vec = sparse.csr_matrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).astype(np.float32)
output1 = manual_unitvec(vec)
output2 = unitvec(vec)
print(output1)
print(' ')
print(output2)
# NOTE(review): np.array_equal is meant for dense ndarrays and both outputs
# here are scipy sparse matrices. A sparse-aware check would be e.g.
# (output1 != output2).nnz == 0, or np.allclose(output1.toarray(), output2.toarray()).
print(np.array_equal(output1, output2))
# Compares only the container types, not the stored values.
print(type(output1) == type(output2))
So what I want to check is assertTrue(output1, output2). This does not work because the truth value of an array is ambiguous, so I use assertTrue(np.array_equal(output1, output2)) instead.
Now the issue is that array_equal does not view output1 and output2 as being the same, even though I can see from printing them out that they are identical.
Running all of the code above gives the following output:
MacBook-Air:matutils.unitvec Olly$ python try.py
INSIDE SPARSE HANDLING
try.py:80: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
if np.issubdtype(vec.dtype, np.int) == True:
(0, 0) 0.059234887
(0, 1) 0.118469775
(0, 2) 0.17770466
(1, 0) 0.23693955
(1, 1) 0.29617444
(1, 2) 0.35540932
(2, 0) 0.4146442
(2, 1) 0.4738791
(2, 2) 0.53311396
(0, 0) 0.059234887
(0, 1) 0.118469775
(0, 2) 0.17770466
(1, 0) 0.23693955
(1, 1) 0.29617444
(1, 2) 0.35540932
(2, 0) 0.4146442
(2, 1) 0.4738791
(2, 2) 0.53311396
/Users/Olly/anaconda2/lib/python2.7/site-packages/scipy/sparse/compressed.py:226: SparseEfficiencyWarning: Comparing sparse matrices using == is inefficient, try using != instead.
" != instead.", SparseEfficiencyWarning)
False
True
I thought that the issue might have come from the sparse array types, but as you can see, they are equal. You can also visually see that the elements are exactly the same.
So why is array_equal returning false? How can I change it?