reducing loops with numpy

Question

we are trying to implement the given Modified Gram Schmidt algorithm: instructions here

We first tried to implement lines 5-7 in the next way:

for j in range(i+1, N):
    R[i, j] = np.matmul(Q[:, i].transpose(), U[:, j])
    u = U[:, j] - R[i, j] * Q[:, i]
    U[:, j] = u

In order to reduce running time we tried to replace the loop with matrix operations like this:

# we changed the inner loop to matrix operations in order to improve running time
R[i, i + 1:] = np.matmul(Q[:, i] , U[:, i + 1:])
U[:, i + 1:] = U[:, i + 1:] - R[i, i + 1:] * np.transpose(np.tile(Q[:, i], (N - i - 1, 1)))

The results are not the same, but very similar. Is there a problem with our second trial?

Thanks!

Edit: The full functions are:

def gram_schmidt2(A):
    """
    decomposes a matrix A ∈ R into a product A = QR of an
    orthogonal matrix Q (i.e. QTQ = I) and an upper triangular matrix R (i.e. entries below
    the main diagonal are zero)

    :return: Q,R
    """
    N = np.shape(A)[0]
    U = A.copy()
    Q = np.zeros((N, N), dtype=np.float64)
    R = np.zeros((N, N), dtype=np.float64)
    for i in range(N):
        R[i, i] = np.linalg.norm(U[:, i])
        # Handling devision by zero by exiting the program as was advised in the forum
        if R[i, i] == 0:
            zero_devision_error(gram_schmidt._name_)
        Q[:, i] = np.divide(U[:, i], R[i, i])
        # we changed the inner loop to matrix operatins in oreder to improve running time
        for j in range(i+1, N):
            R[i, j] = np.matmul(Q[:, i].transpose(), U[:, j])
            u = U[:, j] - R[i, j] * Q[:, i]
            U[:, j] = u
    return Q, R

and:

def gram_schmidt1(A):
    """
    decomposes a matrix A ∈ R into a product A = QR of an
    orthogonal matrix Q (i.e. QTQ = I) and an upper triangular matrix R (i.e. entries below
    the main diagonal are zero)

    :return: Q,R
    """
    N = np.shape(A)[0]
    U = A.copy()
    Q = np.zeros((N, N), dtype=np.float64)
    R = np.zeros((N, N), dtype=np.float64)
    for i in range(N):
        R[i, i] = np.linalg.norm(U[:, i])
        # Handling devision by zero by exiting the program as was advised in the forum
        if R[i, i] == 0:
            zero_devision_error(gram_schmidt._name_)
        Q[:, i] = np.divide(U[:, i], R[i, i])
        # we changed the inner loop to matrix operatins in oreder to improve running time
        R[i, i + 1:] = np.matmul(Q[:, i] , U[:, i + 1:])
        U[:, i + 1:] = U[:, i + 1:] - R[i, i + 1:] * np.transpose(np.tile(Q[:, i], (N - i - 1, 1)))
    return Q, R

When we run the functions on the matrix:

[[ 1.00000000e+00 -1.98592571e-02 -1.00365698e-04 -1.45204974e-03
  -9.95711793e-01 -1.77405377e-04 -7.68526195e-03]
 [-1.98592571e-02  1.00000000e+00 -1.77809186e-02 -1.55937174e-01
  -9.80881385e-03 -2.05317715e-02 -2.01456899e-01]
 [-1.00365698e-04 -1.77809186e-02  1.00000000e+00 -1.87979660e-01
  -5.12368040e-05 -8.35323206e-01 -4.59007949e-05]
 [-1.45204974e-03 -1.55937174e-01 -1.87979660e-01  1.00000000e+00
  -8.69848133e-04 -3.64095785e-01 -5.55408776e-04]
 [-9.95711793e-01 -9.80881385e-03 -5.12368040e-05 -8.69848133e-04
   1.00000000e+00 -9.54867422e-05 -5.92716161e-03]
 [-1.77405377e-04 -2.05317715e-02 -8.35323206e-01 -3.64095785e-01
  -9.54867422e-05  1.00000000e+00 -5.55505343e-05]
 [-7.68526195e-03 -2.01456899e-01 -4.59007949e-05 -5.55408776e-04
  -5.92716161e-03 -5.55505343e-05  1.00000000e+00]]

we get different these outputs:

for gram shmidt 1:

Q:

[[ 7.34036501e-01 -8.55006295e-04 -8.15634583e-03 -9.24967764e-02
  -4.91879501e-02 -4.90769704e-01  1.58268518e-01]
 [-2.78569770e-04  7.14001661e-01 -2.70586659e-03 -2.70735367e-02
   5.78840577e-01  2.37376069e-01  1.97835647e-02]
 [-2.48309244e-03 -2.34709092e-03  7.38351181e-01  2.63187853e-01
  -3.35473487e-01  3.38823696e-01  3.36320600e-01]
 [-4.27658449e-03 -2.12584453e-03 -6.70730760e-01  3.82666405e-01
  -3.44451231e-01  3.46085878e-01 -7.71559024e-01]
 [-6.53970073e-04 -7.00117873e-01 -2.68125144e-03 -2.31536583e-02
   5.94568750e-01  2.38329853e-01 -2.76969906e-01]
 [-9.26674350e-02 -5.07961588e-03 -6.97972068e-02 -8.79879575e-01
  -2.78679804e-01  2.78781202e-01  0.00000000e+00]
 [-6.72739327e-01  1.73894101e-04  2.25707383e-03  1.69052581e-02
  -1.26723666e-02 -5.77668322e-01 -4.35238424e-01]]

R:

[[ 1.36233007e+00  1.11436069e-03  1.04418015e-02  1.27072186e-02
   1.10993692e-03 -7.82681536e-02 -1.33081669e+00]
 [ 0.00000000e+00  1.40055740e+00  5.29057231e-04  1.44628716e-03
  -1.40014587e+00  3.57535802e-04  2.25417515e-03]
 [ 0.00000000e+00  0.00000000e+00  1.35440586e+00 -1.33059602e+00
   6.67148806e-04 -3.51561140e-02  2.23809829e-02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  2.81147599e-01
   1.33951520e-02 -9.55057795e-01  2.36910667e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   3.37143743e-02 -1.97436093e-01  7.90539705e-02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  3.40545951e-01 -1.75971454e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  3.50740324e-16]]

for gram shmidt 2:

Q:

    [[ 7.34036501e-01 -8.55006295e-04 -8.15634583e-03 -9.24967764e-02
  -4.91879501e-02 -4.90769704e-01  4.55677949e-01]
 [-2.78569770e-04  7.14001661e-01 -2.70586659e-03 -2.70735367e-02
   5.78840577e-01  2.37376069e-01 -1.89865812e-01]
 [-2.48309244e-03 -2.34709092e-03  7.38351181e-01  2.63187853e-01
  -3.35473487e-01  3.38823696e-01  9.49329061e-02]
 [-4.27658449e-03 -2.12584453e-03 -6.70730760e-01  3.82666405e-01
  -3.44451231e-01  3.46085878e-01 -4.36691368e-01]
 [-6.53970073e-04 -7.00117873e-01 -2.68125144e-03 -2.31536583e-02
   5.94568750e-01  2.38329853e-01 -1.13919487e-01]
 [-9.26674350e-02 -5.07961588e-03 -6.97972068e-02 -8.79879575e-01
  -2.78679804e-01  2.78781202e-01 -1.51892650e-01]
 [-6.72739327e-01  1.73894101e-04  2.25707383e-03  1.69052581e-02
  -1.26723666e-02 -5.77668322e-01 -7.21490087e-01]]

R:

[[ 1.36233007e+00  1.11436069e-03  1.04418015e-02  1.27072186e-02
   1.10993692e-03 -7.82681536e-02 -1.33081669e+00]
 [ 0.00000000e+00  1.40055740e+00  5.29057231e-04  1.44628716e-03
  -1.40014587e+00  3.57535802e-04  2.25417515e-03]
 [ 0.00000000e+00  0.00000000e+00  1.35440586e+00 -1.33059602e+00
   6.67148806e-04 -3.51561140e-02  2.23809829e-02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  2.81147599e-01
   1.33951520e-02 -9.55057795e-01  2.36910667e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   3.37143743e-02 -1.97436093e-01  7.90539705e-02]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  3.40545951e-01 -1.75971454e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  3.65463051e-16]]

Can you provide full code if it is not too large? Also some real data examples. We need some reproducible code and data to try our solutions and help you. — Arty, Apr 24 '21 at 12:16
Are you sure that your first code works? Your function `gram_schmidt2()` throws exception on your test data. — Arty, Apr 24 '21 at 12:46
It says that `U[:, j]` is out of bounds, because `j` ranges up to `N` and `N` is first dimension of input array, not second. `U` is equal in dimensions to `A`, meaning that `j` should be in first index like `U[j, :]`, not `U[:, j]`. — Arty, Apr 24 '21 at 12:47
Probably `A` should be square matrix in order for your `gram_schmidt2()` to work. — Arty, Apr 24 '21 at 12:52
you are totally right! we updated it with the right input now,which has the problem — ronihm, Apr 24 '21 at 13:36
Again you gave wrong example, on given input your function `gram_schimd2()` produces two matrices, not one as a result. And both of these two matrices are unequal to your `expected` matrix. Two matrices that I got [are here](https://cutt.ly/1v1scez). Can you run yourself on given data and provide us with correct input and expected output? — Arty, Apr 24 '21 at 13:53
Yes. I only provided the differences.I added the output of each algorithm and the difference is in the Q matrices — ronihm, Apr 24 '21 at 16:51

score 1 · Answer 1 · answered Apr 25 '21 at 02:59

The following piece of code does what you want, in a more efficient manner:

        Q_i = Q[:, i].reshape(1,-1)
        R[i,i+1:] = np.matmul(Q_i , U[:,i+1:])
        U[:,i+1:] -=  np.multiply(R[i,i+1:] , Q_i.T)

First line is just a convenience, to make code more readable.

Everything is the same to your original proposal, except of last line. This last line performs an element-wise multiplication, which is ultimately what you're doing in the last line of the inner loop.

About the differences in results:

Your code is ok, both do the same. As you're dealing with floating point numbers, you shouldn't test as A == B. Instead, I recommend you check how different both arrays are.

Particularly, running

Q1,R1 = gram_schmidt2(A)
Q2,R2 = gram_schmidt1(A)

(Q1 - Q2).mean()
(R1 - R2).mean()

gives, respectively:

-5.4997372770547595e-09 and -5.2465803662044656e-18

Which are quite close to 0 already. 1e-18 is below the error of dtype np.float64, so you good there.

You can check this if you run the difference 3*0.1 - 0.3 (about 1e-17)

Error for matrix Q is larger as it comes from a division between floats, which increases error if the matrix elements are small in magnitude (which is sometimes the case here).

About runtime:

I get similar run times when running both versions of your code:(243 µs ± 25.5 µs using loop, 241 µs ± 6.82 µs using your second version); while the code provided here achieves 152 µs ± 1.49 µs.

"Error for matrix Q is larger as it comes from a division between floats, which increases error if the matrix elements are small in magnitude (which is sometimes the case here)." what do you mean? we divide the same amount of times in both — ronihm, Apr 25 '21 at 08:58
Uh, sorry, didn't realize that, so that argument no longer applies. But still, the (mean) difference is quite small. You can better check your answer by multiplying Q*R, and comparing to A. That should give you a better sense of whether the answers you get are wright. — Andrés Marulanda, Apr 26 '21 at 18:36
The deffinition of Q_i is just a convenience, as it'll be used in the next two lines. .reshape(1,-1) is necessary because we need to convert this slice (Q[:,i]) into matrix form, so it can be transposed, and broadcasted in the way we want by numpy. — Andrés Marulanda, Apr 26 '21 at 18:38

score 0 · Answer 2 · answered Apr 24 '21 at 17:17

I can suggest you using Numba, it is a great speed optimizer, it can boost many Python programs 50-200x times by JIT-compiling it into C++ and machine code.

To install numba just do one time python -m pip install numba.

Below is the code adopting your algorithm to numba, mostly it is just a @numba.njit decorator before first line of function.

In numba code you can just write regular Python loops and any mathematical computation, even without using Numpy and your final code will be blazingly fast, most of times even faster then any Numpy code.

I used your gram_schmidt2() function as a basis and only replace np.multiply() with np.dot() because Numba seems that implemented only np.dot() functionality.

Try it online!

import numpy as np, numba

@numba.njit(cache = True, fastmath = True, parallel = True)
def gram_schmidt2(A):
    """
    decomposes a matrix A ∈ R into a product A = QR of an
    orthogonal matrix Q (i.e. QTQ = I) and an upper triangular matrix R (i.e. entries below
    the main diagonal are zero)

    :return: Q,R
    """
    N = np.shape(A)[0]
    U = A.copy()
    Q = np.zeros((N, N), dtype=np.float64)
    R = np.zeros((N, N), dtype=np.float64)
    for i in range(N):
        R[i, i] = np.linalg.norm(U[:, i])
        # Handling devision by zero by exiting the program as was advised in the forum
        if R[i, i] == 0:
            assert False #zero_devision_error(gram_schmidt._name_)
        Q[:, i] = np.divide(U[:, i], R[i, i])
        # we changed the inner loop to matrix operatins in oreder to improve running time
        for j in range(i+1, N):
            R[i, j] = np.dot(Q[:, i].transpose(), U[:, j])
            u = U[:, j] - R[i, j] * Q[:, i]
            U[:, j] = u
    return Q, R
    
a = np.array(
    [[ 1.00000000e+00, -1.98592571e-02, -1.00365698e-04, -1.45204974e-03,
      -9.95711793e-01, -1.77405377e-04, -7.68526195e-03],
     [-1.98592571e-02,  1.00000000e+00, -1.77809186e-02, -1.55937174e-01,
      -9.80881385e-03, -2.05317715e-02, -2.01456899e-01],
     [-1.00365698e-04, -1.77809186e-02,  1.00000000e+00, -1.87979660e-01,
      -5.12368040e-05, -8.35323206e-01, -4.59007949e-05],
     [-1.45204974e-03, -1.55937174e-01, -1.87979660e-01,  1.00000000e+00,
      -8.69848133e-04, -3.64095785e-01, -5.55408776e-04],
     [-9.95711793e-01, -9.80881385e-03, -5.12368040e-05, -8.69848133e-04,
       1.00000000e+00, -9.54867422e-05, -5.92716161e-03],
     [-1.77405377e-04, -2.05317715e-02, -8.35323206e-01, -3.64095785e-01,
      -9.54867422e-05,  1.00000000e+00, -5.55505343e-05],
     [-7.68526195e-03, -2.01456899e-01, -4.59007949e-05, -5.55408776e-04,
      -5.92716161e-03, -5.55505343e-05,  1.00000000e+00]]
, dtype = np.float64)

print(gram_schmidt2(a))

Output:

(array([[ 7.08543467e-01, -5.53704898e-03, -2.70026740e-04,
        -3.47742384e-03,  1.84840892e-01, -5.24814365e-01,
        -4.33966083e-01],
       [-1.40711469e-02,  9.68398634e-01, -2.12833250e-02,
         1.19174521e-01, -1.98433167e-01, -3.04695775e-02,
        -8.39439437e-02],
       [-7.11134597e-05, -1.72252300e-02,  7.59699130e-01,
        -1.47406821e-01, -1.01157914e-01,  3.77137817e-01,
        -4.98362473e-01],
       [-1.02884036e-03, -1.51071666e-01, -1.41567550e-01,
         9.02766638e-01, -8.55711320e-02,  2.12039165e-01,
        -2.99775521e-01],
       [-7.05505086e-01, -2.31427937e-02,  3.84334272e-04,
        -6.68149305e-03,  1.96907249e-01, -5.24473268e-01,
        -4.33402818e-01],
       [-1.25699421e-04, -1.98909561e-02, -6.34318769e-01,
        -3.82156774e-01, -9.76029595e-02,  4.04531367e-01,
        -5.27283410e-01],
       [-5.44534215e-03, -1.95250685e-01,  1.53606576e-03,
        -5.45941927e-02, -9.27687435e-01, -3.12618155e-01,
        -2.30333938e-02]]),
array([[ 1.41134602e+00, -1.99608442e-02,  4.42769473e-04,
         8.12375351e-04, -1.41083897e+00,  5.39174765e-04,
        -3.87373035e-03],
       [ 0.00000000e+00,  1.03234256e+00,  1.05802339e-02,
        -2.91464191e-01, -2.58368570e-02,  2.96333339e-02,
        -3.90075744e-01],
       [ 0.00000000e+00,  0.00000000e+00,  1.31655051e+00,
        -5.01046784e-02,  9.97649491e-04, -1.21693202e+00,
         5.90252943e-03],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.05107524e+00, -4.80557952e-03, -5.90160540e-01,
        -7.90098043e-02],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  2.03928769e-02,  2.21268065e-02,
        -8.90241765e-01],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  1.30829767e-02,
        -2.99495426e-01],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         9.31764881e-10]]))

reducing loops with numpy

2 Answers2

About the differences in results:

About runtime: