I am trying to solve a lot of linear equations as fast as possible. To find out the fastest way I benchmarked NumPy and PyTorch, each on the CPU and on my GeForce 1080 GPU (using Numba for NumPy). The results really confused me.
This is the code I used with Python 3.8:
import timeit
import torch
import numpy
from numba import njit
def solve_numpy_cpu(dim: int = 5):
a = numpy.random.rand(dim, dim)
b = numpy.random.rand(dim)
for _ in range(1000):
numpy.linalg.solve(a, b)
def solve_numpy_njit_a(dim: int = 5):
njit(solve_numpy_cpu, dim=dim)
@njit
def solve_numpy_njit_b(dim: int = 5):
a = numpy.random.rand(dim, dim)
b = numpy.random.rand(dim)
for _ in range(1000):
numpy.linalg.solve(a, b)
def solve_torch_cpu(dim: int = 5):
a = torch.rand(dim, dim)
b = torch.rand(dim, 1)
for _ in range(1000):
torch.solve(b, a)
def solve_torch_gpu(dim: int = 5):
torch.set_default_tensor_type("torch.cuda.FloatTensor")
solve_torch_cpu(dim=dim)
def main():
for f in (solve_numpy_cpu, solve_torch_cpu, solve_torch_gpu, solve_numpy_njit_a, solve_numpy_njit_b):
time = timeit.timeit(f, number=1)
print(f"{f.__name__:<20s}: {time:f}")
if __name__ == "__main__":
main()
And these are the results:
solve_numpy_cpu : 0.007275
solve_torch_cpu : 0.012244
solve_torch_gpu : 5.239126
solve_numpy_njit_a : 0.000158
solve_numpy_njit_b : 1.273660
The slowest is CUDA accelerated PyTorch. I verified that PyTorch is using my GPU with
import torch
torch.cuda.is_available()
torch.cuda.get_device_name(0)
returning
True
'GeForce GTX 1080'
I can get behind that, on the CPU, PyTorch is slower than NumPy. What I cannot understand is why PyTorch on the GPU is so much slower. Not that important but actually even more confusing is that Numba's njit
decorator makes performance orders of magnitude slower, until you don't use the @ decorator syntax anymore.
Is it my setup? Occasionally I get a weird message about the windows page / swap file not being big enough. In case I've taken a completely obscure path to solving linear equations on the GPU, I'd be happy to be directed into another direction.
Edit
So, I focussed on Numba and changed my benchmarking a bit. As suggested by @max9111 I rewrote the functions to receive input and produce output because, in the end, that's what anyone would want to use them for. Now, I also perform a first compile run for the Numba accelerated function so the subsequent timing is fairer. Finally, I checked the performance against matrix size and plotted the results.
TL/DR: Up to matrix sizes of 500x500, Numba acceleration doesn't really make a difference for numpy.linalg.solve
.
Here is the code:
import time
from typing import Tuple
import numpy
from matplotlib import pyplot
from numba import jit
@jit(nopython=True)
def solve_numpy_njit(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray:
parameters = numpy.linalg.solve(a, b)
return parameters
def solve_numpy(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray:
parameters = numpy.linalg.solve(a, b)
return parameters
def get_data(dim: int) -> Tuple[numpy.ndarray, numpy.ndarray]:
a = numpy.random.random((dim, dim))
b = numpy.random.random(dim)
return a, b
def main():
a, b = get_data(10)
# compile numba function
p = solve_numpy_njit(a, b)
matrix_size = [(x + 1) * 10 for x in range(50)]
non_accelerated = []
accelerated = []
results = non_accelerated, accelerated
for j, each_matrix_size in enumerate(matrix_size):
for m, f in enumerate((solve_numpy, solve_numpy_njit)):
average_time = -1.
for k in range(5):
time_start = time.time()
for i in range(100):
a, b = get_data(each_matrix_size)
p = f(a, b)
d_t = time.time() - time_start
print(f"{each_matrix_size:d} {f.__name__:<30s}: {d_t:f}")
average_time = (average_time * k + d_t) / (k + 1)
results[m].append(average_time)
pyplot.plot(matrix_size, non_accelerated, label="not numba")
pyplot.plot(matrix_size, accelerated, label="numba")
pyplot.legend()
pyplot.show()
if __name__ == "__main__":
main()
And these are the results (runtime against matrix edge length):
Edit 2
Seeing that Numba doesn't make much of a difference in my case, I came back to benchmarking PyTorch. And indeed, it appears to be roughly 4x faster than Numpy without even using a CUDA device.
Here is the code I used:
import time
from typing import Tuple
import numpy
import torch
from matplotlib import pyplot
def solve_numpy(a: numpy.ndarray, b: numpy.ndarray) -> numpy.ndarray:
parameters = numpy.linalg.solve(a, b)
return parameters
def get_data(dim: int) -> Tuple[numpy.ndarray, numpy.ndarray]:
a = numpy.random.random((dim, dim))
b = numpy.random.random(dim)
return a, b
def get_data_torch(dim: int) -> Tuple[torch.tensor, torch.tensor]:
a = torch.rand(dim, dim)
b = torch.rand(dim, 1)
return a, b
def solve_torch(a: torch.tensor, b: torch.tensor) -> torch.tensor:
parameters, _ = torch.solve(b, a)
return parameters
def experiment_numpy(matrix_size: int, repetitions: int = 100):
for i in range(repetitions):
a, b = get_data(matrix_size)
p = solve_numpy(a, b)
def experiment_pytorch(matrix_size: int, repetitions: int = 100):
for i in range(repetitions):
a, b = get_data_torch(matrix_size)
p = solve_torch(a, b)
def main():
matrix_size = [x for x in range(5, 505, 5)]
experiments = experiment_numpy, experiment_pytorch
results = tuple([] for _ in experiments)
for i, each_experiment in enumerate(experiments):
for j, each_matrix_size in enumerate(matrix_size):
time_start = time.time()
each_experiment(each_matrix_size, repetitions=100)
d_t = time.time() - time_start
print(f"{each_matrix_size:d} {each_experiment.__name__:<30s}: {d_t:f}")
results[i].append(d_t)
for each_experiment, each_result in zip(experiments, results):
pyplot.plot(matrix_size, each_result, label=each_experiment.__name__)
pyplot.legend()
pyplot.show()
if __name__ == "__main__":
main()
And here's the result (runtime against matrix edge length):
So for now, I'll be sticking with torch.solve
. However, the original question remains:
How can I exploit my GPU to solve linear equations even faster?