I'm running a very simple MKL BLAS matrix-matrix and matrix-vector multiplication on a computer with two AMD EPYC 7443 24-Core Processors and 1007GB RAM.
The code, the compile line, and the test results are given below.
MKL apparently multithreads only the mat-mat operation and not the mat-vec: as the output below shows, the ZGEMM time drops as the thread count grows, while the ZGEMV time stays essentially constant.
How can I make the mat-vec operation multithreaded? What am I doing wrong?
Here's the code:
!> Benchmark driver for MKL's BLAS95 generic interfaces.
!>
!> Calls gemm (C = A*B) and then gemv (y = A*x) on double-precision complex
!> data, once for each thread count 1, 2, 4, ..., 2**max_log2_threads.
!> The program prints nothing itself; per-call timings come from MKL's
!> verbose mode (e.g. run with the environment variable MKL_VERBOSE=1).
program main
  use blas95, only: gemm, gemv
  implicit none

  !> Working precision: double precision, matching the BLAS95 z-routines.
  integer, parameter :: lp = kind(1.0d0)
  !> Thread counts tested are 2**0 .. 2**max_log2_threads.
  integer, parameter :: max_log2_threads = 5

  integer :: m, n, i, prev_threads
  ! mkl_set_num_threads_local is an integer FUNCTION (it returns the previous
  ! thread-local setting), so it must be referenced as one, not via CALL.
  integer, external :: mkl_set_num_threads_local
  external :: mkl_set_dynamic
  complex(kind=lp), dimension(:), allocatable :: x, y
  complex(kind=lp), dimension(:,:), allocatable :: A, B, C

  m = 2**12
  n = 2**12

  allocate(A(m,n))
  ! B is n-by-n so that A(m,n)*B(n,n) = C(m,n) stays conformable even if
  ! m and n are later chosen to differ.
  allocate(B(n,n), C(m,n))
  allocate(x(n), y(m))

  ! Define every input operand before use: reading undefined storage is
  ! non-conforming, and stray NaN/denormal bit patterns can skew timings.
  ! C and y are pure outputs here (the BLAS95 default is beta = 0), so they
  ! do not need to be defined on entry.
  A = (1.0_lp, 0.0_lp)
  B = (1.0_lp, 0.0_lp)
  x = (1.0_lp, 0.0_lp)

  ! Disable dynamic thread adjustment once, so MKL uses exactly the
  ! requested number of threads; the setting does not change per iteration.
  call mkl_set_dynamic(0)

  ! Matrix-matrix product for each thread count.
  do i = 0, max_log2_threads
    prev_threads = mkl_set_num_threads_local(2**i)
    call gemm(A, B, C)
  end do

  ! Matrix-vector product for each thread count.
  do i = 0, max_log2_threads
    prev_threads = mkl_set_num_threads_local(2**i)
    call gemv(A, x, y)
  end do
end program main
Here's my compile line:
gfortran -Ofast -I$MKLROOT/include -I$BLASROOT/include/intel64/lp64 main.F90 -L$MKLROOT/lib/intel64 -o main -lgomp -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core $BLASROOT/lib/intel64/libmkl_blas95_lp64.a
Here's the output:
MKL_VERBOSE oneMKL 2022.0 Product build 20211112 for Intel(R) 64 architecture Intel(R) Architecture processors, Lnx 1.79GHz lp64 gnu_thread
MKL_VERBOSE ZGEMM(N,N,4096,4096,4096,0x7fff21099cf0,0x154a1f17b010,4096,0x154a0f17a010,4096,0x7fff21099ce0,0x1549ff179010,4096) 10.94s CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:1
MKL_VERBOSE ZGEMM(N,N,4096,4096,4096,0x7fff21099cf0,0x154a1f17b010,4096,0x154a0f17a010,4096,0x7fff21099ce0,0x1549ff179010,4096) 5.90s CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:2
MKL_VERBOSE ZGEMM(N,N,4096,4096,4096,0x7fff21099cf0,0x154a1f17b010,4096,0x154a0f17a010,4096,0x7fff21099ce0,0x1549ff179010,4096) 3.76s CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:4
MKL_VERBOSE ZGEMM(N,N,4096,4096,4096,0x7fff21099cf0,0x154a1f17b010,4096,0x154a0f17a010,4096,0x7fff21099ce0,0x1549ff179010,4096) 1.59s CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:8
MKL_VERBOSE ZGEMM(N,N,4096,4096,4096,0x7fff21099cf0,0x154a1f17b010,4096,0x154a0f17a010,4096,0x7fff21099ce0,0x1549ff179010,4096) 925.07ms CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:16
MKL_VERBOSE ZGEMM(N,N,4096,4096,4096,0x7fff21099cf0,0x154a1f17b010,4096,0x154a0f17a010,4096,0x7fff21099ce0,0x1549ff179010,4096) 606.32ms CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:32
MKL_VERBOSE ZGEMV(N,4096,4096,0x7fff21099d10,0x154a1f17b010,4096,0x1d59930,1,0x7fff21099d00,0x1d69940,1) 12.23ms CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:1
MKL_VERBOSE ZGEMV(N,4096,4096,0x7fff21099d10,0x154a1f17b010,4096,0x1d59930,1,0x7fff21099d00,0x1d69940,1) 11.68ms CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:2
MKL_VERBOSE ZGEMV(N,4096,4096,0x7fff21099d10,0x154a1f17b010,4096,0x1d59930,1,0x7fff21099d00,0x1d69940,1) 11.66ms CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:4
MKL_VERBOSE ZGEMV(N,4096,4096,0x7fff21099d10,0x154a1f17b010,4096,0x1d59930,1,0x7fff21099d00,0x1d69940,1) 11.62ms CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:8
MKL_VERBOSE ZGEMV(N,4096,4096,0x7fff21099d10,0x154a1f17b010,4096,0x1d59930,1,0x7fff21099d00,0x1d69940,1) 11.64ms CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:16
MKL_VERBOSE ZGEMV(N,4096,4096,0x7fff21099d10,0x154a1f17b010,4096,0x1d59930,1,0x7fff21099d00,0x1d69940,1) 11.60ms CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:32
And here's a test result of only the mat-vec but with a larger matrix and vector:
MKL_VERBOSE oneMKL 2022.0 Product build 20211112 for Intel(R) 64 architecture Intel(R) Architecture processors, Lnx 1.79GHz lp64 gnu_thread
MKL_VERBOSE ZGEMV(N,65536,65536,0x7fff04973380,0x14f20a01e010,65536,0x1502125d9010,1,0x7fff04973370,0x14d209f1b010,1) 4.89s CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:1
MKL_VERBOSE ZGEMV(N,65536,65536,0x7fff04973380,0x14f20a01e010,65536,0x1502125d9010,1,0x7fff04973370,0x14d209f1b010,1) 4.87s CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:2
MKL_VERBOSE ZGEMV(N,65536,65536,0x7fff04973380,0x14f20a01e010,65536,0x1502125d9010,1,0x7fff04973370,0x14d209f1b010,1) 4.90s CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:4
MKL_VERBOSE ZGEMV(N,65536,65536,0x7fff04973380,0x14f20a01e010,65536,0x1502125d9010,1,0x7fff04973370,0x14d209f1b010,1) 4.90s CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:8
MKL_VERBOSE ZGEMV(N,65536,65536,0x7fff04973380,0x14f20a01e010,65536,0x1502125d9010,1,0x7fff04973370,0x14d209f1b010,1) 4.90s CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:16
MKL_VERBOSE ZGEMV(N,65536,65536,0x7fff04973380,0x14f20a01e010,65536,0x1502125d9010,1,0x7fff04973370,0x14d209f1b010,1) 4.90s CNR:OFF Dyn:0 FastMM:1 TID:0 NThr:32
Edit 1: Manually parallelizing the mat-vec with OpenMP yields the results below, showing that multithreading is profitable with 4 or more threads.
A(65536,65536)Time[s]= 4.71 NThr= 1
A(65536,65536)Time[s]= 6.46 NThr= 2
A(65536,65536)Time[s]= 3.15 NThr= 4
A(65536,65536)Time[s]= 1.71 NThr= 8
A(65536,65536)Time[s]= 1.18 NThr= 16
A(65536,65536)Time[s]= 1.21 NThr= 32