I want to vectorize the fortran below with SIMD directives
!DIR$ SIMD
DO IELEM = 1 , NELEM
X(IKLE(IELEM)) = X(IKLE(IELEM)) + W(IELEM)
ENDDO
And I used the instruction avx2. The program is compiled by
ifort main_vec.f -simd -g -pg -O2 -vec-report6 -o vec.out -xcore-avx2 -align array32byte
Then I'd like to add VECTORLENGTH(n)
clause after SIMD
.
If there's no such a clause or n = 2, 4, the information doesn't give information about the unroll factor
if n = 8, 16, vectorization support: unroll factor set to 2
.
I've read Intel's article about vectorization support: unroll factor set to xxxx So I guess the loop is unrolled to something like:
DO IELEM = 1 , NELEM, 2
X(IKLE(IELEM)) = X(IKLE(IELEM)) + W(IELEM)
X(IKLE(IELEM+1)) = X(IKLE(IELEM+1)) + W(IELEM+1)
ENDDO
Then 2 X go into a vector register, 2 W go to another, do the addition. But how does the value of VECTORLENGTH work? Or maybe I don't really understand what does the vector length mean.
And since I use the avx2 instruction, for the DOUBLE PRECISION
type X
, what's the maximum length could be reach?
Here's part of the assembly of the loop with SSE2, VL=8 and the compiler told me that unroll factor is 2. However it used 4 registers instead of 2.
.loc 1 114 is_stmt 1
movslq main_vec_$IKLE.0.1(,%rdx,4), %rsi #114.9
..LN202:
movslq 4+main_vec_$IKLE.0.1(,%rdx,4), %rdi #114.9
..LN203:
movslq 8+main_vec_$IKLE.0.1(,%rdx,4), %r8 #114.9
..LN204:
movslq 12+main_vec_$IKLE.0.1(,%rdx,4), %r9 #114.9
..LN205:
movsd -8+main_vec_$X.0.1(,%rsi,8), %xmm0 #114.26
..LN206:
movslq 16+main_vec_$IKLE.0.1(,%rdx,4), %r10 #114.9
..LN207:
movhpd -8+main_vec_$X.0.1(,%rdi,8), %xmm0 #114.26
..LN208:
movslq 20+main_vec_$IKLE.0.1(,%rdx,4), %r11 #114.9
..LN209:
movsd -8+main_vec_$X.0.1(,%r8,8), %xmm1 #114.26
..LN210:
movslq 24+main_vec_$IKLE.0.1(,%rdx,4), %r14 #114.9
..LN211:
addpd main_vec_$W.0.1(,%rdx,8), %xmm0 #114.9
..LN212:
movhpd -8+main_vec_$X.0.1(,%r9,8), %xmm1 #114.26
..LN213:
..LN214:
movslq 28+main_vec_$IKLE.0.1(,%rdx,4), %r15 #114.9
..LN215:
movsd -8+main_vec_$X.0.1(,%r10,8), %xmm2 #114.26
..LN216:
addpd 16+main_vec_$W.0.1(,%rdx,8), %xmm1 #114.9
..LN217:
movhpd -8+main_vec_$X.0.1(,%r11,8), %xmm2 #114.26
..LN218:
..LN219:
movsd -8+main_vec_$X.0.1(,%r14,8), %xmm3 #114.26
..LN220:
addpd 32+main_vec_$W.0.1(,%rdx,8), %xmm2 #114.9
..LN221:
movhpd -8+main_vec_$X.0.1(,%r15,8), %xmm3 #114.26
..LN222:
..LN223:
addpd 48+main_vec_$W.0.1(,%rdx,8), %xmm3 #114.9
..LN224:
movsd %xmm0, -8+main_vec_$X.0.1(,%rsi,8) #114.9
..LN225:
.loc 1 113 is_stmt 1
addq $8, %rdx #113.7
..LN226:
.loc 1 114 is_stmt 1
psrldq $8, %xmm0 #114.9
..LN227:
.loc 1 113 is_stmt 1
cmpq $26000, %rdx #113.7
..LN228:
.loc 1 114 is_stmt 1
movsd %xmm0, -8+main_vec_$X.0.1(,%rdi,8) #114.9
..LN229:
movsd %xmm1, -8+main_vec_$X.0.1(,%r8,8) #114.9
..LN230:
psrldq $8, %xmm1 #114.9
..LN231:
movsd %xmm1, -8+main_vec_$X.0.1(,%r9,8) #114.9
..LN232:
movsd %xmm2, -8+main_vec_$X.0.1(,%r10,8) #114.9
..LN233:
psrldq $8, %xmm2 #114.9
..LN234:
movsd %xmm2, -8+main_vec_$X.0.1(,%r11,8) #114.9
..LN235:
movsd %xmm3, -8+main_vec_$X.0.1(,%r14,8) #114.9
..LN236:
psrldq $8, %xmm3 #114.9
..LN237:
movsd %xmm3, -8+main_vec_$X.0.1(,%r15,8) #114.9
..LN238: