For fast XORing two regions of memory, I wrote a function(region_xor_avx()
) with AVX instructions optimized. However, the program met a core dump error at _mm256_xor_si256()
. Here is a short self-contained example:
#include <stdlib.h>
#include <stdio.h>
#include <immintrin.h>
int region_xor_avx(void *dst, void *src, int len){
int k;
int len256 = len/32;
__m256i *_buf1 = (__m256i *)src;
__m256i *_buf2 = (__m256i *)dst;
for(k = 0; k < len256; ++k){
_buf2[k] = _mm256_xor_si256(_buf1[k], _buf2[k]);
}
return 1;
}
int main(){
int i;
int arr1[8] = {1, 2, 3, 4, 5, 6, 7, 8};
int arr2[8] = {0, 1, 2, 3, 4, 5, 6, 7};
int arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
int *psrc;
int *pdes1, *pdes2;
psrc = arr1;
pdes1 = arr2;
pdes2 = arr3;
for(i = 0; i < 8; ++i){
pdes1[i] = pdes1[i]^psrc[i];
}
region_xor_avx(pdes2, psrc, 8*sizeof(int));
if(memcmp(pdes1, pdes2, 8*sizeof(int)) == 0){
printf("equal!\n");
}else{
printf("Not equal!\n");
}
return 1;
}
My CPU is Intel(R) Core(TM) i7-4770K supporting AVX instructions.
My compiler is gcc (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1
and the compiler option is -g -mavx2