Basically, I'm taking a course called parallel programming. However, I have no experience programming in C and am not too knowledgeable in computer architecture.
I know that cache is faster than memory... However, I have no idea how these concepts relate to C programming.
anyways my task is to make a fast version of matrix multiplication. From what I've googled online I should be using SIMD SSE, which is a interface that allows you to execute multiple operations at once and use loop unrolling.
However, when I try it, I keep getting a segmentaiton fault. I used printf() to find where it occurs, but I still don't understand why or what to do afterward.
#include <emmintrin.h>
const char* dgemm_desc = "Simple blocked dgemm.";
#if !defined(BLOCK_SIZE)
#define BLOCK_SIZE 41
#endif
#define min(a,b) (((a)<(b))?(a):(b))
void do_block_fast (int lda, int M, int N, int K, double* A, double* B, double* C)
{
static double a[BLOCK_SIZE*BLOCK_SIZE] __attribute__ ((aligned (16)));
static double temp[1] __attribute__ ((aligned (16)));
__m128d vec1;
__m128d vec2;
__m128d vec3;
__m128d vec4;
// make a local aligned copy of A's block
for( int j = 0; j < K; j++ )
for( int i = 0; i < M; i++ )
a[i+j*BLOCK_SIZE] = A[i+j*lda];
/* For each row i of A */
for (int i = 0; i < M; ++i)
/* For each column j of B */
for (int j = 0; j < N; ++j)
{
/* Compute C(i,j) */
double cij = C[i+j*lda];
for (int k = 0; k < K; k += 2){
printf("0");
vec1 = _mm_load_pd(&a[i+k*BLOCK_SIZE]);
printf("1");
vec2 = _mm_loadu_pd (&B[k+j*lda]);
printf("2");
vec3 = _mm_mul_pd(vec1, vec2);
printf("3");
_mm_storeu_pd(&temp[0], vec3);
printf("4"); // SEGMENTATION fault occurs right after 4 is printed
cij += temp[0];
printf("5");
}
printf("5");
C[i+j*lda] = cij;
}
}
The error occurs after printf("4"); However I'm unsure why. I've tried different (alligned (x))) versions of the temp[] array. I even tried replacing it with a regular variable. But the segmentation fault still happens. i Here's the main routine that calls do_block_fast()
/* This routine performs a dgemm operation
* * C := C + A * B
* * where A, B, and C are lda-by-lda matrices stored in column-major format.
* * On exit, A and B maintain their input values. */
void square_dgemm (int lda, double* A, double* B, double* C)
{
/* For each block-row of A */
for (int i = 0; i < lda; i += BLOCK_SIZE)
/* For each block-column of B */
for (int j = 0; j < lda; j += BLOCK_SIZE)
/* Accumulate block dgemms into block of C */
for (int k = 0; k < lda; k += BLOCK_SIZE)
{
/* Correct block dimensions if block "goes off edge of" the matrix */
int M = min (BLOCK_SIZE, lda-i);
int N = min (BLOCK_SIZE, lda-j);
int K = min (BLOCK_SIZE, lda-k);
/* Perform individual block dgemm */
if((M % BLOCK_SIZE == 0) && (N % BLOCK_SIZE == 0) && (K % BLOCK_SIZE == 0))
{
do_block_fast(lda, M, N, K, A + i + k*lda, B + k + j*lda, C + i + j*lda);
}else{
do_block(lda, M, N, K, A + i + k*lda, B + k + j*lda, C + i + j*lda);
}
}
}