0

Here is the code, matrix multiplication in C, the matrix's fill is integer. Optimization for loop nest and loop unrolling. Run well till N is 7168. For N 8196, insufficient memory.If you run it, just screen, wait till get time result. This is an alarm from my computer (Asus A456UQ) or code is still wrong?

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include<time.h>
#include <sys/time.h>
#include<assert.h>

#define mat(m, col, row) \
m.data[row*m.cols + col]

#define N 8192
typedef struct
{
    int cols, rows;
    int *data;
}matrix_t;

void matrix_init(matrix_t *matrix, int cols, int rows)
{
    int i,j;
    matrix->cols = cols;
    matrix->rows = rows;
    matrix->data = malloc(sizeof(int)*cols*rows);

    if (!matrix->data)
    {
        fprintf(stderr, "Insufficient memory for a %dx%d matrix", cols, rows);
        exit(EXIT_FAILURE);
    }

    // 0 fill
    memset(matrix->data, 0, cols*rows);

}

void matrix_print(matrix_t *matrix)
{
    int i, n;

    n = matrix->cols * matrix->rows;

    printf("--M%dx%d--\n", matrix->rows, matrix->cols);
    for (i=0; i<n; i++)
    {
        printf("%d ", matrix->data[i]);

        if ((i+1)%matrix->cols == 0) printf("\n");
    }
}

void free_array(matrix_t *matrix) {
    free(matrix->data);
    matrix->data = NULL;
    matrix->cols = matrix->rows = 0;
}

void mul_matrix(matrix_t A, matrix_t B, matrix_t C){
    int ii, kk,ib,kb,j,i,k;
    int acc00, acc01, acc10,acc11;

    int size = A.cols;
    ib=16;
    kb=16;

    for (ii = 0; ii < size; ii += ib)
    {
        for (kk = 0; kk < size; kk += kb)
        {
            for (j=0; j < size; j += 2)
            {
                for(i = ii; i < ii + ib; i += 2 )
                {
                    if (kk == 0)
                        acc00 = acc01 = acc10 = acc11 = 0;
                    else
                    {
                        acc00 = mat(C,i,j);
                        acc01 = mat(C,i,(j+1));
                        acc10 = mat(C,(i+1),j);
                        acc11 = mat(C,(i+1),(j+1));
                    }
                }
                for (k = kk; k < kk + kb; k++)
                {
                    acc00 += mat(A,i,k) * mat(B,k,j);
                    acc01 += mat(A,i,k) * mat(B,k,j+1);
                    acc10 += mat(A,(i+1),k) * mat(B,k,j);
                    acc11 += mat(A,(i+1),k) * mat(B,k,(j+1));
                }

                mat(C,i,j) = acc00;
                mat(C,i,(j+1)) = acc01;
                mat(C,(i+1),j) = acc10;
                mat(C,(i+1),(j+1)) = acc11;

            }
        }
    }
}
/*
void mul_matrix(matrix_t A, matrix_t B, matrix_t C){
    int i,j,k;
    int t;
    int size = A.cols;


    for (int j=0; j<size; j++) {
       // iterates the j-th row of c
       for (int i=0; i<size; i++) {
           mat(C,i,j)=0;
       }

       // iterates the j-th row of b
       for (int k=0; k<size; k++) {
          t = mat(B,k,j);
          // iterates the j-th row of c
          // iterates the k-th row of a
          for (int i=0; i<size; i++) {
            mat(C,i,j) += mat(A,i,k) * t;
          }
       }
    }

 */

/*

    for (i = 0; i < size; i++){
        for (j = 0; j < size; j++) {
            mat(C,i,j) = 0;
            for (k = 0; k < size; k++)
                mat(C,i,j) += mat(A,i,k) * mat(B, k,j);
        }
    }
}
*/
void sum_matrix (matrix_t A, matrix_t B, matrix_t C){
    int i,j;
    int size = A.cols;

    for(i=0; i< size; i++) {
        for (j=0; j<size; j++)
            mat(C,i,j) = mat(A,i,j) + mat(B, i,j);
    }

}

void sub_matrix (matrix_t A, matrix_t B, matrix_t C){
    int i,j;
    int size = A.cols;

    for(i=0; i< size; i++) {
        for (j=0; j<size; j++)
            mat(C,i,j) = mat(A,i,j) - mat(B, i,j);
    }

}

int main(int argc, char **argv)
{
    matrix_t x;
    matrix_t y;
    matrix_t z;
    matrix_t m1,m2,m3,m4,m5,m6,m7;
    matrix_t a11,a12,a21,a22;
    matrix_t b11,b12,b21,b22;
    matrix_t aresult, bresult;
    matrix_t c11,c12,c21,c22;



    int i,j,k =0;

    int half = N/2;

    matrix_init(&x, N, N);
    matrix_init(&y, N, N);
    matrix_init(&z, N, N);
    //for(i = 0; i<N; i++)
      //  for(j = 0; j<N; j++)
         //   mat(z,i,j) =0;
    //matrix_print(&z);

    matrix_init(&a11, half, half);
    matrix_init(&a12, half, half);
    matrix_init(&a21, half, half);
    matrix_init(&a22, half, half);
    matrix_init(&b11, half, half);
    matrix_init(&b12, half, half);
    matrix_init(&b21, half, half);
    matrix_init(&b22, half, half);

    matrix_init(&aresult,half,half);
    matrix_init(&bresult,half,half);

    matrix_init(&m1, half, half);
    matrix_init(&m2, half, half);
    matrix_init(&m3, half, half);
    matrix_init(&m4, half, half);
    matrix_init(&m5, half, half);
    matrix_init(&m6, half, half);
    matrix_init(&m7, half, half);

    matrix_init(&c11, half, half);
    matrix_init(&c12, half, half);
    matrix_init(&c21, half, half);
    matrix_init(&c22, half, half);


    struct timeval  tv1, tv2;

    //srand((unsigned)time(NULL));


    for(i = 0; i<N; i++)
        for(j = 0; j<N; j++)
            mat(x,i,j) =1;// rand() % 10;

    for(i = 0; i<N; i++)
        for(j = 0; j<N; j++)
            mat(y,i,j) =1;// rand() % 10;




    gettimeofday(&tv1, NULL);


    //dividing the matrices in 4 sub-matrices:
    for(i=0; i<half; i++){
        for (j=0; j<half;j++){
            mat(a11,i,j)= mat(x,i,j);
            mat(a12,i,j)= mat(x,i,j+half);
            mat(a21,i,j)= mat(x,i+half,j);
            mat(a22,i,j)= mat(x,i+half,j+half);

            mat(b11,i,j)= mat(y,i,j);
            mat(b12,i,j)= mat(y,i,j+half);
            mat(b21,i,j)= mat(y,i+half, j);
            mat(b22,i,j)= mat(y, i+half, j+half);
        }
    }
    //matrix_print(&a11);
    //matrix_print(&a12);
    //matrix_print(&a21);
    //matrix_print(&a22);

    // Calculating m1 to m7:
    sum_matrix(a11,a22,aresult);
    sum_matrix(b11,b22,bresult);
    mul_matrix(aresult,bresult,m1);

    sum_matrix(a21,a22,aresult);
    mul_matrix(aresult,b11,m2);

    sub_matrix(b12,b22,aresult);
    mul_matrix(a11,aresult,m3);

    sub_matrix(b21,b11, aresult);
    mul_matrix(a22,aresult,m4);

    sum_matrix(a11,a12,aresult);
    mul_matrix(aresult,b22,m5);

    sub_matrix(a21,a11,aresult);
    sum_matrix(b11,b12,bresult);
    mul_matrix(aresult,bresult,m6);

    sub_matrix(a12,a22,aresult);
    sum_matrix(b21,b22,bresult);
    mul_matrix(aresult,bresult,m7);

    sum_matrix(m1,m4,aresult);
    sub_matrix(aresult,m5,bresult);
    sum_matrix(bresult,m7,c11);

    sum_matrix(m3,m5,c12);
    sum_matrix(m2,m4,c21);
    sub_matrix(m1,m2, aresult);
    sum_matrix(aresult,m3,bresult);
    sum_matrix(bresult,m6,c22);

    //matrix_print(&c11);
    //matrix_print(&c12);
    //matrix_print(&c21);
    //matrix_print(&c22);

    //combine matrix

    for(i=0; i < half; i++){
        for (j=0; j<half;j++){
            mat(z,i,j)= mat(c11,i,j);
            mat(z,(i+half),j)= mat(c12,i,j);
            mat(z,i,(j+half))= mat(c21,i,j);
            mat(z,(i+half),(j+half))= mat(c22,i,j);
       }
    }









    gettimeofday(&tv2, NULL);

    //matrix_print(&x);
    //matrix_print(&y);
    //matrix_print(&z);
    printf ("\n \nTotal time = %f seconds\n",(double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + (double) (tv2.tv_sec - tv1.tv_sec));

    free_array(&x);
    free_array(&y);
    free_array(&z);

    free_array(&m1);
    free_array(&m2);
    free_array(&m3);
    free_array(&m4);
    free_array(&m5);
    free_array(&m6);
    free_array(&m7);

    free_array(&a11);
    free_array(&a12);
    free_array(&a21);
    free_array(&a22);
    free_array(&b11);
    free_array(&b12);
    free_array(&b21);
    free_array(&b22);

    free_array(&aresult);
    free_array(&bresult);

    free_array(&c11);
    free_array(&c12);
    free_array(&c21);
    free_array(&c22);
    return 0;
}
Vasilis G.
  • 7,556
  • 4
  • 19
  • 29
afni
  • 75
  • 2
  • 12
  • I get this output: `Insufficient memory for a 4096x4096`. It's pretty explicit, where is the problem? – Jabberwocky Nov 16 '17 at 14:20
  • 1
    You appear to have 24 matrices, if I count correctly. If each of those is 4K x 4K of `int`, then each one is about 64 MiB. That’s about 1.5 GiB in total. Do you have that much memory around? Desktop machines typically won’t have much problem with that. Embedded systems may have more difficulty with that much space. – Jonathan Leffler Nov 16 '17 at 14:25
  • the code just for N is 2^x, can't run if it doesn't cause will divide the matrix for half submatrices. – afni Nov 16 '17 at 14:26
  • 2
    `memset(matrix->data, 0, cols*rows);` -->> `memset(matrix->data, 0, cols*rows* sizeof *matrix->data);` ... there you go ... – joop Nov 16 '17 at 14:34
  • There was briefly a comment in the question about N must be a power of 2. If that’s correct, you can’t test with N = 7168, can you? – Jonathan Leffler Nov 16 '17 at 14:34
  • If you’re doing `malloc` plus `memset` to zero, consider using `calloc` instead. – Jonathan Leffler Nov 16 '17 at 14:40
  • @afni your code appears to run fine, it's just running on my computer. There are simply a lot of computations and therefore it is just slow but not stuck. I'll put another comment when it has finished. – Jabberwocky Nov 16 '17 at 14:41
  • 1
    @afni OK, the program has finished after roughly 7 minutes with `N = 7168`. Put a `printf("i = %d\n", ii);` at the end of the `for (ii = 0; ii < size; ii += ib)` loop in `mul_matrix` and you'll be able to see how it progresses. – Jabberwocky Nov 16 '17 at 14:50
  • @afni It's still a bit unclear what your actual problem is. Are you wondering why you get the message `Insufficient memory for a 4096x4096`? – Jabberwocky Nov 16 '17 at 15:43
  • @Michael Walz, actually, I want a solution for insufficient memory if matrix size 4096x4096 can run in my computer. that's it. – afni Nov 18 '17 at 04:57
  • @afni quick answer: buy more memory, it's cheap nowadays. And/or compile your program as 64 bit program (this is only possible if your operation system is 64 bit). – Jabberwocky Nov 19 '17 at 13:04

0 Answers0