4

This has been puzzling me for a few weeks, and no one has been able to give some kind of answer. Take it as a Christmas treat, and please post any explanation you might have...

I was happy to show my students that initializing an matrix (in C) was slower when done "column wise".

#define NB_TESTS 100
#define SIZE 512
int M[SIZE][SIZE];
for (int n=0; n<NB_TESTS; n++) {
    for (int i=0; i<SIZE; i++) {
        for (int j=0; j<SIZE; j++) {
            M[j][i] = 0;
        }
    }
}

is about 3 times slower than the usual version where you initialize M[i][j] in the loop.

So far so good...

Here is the first puzzling observation. If I replace

 #define SIZE 512

by

#define SIZE 513

or

#define SIZE 511

the difference goes away.

The second, more puzzling, puzzling observation is that in this case, it is not the usual initialisation that is less efficient. I would have filled that in the "powers of 2 are great". The slow reversed version of the initialisation goes as fast as the normal one!

What's your take on that?

Notes:

  • I am running debian,

  • the same thing happens with gcc (6.3) and clang (3.8)

  • allocating in the heap or the stack doesn't seem to make a difference

  • optimization options for gcc / clang do not make a difference (besides making both versions faster)

  • apparently, the assembly generated doesn't do anything strange (says a colleague)

  • on bigger sizes, the normal initialisation is faster. The strange fact that the reverse initialisation is slower (by 25% for SIZE of 4096) on powers of two than on other integers remains!

Here is the full code I used while testing most of those things:

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>


int duration(struct timeval start, struct timeval stop) {
    return (stop.tv_sec - start.tv_sec) * 1000 + (stop.tv_usec - start.tv_usec) / 1000;
}

int *zero_square_matrix(int size, int *ms, int nb_init) {
    struct timeval start, stop;
    int *M = malloc(size*size*sizeof(int));

    gettimeofday(&start, NULL);
    int i,j;
    for(int k=0; k<nb_init; k++) {
        for (i=0; i<size; i++) {
            for (j=0; j<size; j++) {
                M[i*size + j] = 0;
            }
        }
    }
    gettimeofday(&stop, NULL);

    *ms = duration(start, stop);
    return M;

}

int *zero_square_matrix_rev(int size, int *ms, int nb_init) {
    struct timeval start, stop;
    int *M = malloc(size*size*sizeof(int));

    gettimeofday(&start, NULL);
    int i,j;
    for(int k=0; k<nb_init; k++) {
        for (i=0; i<size; i++) {
            for (j=0; j<size; j++) {
                M[j*size + i] = 0;
            }
        }
    }
    gettimeofday(&stop, NULL);

    *ms = duration(start, stop);
    return M;

}

int main(int argc, char **argv) {
    int nb_tests;
    int size;
    if (argc == 1 || argc > 3) {
        printf("usage %s SIZE [NB_TESTS]\n", argv[0]);
        return 1;
    }
    if (argc == 2) {
        size=atoi(argv[1]);
        nb_tests = 100;
    } else if (argc == 3) {
        size=atoi(argv[1]);
        nb_tests = atoi(argv[2]);
    }

    int *M;
    int ms1,ms2;
    printf("SIZE      \t\t\t  NORMAL  \t\t\t  REVERSE\n");
    for (int i=size-3; i<size+3; i+=1) {
        M = zero_square_matrix(i, &ms1, nb_tests);
        M = zero_square_matrix_rev(i, &ms2, nb_tests);
        printf("%dx%d  \t\t\t  %d  \t\t\t  %d\n", i, i, ms1, ms2);
        free(M);
    }
    return 0;
}
Pierre H.
  • 41
  • 1

0 Answers0