
I'm currently writing a program in C using MPI to perform matrix multiplication in parallel. I'm very new to C and MPI, so the code is pretty rough. I can't seem to get it to work, so could someone read through it and help me understand what I need to do to fix it?

Here's the code:

#include <stdio.h>
#include <stdlib.h> 
#include <time.h>
#include <mpi.h>

// code adapted from source codes from
//  http://www.programiz.com/c-programming/c-multi-dimensional-arrays
//  http://www.cs.hofstra.edu/~cscccl/csc145/imul.c


// GENERAL VARIABLES
int **A, **B, **AB;
int i,j,k;
int rows_A, cols_A, rows_B, cols_B;
int dimensions[3];

// MATRIX MULTIPLICATION
void matrixMult(int start, int interval){
    for (i = start; i < start+interval; ++i){
        for (j = 0; j < cols_B; ++j){
            for (k = 0; k < cols_A; ++k)
                AB[i][j] += (A[i][k] * B[k][j]);}}}


int main(int argc, char *argv[]){
// MPI VARIABLES, INITIALIZE MPI
int rank, size, interval, remainder;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);

if (rank == 0){

    // READ AND WRITE MATRICES ------------------------------------
    FILE *matrix1, *matrix2;
    matrix1 = fopen("matrix1", "r");
    fscanf(matrix1, "%d", &rows_A);
    fscanf(matrix1, "%d", &cols_A);

    matrix2 = fopen("matrix2", "r");
    fscanf(matrix2, "%d", &rows_B);
    fscanf(matrix2, "%d", &cols_B);

    int dimensions[3] = {rows_A, cols_A, cols_B};

    /*printf("\n\nRows A = %d",rows_A);
    printf("\nCols A = %d",cols_A);
    printf("\n\nRows B = %d",rows_B);
    printf("\nCols B = %d",cols_B);*/

    // Allocate memory for matrices
    int **A = malloc(rows_A * sizeof(int*));
    // The cast to size_t prevents integer overflow with big matrices
    A[0] = malloc((size_t)rows_A * (size_t)cols_A * sizeof(int));
    for(i = 1; i < rows_A; i++)
            A[i] = A[0] + i*cols_A;

    int **B = malloc(rows_B * sizeof(int*));
    // The cast to size_t prevents integer overflow with big matrices
    B[0] = malloc((size_t)rows_B * (size_t)cols_B * sizeof(int));
    for(i = 1; i < rows_A; i++)
            B[i] = B[0] + i*cols_B;

    int **AB = malloc(rows_A * sizeof(int*));
    // The cast to size_t prevents integer overflow with big matrices
    AB[0] = malloc((size_t)rows_A * (size_t)cols_B * sizeof(int));
    for(i = 1; i < rows_A; i++)
            AB[i] = AB[0] + i*cols_B;


    /*int **A = (int **)malloc(rows_A * sizeof(int*));
    for(i = 0; i < rows_A; i++)
        A[i] = (int *)malloc(cols_A * sizeof(int));

    int **B = (int **)malloc(rows_B * sizeof(int*));
    for(i = 0; i < rows_B; i++)
        B[i] = (int *)malloc(cols_B * sizeof(int));

    int **AB = (int **)malloc(rows_A * sizeof(int*));
    for(i = 0; i < rows_B; i++)
        AB[i] = (int *)malloc(cols_B * sizeof(int));*/


    // Write matrices
    while(!feof(matrix1)){
        for(i=0;i<rows_A;i++){
            for(j=0;j<cols_A;j++)
                fscanf(matrix1,"%d",&A[i][j]);}}

    while(!feof(matrix2)){
        for(i=0;i<rows_B;i++){
            for(j=0;j<cols_B;j++)
                fscanf(matrix2,"%d",&B[i][j]);}}

    /*
    // Print Matrices
    printf("\n\n");
    //print matrix 1
    printf("Matrix A:\n");
    for(i=0;i<rows_A;i++){
        for(j=0;j<cols_A;j++)
            printf("%d\t",A[i][j]);
        printf("\n");}
    printf("\n");
    //print matrix 2
    printf("Matrix B:\n");
    for(i=0;i<rows_B;i++){
        for(j=0;j<cols_B;j++)
            printf("%d\t",B[i][j]);
        printf("\n");} */
    // ------------------------------------------------------------------




    // MULTIPLICATION (Parallelize here)

    printf("begin rank 0\n");

    interval = rows_A / size; // work per processor
    remainder = rows_A % size;

    // SEND B BROADCAST to all
    MPI_Bcast(B, rows_B * cols_B, MPI_INT, 0, MPI_COMM_WORLD);
    printf("1\n");
    // SEND A, ROWS, COLS, interval to each rank
    for(i=1;i<size;i++)
        MPI_Send(dimensions,3,MPI_INT,i,123,MPI_COMM_WORLD);
    printf("2\n");
    for(i=1;i<size;i++)
        MPI_Send(A[i*interval],interval*rows_A,MPI_INT,i,123,MPI_COMM_WORLD);
    printf("3\n");

    // ROOT MM
    matrixMult(0, interval);
    printf("3.5\n");
    matrixMult(size * interval, remainder);
    printf("4\n");

    // receive AB from workers, add to current AB
    for(i=1;i<size;i++)
        MPI_Recv(AB[i*interval],interval*rows_A,MPI_INT,i,123,MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    printf("5\n");





    // PRINT MATRIX PRODUCT
    printf("\nSum Of Matrix:\n");
    for(i = 0; i < rows_A; ++i){
        for(j = 0; j < cols_B; ++j){
            printf("%d\t",AB[i][j]);  
            if(j == cols_B - 1)/* To display matrix sum in order. */
                printf("\n");}}

    // CLOSE FILES
    fclose(matrix1);
    fclose(matrix2);



}

else{ // WORKER NODES
    printf("bring workers\n");
    // RECEIVE B BROADCAST
    MPI_Bcast(B, rows_B * cols_B, MPI_INT, 0, MPI_COMM_WORLD);
    printf("a\n");
    // RECEIVE A, INTERVAL
    MPI_Recv(dimensions,3,MPI_INT,0,123, MPI_COMM_WORLD,MPI_STATUS_IGNORE);
    printf("b\n");
    rows_A = dimensions[0];
    cols_A = dimensions[1];
    cols_B = dimensions[2];
    printf("c\n");
    MPI_Recv(A[rank*interval],interval*rows_A,MPI_INT,0,123, MPI_COMM_WORLD,MPI_STATUS_IGNORE);
    printf("d\n");

    // WORKER MM
    matrixMult(rank*interval, interval); 
    printf("e\n");

    // send AB to root
    MPI_Send(AB[rank*interval],interval*rows_A,MPI_INT,0,123,MPI_COMM_WORLD);
    printf("f\n");
}

// FINALIZE MPI
MPI_Finalize();  /* EXIT MPI */

}

I stuck in some prints to try to understand where my code was failing, and it looks like it gets to the actual matrix multiplication part in the workers and in the rank 0 root. Does that mean it's a problem with my receive? The input is a 2x3 matrix (1 2 3 4 5 6) and a 3x2 matrix (7 8 9 10 11 12). Here's what the output looks like:

hjiang1@cook:~/cs287/PMatrixMultiply$ make
mpicc parallelMatrixMult.c -std=c99 -lm -o parallelMatrix.out
hjiang1@cook:~/cs287/PMatrixMultiply$ mpirun --hostfile QuaCS parallelMatrix.out
No protocol specified
No protocol specified
bring workers
a
bring workers
a
bring workers
a
begin rank 0
1
2
b
c
b
c
b
c
3
d
e
d
3.5
[cook:06730] *** Process received signal ***
[cook:06730] Signal: Segmentation fault (11)
[cook:06730] Signal code: Address not mapped (1)
[cook:06730] Failing at address: 0xffffffffbbc4d600
[cook:06728] *** Process received signal ***
[cook:06728] Signal: Segmentation fault (11)
[cook:06728] Signal code: Address not mapped (1)
[cook:06728] Failing at address: 0x5d99f200
[cook:06727] *** Process received signal ***
[cook:06730] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0xfcb0)[0x7fdaa80eccb0]
[cook:06730] [ 1] [cook:06728] [ 0] /lib/x86_64-linux-gnu/libc.so.6(+0x147b55)[0x7fdaa7e65b55]
[cook:06730] [ 2] /usr/local/lib/openmpi/mca_btl_vader.so(+0x23f9)[0x7fda9e70f3f9]
[cook:06730] [ 3] /usr/local/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send_request_start_rndv+0x1d3)[0x7fda9e0df393]
[cook:06730] [ 4] /usr/local/lib/openmpi/mca_pml_ob1.so(mca_pml_ob1_send+0x754)[0x7fda9e0d5404]
[cook:06730] [ 5] /lib/x86_64-linux-gnu/libpthread.so.0(+0xfcb0)[0x7f910bef2cb0]
[cook:06728] [ 1] parallelMatrix.out[0x400bad]
[cook:06728] [ 2] parallelMatrix.out[0x401448]
[cook:06728] [ 3] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xed)[0x7f910bb4576d]
[cook:06728] [ 4] parallelMatrix.out[0x400a79]
[cook:06728] *** End of error message ***
/usr/local/lib/libmpi.so.1(PMPI_Send+0xf2)[0x7fdaa8368332]
[cook:06730] [ 6] parallelMatrix.out[0x401492]
[cook:06730] [ 7] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xed)[0x7fdaa7d3f76d]
[cook:06730] [ 8] parallelMatrix.out[0x400a79]
[cook:06730] *** End of error message ***
[cook:06727] Signal: Segmentation fault (11)
[cook:06727] Signal code: Address not mapped (1)
[cook:06727] Failing at address: (nil)
[cook:06727] [ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0xfcb0)[0x7f73e0d09cb0]
[cook:06727] [ 1] parallelMatrix.out[0x400bad]
[cook:06727] [ 2] [cook:6729] *** An error occurred in MPI_Recv
[cook:6729] *** reported by process [1864040449,2]
[cook:6729] *** on communicator MPI_COMM_WORLD
[cook:6729] *** MPI_ERR_COUNT: invalid count argument
[cook:6729] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[cook:6729] ***    and potentially your MPI job)

If anyone can help, that'd be greatly appreciated. Again, I'm new to C and MPI, so bear with me on how terrible my code is.

Sam Jiang

3 Answers


It is the same error I see repeated over and over again: when working with MPI, use flat arrays, i.e. allocate each matrix as a single contiguous block of memory instead of allocating each row separately. That is, instead of:

int **A = (int **)malloc(rows_A * sizeof(int*));
for(i = 0; i < rows_A; i++)
    A[i] = (int *)malloc(cols_A * sizeof(int));

you should use:

int **A = malloc(rows_A * sizeof(int*));
// The cast to size_t prevents integer overflow with big matrices
A[0] = malloc((size_t)rows_A * (size_t)cols_A * sizeof(int));
for(i = 1; i < rows_A; i++)
    A[i] = A[0] + i*cols_A;

Freeing such a matrix then looks like this:

free(A[0]);
free(A);

That said, there is another class of errors in your code:

MPI_Recv(A+(i*interval), ...);
MPI_Send(A+(i*interval), ...);

A is an array of pointers to each row. A+i is a pointer to the i-th element of that array. Therefore, you are passing MPI not the actual address of the row data in memory, but a pointer to a pointer to that data. The proper expression (given that you've allocated the memory in a single block as outlined earlier) is either:

MPI_Recv(A[i*interval], ...);

or

MPI_Recv(*(A + i*interval), ...);

In other words, array[index] is equivalent to *(array + index) and not to array + index.
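
Putting the two fixes together, a rough sketch of how the row-block transfer could look with the contiguous allocation (an illustration, not a drop-in replacement; note that the element count for a block of interval rows is interval*cols_A, not interval*rows_A as in the posted code, and every rank must have allocated A and AB the same way first):

// root: send `interval` rows of A, i.e. interval*cols_A ints, to each worker
for (i = 1; i < size; i++)
    MPI_Send(A[i*interval], interval*cols_A, MPI_INT, i, 123, MPI_COMM_WORLD);

// worker: receive its block into the same offset of its own copy of A
MPI_Recv(A[rank*interval], interval*cols_A, MPI_INT, 0, 123,
         MPI_COMM_WORLD, MPI_STATUS_IGNORE);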

Hristo Iliev
  • This probably is not the right venue, but I've been thinking about "make MPI easier" lately: if this way of allocating a 2D array is so common, shouldn't there be an easy way to specify it in MPI? Hindexed-block would describe the 'rows_A' arrays of size 'cols_B'. In 25 years I'm surprised there isn't yet an "MPI-Sugar" that could just do this for our poor users. – Rob Latham Apr 09 '15 at 17:53
  • One could easily construct a datatype that describes a **single** instance of such a matrix (e.g. by using absolute addresses in combination with `MPI_BOTTOM`), but it would be only usable for that particular single instance. The type model of MPI which consists of tuples of (_basic-type_, _offset_) does not have a notion of dereferencing pointers. – Hristo Iliev Apr 09 '15 at 18:17
  • Thank you for the quick response. I tried the fixes, but I still end up getting the same errors. Is there a logical mistake in my code? – Sam Jiang Apr 10 '15 at 01:52
  • Edit your question and replace the code with the fixed version. – Hristo Iliev Apr 10 '15 at 08:40

If you are familiar with gdb, remember that you can still use it to debug MPI programs:

mpirun -np 4 xterm -e gdb my_mpi_application

This will open 4 terminals, from which you can run gdb on each process.
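
If you can't open X terminals (e.g. on a remote cluster without X forwarding), a common alternative is to park a rank in a loop and attach gdb to its PID. A minimal sketch, where the variable name hold and the choice of rank are just for illustration:

#include <unistd.h>

// early in main(), after MPI_Init: print the PID and wait for a debugger
volatile int hold = (rank == 0);  // pick the rank you want to inspect
if (hold)
    printf("rank %d waiting for debugger, pid %d\n", rank, (int)getpid());
while (hold)
    sleep(1);  // attach with `gdb -p <pid>`, then `set var hold = 0` and `continue`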

cmaureir

It seems you allocate memory only on the root process:

if (rank == 0){

// READ AND WRITE MATRICES ------------------------------------
// Allocate memory for matrices
int **A = malloc(rows_A * sizeof(int*));
// The cast to size_t prevents integer overflow with big matrices
A[0] = malloc((size_t)rows_A * (size_t)cols_A * sizeof(int));
int **B = malloc(rows_B * sizeof(int*));
// The cast to size_t prevents integer overflow with big matrices
B[0] = malloc((size_t)rows_B * (size_t)cols_B * sizeof(int));
int **AB = malloc(rows_A * sizeof(int*));
// The cast to size_t prevents integer overflow with big matrices
AB[0] = malloc((size_t)rows_A * (size_t)cols_B * sizeof(int));

The first thing I can suggest is to separate reading the matrix dimensions from the allocation. After you read the sizes of the matrices, broadcast them and then allocate the matrices on all processes.

Also, by declaring int **A inside the rank == 0 branch, you are shadowing the declaration of A at the beginning of your code, so the global A that matrixMult uses is never assigned.

something like this:

if(rank == 0) {
  // Read rows_A, cols_A, rows_B, cols_B
  ....
}

MPI_Bcast(&rows_A, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&rows_B, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&cols_A, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&cols_B, 1, MPI_INT, 0, MPI_COMM_WORLD);
// allocate memory
....
if(rank == 0) {
   // read matrix
   ....
}
// broadcast matrices
....
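
For illustration, the outline filled in with the contiguous allocation from the accepted answer. This is a sketch under the assumption that rows_A, cols_A, rows_B, cols_B, A, B, and AB are the globals from the question; note the allocations assign the global pointers instead of declaring new locals, so nothing is shadowed, and AB is zero-initialized because matrixMult accumulates with +=:

if (rank == 0) {
    // read rows_A, cols_A, rows_B, cols_B from the input files here
}

// every rank learns the dimensions
MPI_Bcast(&rows_A, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&cols_A, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&rows_B, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&cols_B, 1, MPI_INT, 0, MPI_COMM_WORLD);

// every rank allocates -- no `int **` here, so the globals are assigned
A = malloc(rows_A * sizeof(int*));
A[0] = malloc((size_t)rows_A * (size_t)cols_A * sizeof(int));
for (i = 1; i < rows_A; i++)
    A[i] = A[0] + i*cols_A;

B = malloc(rows_B * sizeof(int*));
B[0] = malloc((size_t)rows_B * (size_t)cols_B * sizeof(int));
for (i = 1; i < rows_B; i++)
    B[i] = B[0] + i*cols_B;

AB = malloc(rows_A * sizeof(int*));
AB[0] = calloc((size_t)rows_A * (size_t)cols_B, sizeof(int)); // zeroed for +=
for (i = 1; i < rows_A; i++)
    AB[i] = AB[0] + i*cols_B;

if (rank == 0) {
    // read the matrix entries into A and B here
}

// broadcast the actual data: B[0] is the start of the contiguous block
MPI_Bcast(B[0], rows_B * cols_B, MPI_INT, 0, MPI_COMM_WORLD);
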
iskakoff