I am trying to accelerate a simple MPI program with OpenMP. I use MPICH2 on a 4-core Intel processor. Here is the simple code:
int main(int argc, char** argv) {
int size, rank, provided;
const int root = 0;
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int cubeCount = StrToDouble(argv[1]);
int matrixSize = *StrToDouble(argv[2]);
WorkNode node(rank, size, cubeCount, matrixSize);
time_t t0 = time(0);
node.work();
time_t t1 = time(0);
time_t total = t1 - t0;
The class WorkNode is also very simple: it contains only an array of Cube objects and the method work.
// A block of doubles together with its element count; this is the payload
// that is passed to the MPI send/receive calls.
// NOTE(review): no destructor is declared here, so `matrix` is not released
// by this class - confirm whether the owner frees it elsewhere.
class Cube {
public:
    // Creates a cube whose `matrix` holds `matrixSize` doubles.
    Cube(int matrixSize);

    double *matrix;  // heap buffer of `matrixSize` doubles
    int matrixSize;  // number of elements in `matrix`
};
// Allocates storage for `matrixSize` doubles (their values are left
// uninitialized) and records the element count.
Cube::Cube(int matrixSize)
    : matrix(new double[matrixSize]),
      matrixSize(matrixSize) {
}
Finally, the method work:
// Ring exchange. At step i this node sends every cube to the node i places
// ahead of it and receives one message per cube from the node i places behind
// it; both indices wrap around processorCount. Step 0 addresses the node
// itself and therefore transfers nothing. Received data is discarded.
for (int i = 0; i < processorCount; i++) {
    int nodeToSend = this->id + i;
    int nodeRecv = this->id - i;
    if (nodeToSend >= processorCount) {
        nodeToSend -= processorCount;
    }
    if (nodeRecv < 0) {
        nodeRecv += processorCount;
    }
    // The cubes are split between two OpenMP threads, each making its own
    // MPI calls. The index is named `c` so it no longer shadows the outer `i`.
    #pragma omp parallel for num_threads(2)
    for (int c = 0; c < cubeCount; c++) {
        Cube *cube = cubes[c];
        if (nodeToSend != this->id) {
            MPI_Bsend(cube->matrix, cube->matrixSize, MPI_DOUBLE, nodeToSend, _MY_MPI_ANY_TAG, MPI_COMM_WORLD);
        }
        if (nodeRecv != this->id) {
            // The buffer is private to this iteration so the threads never
            // share it, and it is sized from the same count that is handed to
            // MPI_Recv so the receive cannot overrun it.
            double *inBuffer = new double[cube->matrixSize];
            // The status was never inspected, and a single MPI_Status shared
            // by both threads would be a data race; ask MPI not to fill it in.
            MPI_Recv(inBuffer, cube->matrixSize, MPI_DOUBLE, nodeRecv, _MY_MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            // Memory obtained with new[] must be released with delete[].
            delete[] inBuffer;
        }
    }
}
Unfortunately, OpenMP does not accelerate the program (even when the number of MPI processes is 2), and sometimes it even slows it down. Can I somehow accelerate the MPI calls?