
First, I should mention that I am French and my English is not very good.

I am working on an MPI application, I have some problems, and I hope that somebody can help me.

As stated in the title of my post, I am trying to use a thread to listen for when I have to kill my application and then call the MPI_Finalize function.

However, my application does not finish correctly. More precisely, I obtain the following message:

[XPS-2720:27441] *** Process received signal ***
[XPS-2720:27441] Signal: Segmentation fault (11)
[XPS-2720:27441] Signal code: Address not mapped (1)
[XPS-2720:27441] Failing at address: 0x7f14077a3b6d
[XPS-2720:27440] *** Process received signal ***
[XPS-2720:27440] Signal: Segmentation fault (11)
[XPS-2720:27440] Signal code: Address not mapped (1)
[XPS-2720:27440] Failing at address: 0x7fb11d07bb6d

mpirun noticed that process rank 1 with PID 27440 on node lagniez-XPS-2720 exited on signal 11 (Segmentation fault).

My slave code is:

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <unistd.h>
#include <sys/types.h>
#include <pthread.h>
#include <cassert>


#define send_data_tag 1664
#define send_kill_tag 666

void *finilizeMPICom(void *intercomm)
{ 
  printf("the finilizeMPICom was called\n");
  
  MPI_Comm parentcomm = * ((MPI_Comm *) intercomm);
  MPI_Status status;
  int res;

  // sleep(10);
  MPI_Recv(&res, 1, MPI_INT, 0, send_kill_tag, parentcomm, &status);

  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  printf("we receive something %d -- %d\n", rank, res);
  
  MPI_Finalize();
  exit(0);
}// finilizeMPICom


int main( int argc, char *argv[])
{ 
  int  numtasks, rank, len, rc; 
  char hostname[MPI_MAX_PROCESSOR_NAME];

  int provided, claimed;
  rc = MPI_Init_thread(0, 0, MPI_THREAD_MULTIPLE, &provided);
  MPI_Query_thread( &claimed );
  
  if (rc != MPI_SUCCESS || provided != 3)
    {
      printf ("Error starting MPI program. Terminating.\n");
      MPI_Abort(MPI_COMM_WORLD, rc);
    }
  
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);

  MPI_Comm parentcomm;
  MPI_Comm_get_parent(&parentcomm);

  /* create a second thread to listen when we have to kill the program */
  pthread_t properlyKill;
  if(pthread_create(&properlyKill, NULL, finilizeMPICom, (void *) &parentcomm))
    {     
      fprintf(stderr, "Error creating thread\n");
      return 0;
    }
  
  assert(parentcomm != MPI_COMM_NULL);

  MPI_Status status;
  int root_process, ierr, num_rows_to_receive;

  int mode;
  MPI_Recv( &mode, 1, MPI_INT, 0, send_data_tag, parentcomm, &status);
  printf("c The solver works in the mode %d\n", mode);

  printf("I sent a message %d\n", rank);

  // if(rank != 1) sleep(100);
  
  int res = 1;
  MPI_Send(&res, 1, MPI_INT, 0, send_data_tag, parentcomm);  
  printf("we want to listen for somethiing %d\n", rank);  
  
  int rescc = 1;
  MPI_Recv(&rescc, 1, MPI_INT, 0, send_data_tag, parentcomm, &status);
  printf("I received the message %d %d\n", rescc, rank);
  
  if(rescc == 1000)
    {
      printf("~~~~~~~~>>> I print the solution %d\n", rank);
      int res3 = 1001;
      MPI_Send(&res3, 1, MPI_INT, 0, send_data_tag, parentcomm);
    }
  else printf("I do not understand %d\n", rank);

  printf("I wait the thread to kill the programm %d\n", rank);
  pthread_join(properlyKill, (void**)&(res));
  return 0;
}

For the master I have:

int main(int argc, char **argv)
{  
  Parser *p = new Parser("slave.xml");

  MPI_Init(&argc, &argv);
  if(p->method == "concurrent")
    {
      ConcurrentManager cc(p->instance, p->solvers);
      cc.run();
    }
  else
    {
      cerr << "c The only available methods are: concurrent, eps (Embarrassingly Parallel Search) or tree" << endl;
      exit(1);
    }

  delete(p);
  MPI_Finalize();
  exit(0);
}// main


/**
   Create a concurrent manager (i.e., initialize the data structures
   needed to run the solvers).
   
   @param[in] _instance, the benchmark path
   @param[in] _solvers, the set of solvers that will be run
 */
ConcurrentManager::ConcurrentManager(string _instance, vector<Solver> &_solvers) :
  instance(_instance), solvers(_solvers)
{
  cout << "c\nc Concurrent manager called" << endl;
  
  nbSolvers = _solvers.size();
  np = new int[nbSolvers];
  cmds = new char*[nbSolvers];
  arrayOfArgs = new char **[nbSolvers];
  infos = new MPI_Info[nbSolvers];

  for(int i = 0 ; i<nbSolvers ; i++)
    {
      np[i] = solvers[i].npernode;

      cmds[i] = new char[(solvers[i].executablePath).size() + 1];
      strcpy(cmds[i], (solvers[i].executablePath).c_str());      

      arrayOfArgs[i] = new char *[(solvers[i].options).size() + 1];
      for(unsigned int j = 0 ; j<(solvers[i].options).size() ; j++)
        {
          arrayOfArgs[i][j] = new char[(solvers[i].options[j]).size() + 1];
          strcpy(arrayOfArgs[i][j], (solvers[i].options[j]).c_str());          
        }
      arrayOfArgs[i][(solvers[i].options).size()] = NULL;

      MPI_Info_create(&infos[i]);

      char hostname[solvers[i].hostname.size()];
      strcpy(hostname, solvers[i].hostname.c_str());
      MPI_Info_set(infos[i], "host", hostname);
    }

  sizeComm = 0;
}// constructor


/**
   Wait until at least one process finishes and returns the code
   SOLUTION_FOUND.

   @param[in] intercomm, the communicator
 */
void ConcurrentManager::waitForSolution(MPI_Comm &intercomm)
{
  MPI_Status arrayStatus[sizeComm], status;
  MPI_Request request[sizeComm];
  int val[sizeComm], flag;

  for(int i = 0 ; i<sizeComm ; i++) MPI_Irecv(&val[i], 1, MPI_INT, i, TAG_MSG, intercomm, &request[i]);

  bool solutionFound = false;
  while(!solutionFound)
    {
      for(int i = 0 ; i<sizeComm ; i++)
        {
          MPI_Test(&request[i], &flag, &arrayStatus[i]);
          if(flag) 
            {
              printf("--------------------->    %d reveived %d\n", i , val[i]);
              if(val[i] == SOLUTION_FOUND)
                {
                  int msg = PRINT_SOLUTION;
                  MPI_Send(&msg, 1, MPI_INT, i, TAG_MSG, intercomm); // ask to print the solution

                  int msgJobFinished;
                  MPI_Recv(&msgJobFinished, 1, MPI_INT, i, TAG_MSG, intercomm, &status);  // wait the answer
                  assert(msgJobFinished == JOB_FINISHED);

                  cout << "I am going to kill everybody" << endl;
                  
                  int msgKill[sizeComm];
                  for(int j = 0 ; j<sizeComm ; j++)
                    {
                      msgKill[i] = STOP_AT_ONCE;
                      MPI_Send(&msgKill[i], 1, MPI_INT, j, TAG_KILL, intercomm);
                    }

                  solutionFound = true;
                  break;
                } else
                {
                  printf("restart the communication for %d\n", i);
                  MPI_Irecv(&val[i], 1, MPI_INT, i, TAG_MSG, intercomm, &request[i]);
                }
            }
        }      
    }
}// waitForSolution


/**
   Run the solver.
 */
void ConcurrentManager::run()
{
  MPI_Comm intercomm;
  int errcodes[solvers.size()];

  MPI_Comm_spawn_multiple(nbSolvers, cmds, arrayOfArgs, np, infos, 0, MPI_COMM_WORLD, &intercomm, errcodes);
  
  MPI_Comm_remote_size(intercomm, &sizeComm);
  cout << "c Solvers are now running: " << sizeComm << endl;

  int msg = CONCU_MODE;
  for(int i = 0 ; i<sizeComm ; i++) MPI_Send(&msg, 1, MPI_INT, i, TAG_MSG, intercomm); // init the working mode
  
  waitForSolution(intercomm);
}// run

I know that I have posted a lot of code :(

But I do not know where the problem is.

Please help me :)

Best regards.

JML
  • ' Signal: Segmentation fault (11)' - there should be a core file created. Start by looking into it. – SergeyA Mar 08 '16 at 21:05

1 Answer


The MPI documentation on how MPI interacts with threads demands that the call to MPI_Finalize() be performed by the main thread, that is, the same thread that initialized MPI. In your case, that also happens to be your process's initial thread.

In order to satisfy MPI's requirements, you could reorganize your application so that the initial thread is the one that waits for a kill signal and then shuts down MPI. The other work it currently does would then need to be moved to a different thread.
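A minimal sketch of what that reorganization could look like, assuming the same tags and parent intercommunicator as in your question (doSolverWork is a hypothetical function into which the solver exchange currently done in your main would be moved):

#include "mpi.h"
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define send_data_tag 1664
#define send_kill_tag 666

/* hypothetical: holds the MPI_Recv/MPI_Send exchange currently in main */
static void *doSolverWork(void *arg)
{
  MPI_Comm parentcomm = * ((MPI_Comm *) arg);
  /* ... receive the mode, exchange results, print the solution ... */
  (void) parentcomm;
  return NULL;
}

int main(int argc, char *argv[])
{
  int provided;
  MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
  if (provided != MPI_THREAD_MULTIPLE) MPI_Abort(MPI_COMM_WORLD, 1);

  MPI_Comm parentcomm;
  MPI_Comm_get_parent(&parentcomm);

  /* the secondary thread now does the work main() used to do */
  pthread_t worker;
  if (pthread_create(&worker, NULL, doSolverWork, (void *) &parentcomm))
    {
      fprintf(stderr, "Error creating thread\n");
      return 1;
    }

  /* the initial (main) thread waits for the kill message ... */
  int res;
  MPI_Status status;
  MPI_Recv(&res, 1, MPI_INT, 0, send_kill_tag, parentcomm, &status);

  /* ... and is therefore also the thread that calls MPI_Finalize,
     then terminates the whole process (as your thread did). */
  MPI_Finalize();
  exit(0);
}

Note that this only changes which thread performs the shutdown; whether exiting while the worker thread may still be blocked in an MPI call is acceptable depends on how the rest of your application is organized.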

John Bollinger