3

I have two daemons, and A is speaking to B. B is listening on a port, and A opens a TCP connection to that port. A is able to open a socket to B, but when it attempts to actually write to said socket, I get a SIGPIPE, so I'm trying to figure out where B could be closing the open socket.

However, if I attach to both daemons in gdb, the SIGPIPE happens before any of the code for handling data is called. This kind of makes sense, because the initial write is never successful, and the listeners are triggered from receiving data. My question is - what could cause daemon B to close the socket before any data is sent? The socket is closed less than a microsecond after opening it, so I'm thinking it can't be a timeout or anything of the sort. I would love a laundry list of possibilities to track down, as I've been chewing on this one for a few days and I'm pretty much out of ideas.

As requested, here is the code that accepts and handles communication:

{
/*
 * Body of the routine that logs under the id "wait_request" (its signature
 * is outside this excerpt; it reads `waittime` and `SState`).
 *
 * Steps:
 *   1. Copy GlobalSocketReadSet into a private fd_set while holding
 *      global_sock_read_mutex, then select() on the copy for up to
 *      `waittime` seconds.
 *   2. For each readable descriptor: if the connection is not Idle, call
 *      its cn_func handler; otherwise drop it from the global read set and
 *      close it.
 *   3. Close FromClientDIS connections that have been idle longer than
 *      PBS_NET_MAXCONNECTIDLE, unless PBS_NET_CONN_NOTIMEOUT is set.
 *
 * Returns 0 on success (including EINTR and a state change seen through
 * SState), -1 if the select set cannot be allocated or select() fails with
 * an error other than EINTR.
 */
extern char *PAddrToString(pbs_net_t *);

int i;
int n;

time_t now;

fd_set *SelectSet = NULL;
int SelectSetSize = 0;

int MaxNumDescriptors = 0;

char id[] = "wait_request";
char tmpLine[1024];

struct timeval timeout;

long OrigState = 0;

if (SState != NULL)
  OrigState = *SState;

timeout.tv_usec = 0;

timeout.tv_sec  = waittime;

SelectSetSize = sizeof(char) * get_fdset_size();
SelectSet = (fd_set *)calloc(1,SelectSetSize);

if (SelectSet == NULL)
  {
  /* without a private copy of the read set there is nothing to select on */
  log_err(ENOMEM, id, "Unable to allocate select set to read requests");

  return(-1);
  }

pthread_mutex_lock(global_sock_read_mutex);

memcpy(SelectSet,GlobalSocketReadSet,SelectSetSize);

/* selset = readset;*/  /* readset is global */
MaxNumDescriptors = get_max_num_descriptors();

pthread_mutex_unlock(global_sock_read_mutex);
n = select(MaxNumDescriptors, SelectSet, (fd_set *)0, (fd_set *)0, &timeout);

if (n == -1)
  {
  if (errno == EINTR)
    {
    n = 0; /* interrupted, cycle around */
    }
  else
    {
    /* save errno now: fstat() and free() below may overwrite it */
    int select_errno = errno;

    struct stat fbuf;

    /* check all file descriptors to verify they are valid */

    /* NOTE: selset may be modified by failed select() */

    for (i = 0; i < MaxNumDescriptors; i++)
      {
      if (FD_ISSET(i, GlobalSocketReadSet) == 0)
        continue;

      if (fstat(i, &fbuf) == 0)
        continue;

      /* clean up SdList and bad sd... */

      pthread_mutex_lock(global_sock_read_mutex);
      FD_CLR(i, GlobalSocketReadSet);
      pthread_mutex_unlock(global_sock_read_mutex);
      } /* END for each socket in global read set */

    free(SelectSet);

    log_err(select_errno, id, "Unable to select sockets to read requests");


    return(-1);
    }  /* END else (errno == EINTR) */
  }    /* END if (n == -1) */

/* n counts the readable descriptors still to be handled; stop once it hits 0 */
for (i = 0; (i < max_connection) && (n != 0); i++)
  {
  pthread_mutex_lock(svr_conn[i].cn_mutex);

  if (FD_ISSET(i, SelectSet))
    {
    /* this socket has data */
    n--;

    svr_conn[i].cn_lasttime = time(NULL);

    if (svr_conn[i].cn_active != Idle)
      {
      void *(*func)(void *) = svr_conn[i].cn_func;

      netcounter_incr();

      /* release the connection mutex before calling the handler */
      pthread_mutex_unlock(svr_conn[i].cn_mutex);

      /* the handler is passed a pointer to the loop index itself */
      func((void *)&i);

      /* NOTE:  breakout if state changed (probably received shutdown request) */

      if ((SState != NULL) &&
          (OrigState != *SState))
        break;
      }
    else
      {
      /* readable but marked Idle: stop watching it and close it */

      pthread_mutex_lock(global_sock_read_mutex);
      FD_CLR(i, GlobalSocketReadSet);
      pthread_mutex_unlock(global_sock_read_mutex);

      close_conn(i, TRUE);

      pthread_mutex_unlock(svr_conn[i].cn_mutex);
      pthread_mutex_lock(num_connections_mutex);

      snprintf(tmpLine, sizeof(tmpLine), "closed connections to fd %d - num_connections=%d (select bad socket)",
        i,
        num_connections);

      pthread_mutex_unlock(num_connections_mutex);
      log_err(-1, id, tmpLine);
      }
    }
  else
    pthread_mutex_unlock(svr_conn[i].cn_mutex);
  } /* END for i */

/* the private select set is not used past this point; release it so that
 * neither of the return paths below leaks it */
free(SelectSet);
SelectSet = NULL;

/* NOTE:  break out if shutdown request received */

if ((SState != NULL) && (OrigState != *SState))
  return(0);

/* have any connections timed out ?? */
now = time((time_t *)0);

for (i = 0;i < max_connection;i++)
  {
  struct connection *cp;

  pthread_mutex_lock(svr_conn[i].cn_mutex);

  cp = &svr_conn[i];

  if (cp->cn_active != FromClientDIS)
    {
    pthread_mutex_unlock(svr_conn[i].cn_mutex);

    continue;
    }

  if ((now - cp->cn_lasttime) <= PBS_NET_MAXCONNECTIDLE)
    {
    pthread_mutex_unlock(svr_conn[i].cn_mutex);

    continue;
    }

  if (cp->cn_authen & PBS_NET_CONN_NOTIMEOUT)
    {
    pthread_mutex_unlock(svr_conn[i].cn_mutex);

    continue; /* do not time-out this connection */
    }

  /* NOTE:  add info about node associated with connection - NYI */

  snprintf(tmpLine, sizeof(tmpLine), "connection %d to host %s has timed out after %d seconds - closing stale connection\n",
    i,
    PAddrToString(&cp->cn_addr),
    PBS_NET_MAXCONNECTIDLE);

  log_err(-1, "wait_request", tmpLine);

  /* locate node associated with interface, mark node as down until node responds */
  /* NYI */
  close_conn(i, TRUE);

  pthread_mutex_unlock(svr_conn[i].cn_mutex);
  }  /* END for (i) */

return(0);
}

NOTE: I didn't write this code.

dbeer
  • 6,963
  • 3
  • 31
  • 47
  • 1
    It might be interesting to see what "events" `poll()` says are on the socket in the `A` program just before and after the write attempt. – aschepler Nov 07 '11 at 18:39
  • Thanks for the idea, I will track it down if I don't find the problem looking at the first answer. – dbeer Nov 07 '11 at 18:45

1 Answer

2

Is it possible you messed up and somewhere else in the program you try to close the same handle twice?

That could do this to you very easily.

HINT: systrace can determine if this is happening.

Joshua
  • 40,822
  • 8
  • 72
  • 132
  • This is quite possibly my problem. I will investigate. – dbeer Nov 07 '11 at 18:44
  • Could this happen if I do a shutdown(sock, 2); and then a close(sock); ? – dbeer Nov 07 '11 at 18:45
  • No, and even if it were that's not very likely to be the problem if close() immediately follows shutdown(). – Joshua Nov 07 '11 at 18:51
  • @dbeer Well, if the value of `sock` is somehow not the socket you want to close, then sure. But better show us the code of B that accepts and handles a client. – nos Nov 07 '11 at 18:59
  • @Joshua - in a debugger I'm certain that no socket is closed in the time since I open the socket and attempt to write it. Could this still be a cause of the problem or should I move on and look elsewhere? – dbeer Nov 07 '11 at 22:35
  • 1
    If you verified nobody called close() on any file (not just socket) then move on. – Joshua Nov 08 '11 at 03:01
  • Accepting because we cleaned up the close() calls some more and it appears to have gone away – dbeer Dec 07 '11 at 22:30