4

I'm porting an application built on top of the ACE Proactor framework. The application runs perfectly for both VxWorks and Windows, but fails to do so on Linux (CentOS 5.5, WindRiver Linux 1.4 & 3.0) with kernel 2.6.X.X - using librt.

I've narrowed the problem down to a very basic issue: The application begins an asynchronous (via aio_read) read operation on a socket and subsequently begins an asynchronous (via aio_write) write on the very same socket. The read operation cannot be fulfilled yet, since the protocol is initiated from the application's end. - When the socket is in blocking mode, the write is never reached and the protocol "hangs". - When using an O_NONBLOCK socket, the write succeeds but the read keeps completing with an "EWOULDBLOCK/EAGAIN" error indefinitely, never to recover (even if the AIO operation is restarted).

I went through multiple forums and could not find a definitive answer as to whether this should work (and I'm doing something wrong) or is impossible with Linux AIO. Is it possible if I drop the AIO and seek a different implementation (via epoll/poll/select etc.)?

Attached is sample code to quickly reproduce the problem on a non-blocking socket:

#include <aio.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>

#define BUFSIZE (100)

// Global variables
struct aiocb *cblist[2];
int theSocket;

// Resets an AIO control block and prepares it for a transfer on theSocket.
//
// pAiocb  - control block to (re)initialise; every field is zeroed first, so
//           any state left over from a previous request is discarded.
// pBuffer - buffer used for the transfer; BUFSIZE bytes of it are requested.
void InitializeAiocbData(struct aiocb* pAiocb, char* pBuffer)
{
    // memset() instead of the legacy bzero(): it is declared by <string.h>,
    // which this file already includes, and is standard C.
    memset(pAiocb, 0, sizeof(*pAiocb));

    pAiocb->aio_fildes = theSocket;
    pAiocb->aio_buf = pBuffer;
    pAiocb->aio_nbytes = BUFSIZE;
    pAiocb->aio_offset = 0;
}

// Queues an asynchronous read described by pAiocb, using pBuffer as the
// destination. The control block is re-initialised on every call, so it can
// be reused for a retry.
void IssueReadOperation(struct aiocb* pAiocb, char* pBuffer)
{
    int ret;

    // Prepare the control block before handing it to the AIO layer
    InitializeAiocbData(pAiocb, pBuffer);

    // A negative result means the request could not be queued at all
    ret = aio_read(pAiocb);
    assert(ret >= 0);
}

// Queues an asynchronous write described by pAiocb, using pBuffer as the
// data source. The control block is re-initialised on every call.
void IssueWriteOperation(struct aiocb* pAiocb, char* pBuffer)
{
    int ret;

    // Prepare the control block before handing it to the AIO layer
    InitializeAiocbData(pAiocb, pBuffer);

    // A negative result means the request could not be queued at all
    ret = aio_write(pAiocb);
    assert(ret >= 0);
}

// Reproduces the reported behaviour: connects to a server, switches the
// socket to non-blocking mode, queues an AIO read followed by an AIO write on
// the same socket, and then keeps re-issuing the read for as long as it
// completes with EWOULDBLOCK.
//
// The asserts on aio_error() results record the behaviour observed by the
// author; they are expectations of this reproduction, not API guarantees.
int main()
{
    int ret;
    int nPort = 11111;
    const char* szServer = "10.10.9.123";

    // Connect to the remote server
    theSocket = socket(AF_INET, SOCK_STREAM, 0);
    assert (theSocket >= 0);

    // Name resolution can fail; check before dereferencing the result
    struct hostent *pServer = gethostbyname(szServer);
    assert (pServer != NULL);
    // Only an IPv4 address fits into sockaddr_in's 4-byte s_addr
    assert (pServer->h_addrtype == AF_INET);

    struct sockaddr_in serv_addr;
    memset(&serv_addr, 0, sizeof(serv_addr));
    serv_addr.sin_family = AF_INET;
    serv_addr.sin_port = htons(nPort);
    memcpy(&serv_addr.sin_addr.s_addr, pServer->h_addr, pServer->h_length);

    // connect() is called outside assert() so that it still runs when the
    // program is built with NDEBUG
    ret = connect(theSocket, (const struct sockaddr*)(&serv_addr), sizeof(serv_addr));
    assert (ret >= 0);

    // Set the socket to be non-blocking
    int oldFlags = fcntl(theSocket, F_GETFL);
    assert (oldFlags >= 0);
    int newFlags = oldFlags | O_NONBLOCK;

    ret = fcntl(theSocket, F_SETFL, newFlags);
    assert (ret >= 0);
    printf("Socket flags: before=%o, after=%o\n", (unsigned)oldFlags, (unsigned)newFlags);

    // Construct the AIO callbacks array
    struct aiocb my_aiocb1, my_aiocb2;

    // Zero-initialised so the write below does not send indeterminate bytes.
    // The buffer is deliberately shared by both requests (as in the original
    // reproduction) and is left allocated until the process exits.
    char* pBuffer = new char[BUFSIZE+1]();

    cblist[0] = &my_aiocb1;
    cblist[1] = &my_aiocb2;

    // Start the read and write operations on the same socket
    IssueReadOperation(&my_aiocb1, pBuffer);
    IssueWriteOperation(&my_aiocb2, pBuffer);

    // Wait for I/O completion on both operations
    int nRound = 1;
    printf("\naio_suspend round #%d:\n", nRound++);
    ret = aio_suspend( cblist, 2, NULL );
    assert (ret == 0);

    // Check the error status for the read and write operations
    ret = aio_error(&my_aiocb1);
    assert (ret == EWOULDBLOCK);

    // Get the return code for the read
    {
        ssize_t retcode = aio_return(&my_aiocb1);
        // ssize_t is not int: cast and print with a matching conversion
        printf("First read operation results: aio_error=%d, aio_return=%ld  -  That's the first EWOULDBLOCK\n", ret, (long)retcode);
    }

    ret = aio_error(&my_aiocb2);
    assert (ret == EINPROGRESS);
    printf("Write operation is still \"in progress\"\n");

    // Re-issue the read operation
    IssueReadOperation(&my_aiocb1, pBuffer);

    // Wait for I/O completion on both operations
    printf("\naio_suspend round #%d:\n", nRound++);
    ret = aio_suspend( cblist, 2, NULL );
    assert (ret == 0);

    // Check the error status for the read and write operations for the second time
    ret = aio_error(&my_aiocb1);
    assert (ret == EINPROGRESS);
    printf("Second read operation request is suddenly marked as \"in progress\"\n");

    ret = aio_error(&my_aiocb2);
    assert (ret == 0);

    // Get the return code for the write
    {
        ssize_t retcode = aio_return(&my_aiocb2);
        printf("Write operation has completed with results: aio_error=%d, aio_return=%ld\n", ret, (long)retcode);
    }

    // Now try waiting for the read operation to complete - it'll just busy-wait, receiving "EWOULDBLOCK" indefinitely
    do
    {
        printf("\naio_suspend round #%d:\n", nRound++);
        ret = aio_suspend( cblist, 1, NULL );
        assert (ret == 0);

        // Check the error of the read operation and re-issue if needed
        ret = aio_error(&my_aiocb1);
        if (ret == EWOULDBLOCK)
        {
            // Retire the finished request before its control block is reused
            (void)aio_return(&my_aiocb1);

            IssueReadOperation(&my_aiocb1, pBuffer);
            printf("EWOULDBLOCK again on the read operation!\n");
        }
    }
    while (ret == EWOULDBLOCK);

    return 0;
}

Thanks in advance, Yotam.

Yotam
  • 41
  • 1
  • 3

1 Answer

3

Firstly, O_NONBLOCK and AIO don't mix. AIO will report the asynchronous operation complete when the corresponding read or write wouldn't have blocked - and with O_NONBLOCK, they would never block, so the aio request will always complete immediately (with aio_error() giving EWOULDBLOCK).

Secondly, don't use the same buffer for two simultaneous outstanding aio requests. The buffer should be considered completely off-limits between the time when the aio request was issued and when aio_error() tells you that it has completed.

Thirdly, AIO requests to the same file descriptor are queued, in order to give sensible results. This means that your write won't happen until the read completes - if you need to write the data first, you need to issue the AIOs in the opposite order. The following will work fine, without setting O_NONBLOCK:

// Each request gets its own control block and its own buffer; a buffer must
// not be touched while its request is outstanding.
struct aiocb my_aiocb1, my_aiocb2;
char pBuffer1[BUFSIZE+1], pBuffer2[BUFSIZE+1] = "Some test message";

// Entries are set to NULL below once their request has finished, because
// aio_suspend() ignores NULL entries.
const struct aiocb *cblist[2] = { &my_aiocb1, &my_aiocb2 };

// Start the write and read operations on the same socket.
// The write is queued first so that it is not held up behind the read.
IssueWriteOperation(&my_aiocb2, pBuffer2);
IssueReadOperation(&my_aiocb1, pBuffer1);

// Wait for I/O completion on both operations
int nRound = 1;
int aio_status1, aio_status2;
do {
    printf("\naio_suspend round #%d:\n", nRound++);
    ret = aio_suspend( cblist, 2, NULL );
    assert (ret == 0);

    // Check the error status for the read and write operations
    aio_status1 = aio_error(&my_aiocb1);
    if (aio_status1 == EINPROGRESS) {
        puts("aio1 still in progress.");
    } else {
        puts("aio1 completed.");
        // Stop waiting on it; otherwise aio_suspend() would return at once
        // on every later round and the loop would spin.
        cblist[0] = NULL;
    }

    aio_status2 = aio_error(&my_aiocb2);

    if (aio_status2 == EINPROGRESS) {
        puts("aio2 still in progress.");
    } else {
        puts("aio2 completed.");
        cblist[1] = NULL;
    }
} while (aio_status1 == EINPROGRESS || aio_status2 == EINPROGRESS);

// Collect each request's result exactly once, from its own control block
ssize_t retcode;
retcode = aio_return(&my_aiocb1);
printf("First operation results: aio_error=%d, aio_return=%ld\n", aio_status1, (long)retcode);

retcode = aio_return(&my_aiocb2);
printf("Second operation results: aio_error=%d, aio_return=%ld\n", aio_status2, (long)retcode);

Alternatively, if you don't care about reads and writes being ordered with respect to each other, you can use dup() to create two file descriptors for the socket, and use one for reading and the other for writing - each will have its AIO operations queued separately.

caf
  • 233,326
  • 40
  • 323
  • 462
  • 1
    Do you have a reference for "AIO requests to the same fd are queued in order"? If that is the case, then surely a better solution would be to duplicate the handle. – Ben Voigt Jan 07 '11 at 01:38
  • 1
    @Ben Voigt: That's another alternative - use `dup()` to create two file descriptors for the socket, and use one for reading and the other for writing. – caf Jan 07 '11 at 05:52
  • First of all, I've just seen the comments and started experimenting with them - but it looks like the dup() thingy may do the trick. It seems to fix the problem but I still need to run some stress-testing. It is very likely that I owe you guys a beer! Second, I know that in the example below I used a non-blocking socket and the same buffer - it's just a dummy test program. Still, the read in the background, before the write is issued, is something that is mandated by our specific application - so there's no changing it... – Yotam Jan 09 '11 at 15:18
  • Ben Voigt - re-post your comment as an answer and I'll mark this complete or something... – Yotam Jan 09 '11 at 15:27