1

Parts of my code crash randomly when calling push_back or erase on a std::list. The code runs in a long-running service. I removed the unimportant parts and kept only the most relevant ones.


// Reports appended by HandleCCPRxReport (push_back) and pruned by HandleCCPTxReport (erase).
// NOTE(review): std::list has no internal synchronization; confirm every access really happens on a single thread.
std::list<RxCCPReport> pendingSync;

size_t AnchorConnection::HandleCCPTxReport(const char *rxBytes, size_t length) {
    if (!pendingSync.empty())
    {
        std::list<RxCCPReport>::iterator iter = pendingSync.begin();
        while (iter != pendingSync.end())
        {
            if (condition1)
            {
                iter = pendingSync.erase(iter);
            }
            else
            {
                if (condition2)
                {
                    ++iter;
                }
                else if (condition3)
                {
                    ++iter;
                }
                else
                {
                    iter = pendingSync.erase(iter);
                }
            }
        }
    }

    return length; }

size_t AnchorConnection::HandleCCPRxReport(const char *rxBytes, size_t length) {
    RxCCPReport rxCCP;
    rxCCP.anc = anc;
    rxCCP.rxTime = ccpRxTime;
    rxCCP.seqNum = pCsMsg->seqNum;
    pendingSync.push_back(rxCCP);
    return length; 
}

The two functions run on the same thread; the code sometimes crashes at pendingSync.erase(iter) or pendingSync.push_back(rxCCP), about once a week.

the backtrace:

  1. crashed at erase
#0  std::__detail::_List_node_base::_M_unhook (this=0x7fffcc024040) at ../../../../../libstdc++-v3/src/c++98/list.cc:142
        __next_node = 0x0
        __prev_node = 0x7fffcc024040
#1  0x00000000004db14a in std::list<RxCCPReport, std::allocator<RxCCPReport> >::_M_erase (this=0x7fffdc011970, __position=...) at /usr/include/c++/4.8.2/bits/stl_list.h:1570
        __n = 0x7fffe5ffad80
#2  0x00000000004daddb in std::list<RxCCPReport, std::allocator<RxCCPReport> >::erase (this=0x7fffdc011970, __position=...) at /usr/include/c++/4.8.2/bits/list.tcc:112
        __ret = <error reading variable __ret (Cannot access memory at address 0x10)>
#3  0x00000000004e3d8b in AnchorConnection::HandleCCPTxReport (this=0x7fffdc009240, rxBytes=0x7fffc4022abb "0rl\206%M/\004\205O\263\002\216\356\003\002f", length=12)
    at LE/Network/rtls_anchor_conn.cpp:731
        rxCCP = {rxTime = 3.1943677384753104, seqNum = 114 'r', anc = }
        cSeqRxCCP = 114 'r'
        cs = 0x7fffdc016430
        ccpTx = 203157767788
        ccpTxTime = 3.1794272052158457
        pCsMsg = 0x7fffc4022abb
        cSeqTxCCP = 114 'r'
  2. crashed at push_back on another occasion
> #0  0x00007ffff6dff387 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:55
        resultvar = 0
        pid = 5068
        selftid = 5418
#1  0x00007ffff6e00a78 in __GI_abort () at abort.c:90
        save_stage = 2
        act = {__sigaction_handler = {sa_handler = 0x7ffff6ddb76d, sa_sigaction = 0x7ffff6ddb76d}, sa_mask = {__val = {5, 140737336646961, 3, 140737043728478, 2, 
              140737336643303, 1, 140737336652017, 3, 140737043728452, 12, 140737336652021, 2, 140737043729264, 140737043729264, 140737043731024}}, sa_flags = 16, 
          sa_restorer = 0x7fffe57f8d70}
        sigs = {__val = {32, 0 <repeats 15 times>}}
#2  0x00007ffff6e41ed7 in __libc_message (do_abort=do_abort@entry=2, fmt=fmt@entry=0x7ffff6f54350 "*** Error in `%s': %s: 0x%s ***\n")
    at ../sysdeps/unix/sysv/linux/libc_fatal.c:196
        ap = {{gp_offset = 40, fp_offset = 48, overflow_arg_area = 0x7fffe57f9260, reg_save_area = 0x7fffe57f9170}}
        ap_copy = {{gp_offset = 16, fp_offset = 48, overflow_arg_area = 0x7fffe57f9260, reg_save_area = 0x7fffe57f9170}}
        fd = 26
        on_2 = <optimized out>
        list = <optimized out>
        nlist = <optimized out>
        cp = <optimized out>
        written = <optimized out>
#3  0x00007ffff6e483e4 in malloc_printerr (action=<optimized out>, str=0x7ffff6f54548 "malloc(): memory corruption (fast)", ptr=<optimized out>, ar_ptr=<optimized out>)
    at malloc.c:4967
        buf = "00007fffd8001570"
        cp = <optimized out>
        ar_ptr = <optimized out>
        ptr = <optimized out>
        str = 0x7ffff6f54548 "malloc(): memory corruption (fast)"
        action = <optimized out>
#4  0x00007ffff6e4bb20 in _int_malloc (av=av@entry=0x7fffd0000020, bytes=bytes@entry=40) at malloc.c:3383
        p = 0x7fffd8001560
        fb = <optimized out>
        pp = <optimized out>
        nb = 48
        idx = <optimized out>
        bin = <optimized out>
        victim = 0x7fffd8001560
        size = <optimized out>
        victim_index = <optimized out>
        remainder = <optimized out>
        remainder_size = <optimized out>
        block = <optimized out>
        bit = <optimized out>
        map = <optimized out>
        fwd = <optimized out>
        bck = <optimized out>
        errstr = <optimized out>
#5  0x00007ffff6e4e6fc in __GI___libc_malloc (bytes=40) at malloc.c:2905
        ar_ptr = 0x7fffd0000020
        victim = <optimized out>
        hook = <optimized out>
#6  0x00007ffff7929ecd in operator new (sz=40) at ../../../../libstdc++-v3/libsupc++/new_op.cc:51
        p = <optimized out>
#7  0x00000000004dc6f8 in __gnu_cxx::new_allocator<std::_List_node<RxCCPReport> >::allocate (this=0x7fffdc011930, __n=1) at /usr/include/c++/4.8.2/ext/new_allocator.h:104
No locals.
#8  0x00000000004dc006 in std::_List_base<RxCCPReport, std::allocator<RxCCPReport> >::_M_get_node (this=0x7fffdc011930) at /usr/include/c++/4.8.2/bits/stl_list.h:334
No locals.
#9  0x00000000004db699 in std::list<RxCCPReport, std::allocator<RxCCPReport> >::_M_create_node<RxCCPReport const&> (this=0x7fffdc011930)
    at /usr/include/c++/4.8.2/bits/stl_list.h:502
        __p = 0x0
#10 0x00000000004db1c2 in std::list<RxCCPReport, std::allocator<RxCCPReport> >::_M_insert<RxCCPReport const&> (this=0x7fffdc011930, __position=...)
    at /usr/include/c++/4.8.2/bits/stl_list.h:1561
        __tmp = 0x0
#11 0x00000000004dae68 in std::list<RxCCPReport, std::allocator<RxCCPReport> >::push_back (this=0x7fffdc011930, __x=...) at /usr/include/c++/4.8.2/bits/stl_list.h:1016
No locals.
#12 0x00000000004e475b in AnchorConnection::HandleCCPRxReport (this=0x7fffdc01be90, rxBytes=0x7fffd004d7fb "1{\212\211D\021\006\061+\b\250\256i\227@ԺF(ú\002\005\003o\200@", 
    length=88) at LE/Network/rtls_anchor_conn.cpp:936
        rxCCP = {rxTime = 4.3416058279497198, seqNum = 123 '{', anc = 0x7fffdc016320}
        prevSeqNum = 122 'z'
        master = 0x7fffdc0101a0
        cs = 0x7fffdc016450
        masterID = 588618078431250826
        logNum = 45794088
        pCsMsg = 0x7fffe57f9500
        ccpRx = 277418192552
        ccpRxTime = 4.3416058279497198
        txTime = "0000000000\000\334\377\177\000\000\177\236", <incomplete sequence \345>
        temp = "\000\000\000\000"
        diagMsg2 = "0F000005036F8040001000000412C8A607120000086000BC07390AD806130000040000FC0114000005F9FF0700731500000EA8AE699740D4BA740800EC69974025000010A604C3FB4C07A3F7520743FE260538072E100002EF022700280491FE1F6D2E100"...
        csmsg = {type = 49 '1', seqNum = 123 '{', masterID = "\212\211D\021\006\061+\b", csRxTime = "\250\256i\227@", {firstPath16 = 47828, firstPath = "Ժ"}, 
          extLen = 70 'F', {
            ext = "(ú\002\005\003o\200@\000\022Ȧ\a`\000\274\a9\n\330\006\000\000\374\001\371\377\a\000s\250\256i\227@Ժt\b\000\354i\227@\000\246\004\303\373L\a\243\367R\aC\376&\005\070\a\357\002\221\376\037m?\021", {{logNum32 = 45794088, logNum = "(ú\002"}, 
              diagnostics = "\005\003o\200@\000\022Ȧ\a`\000\274\a9\n\330\006\000\000\374\001\371\377\a\000s\250\256i\227@Ժt\b\000\354i\227@\000\246\004\303\373L\a\243\367R\aC\376&\005\070\a\357\002\221\376\037m?\021"}}}
        FP = 747
        rxTime = "409769AEA8\000\345\377\177\000\000Н", <incomplete sequence \345>

This is driving me nearly mad; any suggestions are appreciated.

The code that receives data over the socket is as follows:

// Framing overhead in bytes around DATA: STX (1) + LEN (2) + CRC (2) + ETX (1).
#define FRAME_HEADER_LEN (6)
// Offset of the first DATA byte (the function code) in a frame: right after STX and the two LEN bytes.
#define FRAME_DATA_IDX   (3)

/**
 * Receive loop.
 *
 * Repeatedly reads from the socket, accumulates the bytes in a local buffer,
 * extracts complete frames and passes each frame's DATA section to the
 * handler registered for its function code in `handlers`. The loop ends when
 * shutdown_select_anc() returns -1 or ReceiveData() returns <= 0; the socket
 * is closed before returning.
 *
 * The data coming from the anchor is framed:
 *   <STX><LENlsb><LENmsb><DATA:<FC><XX>.....><CRClsb><CRCmsb><ETX>
 *   STX = 0x2
 *   LEN is the length of the data message (16 bits)
 *   CRC is the 16-bit CRC of the data bytes
 *   ETX = 0x3
 *   FC  is the function code (API code)
 *
 * Buffer handling:
 *  - bytes in front of a valid STX...ETX pair are dropped one at a time;
 *  - a frame with a bad CRC is logged but still dispatched;
 *  - a frame with an unknown function code, or a handler returning 0,
 *    discards the whole buffer;
 *  - a frame shorter than the minimum length registered for its handler is
 *    dropped on its own, so it cannot block the frames behind it;
 *  - the number of bytes removed after a handler call is capped at what is
 *    actually buffered, so `length` always equals rxBytes.size().
 */
virtual void Run()
{
    unsigned int length = 0; // bytes currently buffered; kept equal to rxBytes.size()
    std::string rxBytes;
    unsigned int flen = 0;   // DATA length announced by the frame at the head of rxBytes

    while (shutdown_select_anc(socket) != -1)
    {
        int count = socket->ReceiveData();

        if (count <= 0) // Something failed. Socket is probably closed.
        {
            break;
        }

        length += count;
        rxBytes.append(socket->buf_, count);

        if (length <= FRAME_HEADER_LEN) //minimum length is 7 bytes (header + 1 byte function code)
        {
            continue;
        }

        bool msgProcess; // true while the buffer may still hold another complete frame
        do {
            msgProcess = false;
            crc_err_t pckt_crc = CRC_ERROR;

            //search through the received data to find the start of frame
            while (length > FRAME_HEADER_LEN)
            {
                if (rxBytes[0] == 0x2) //this is the start - check if length and end match
                {
                    const uint8_t lo = rxBytes[1];
                    const uint8_t hi = rxBytes[2];

                    flen = lo + ((uint16_t)hi << 8);

                    if ((flen + FRAME_HEADER_LEN) > length) //we don't have all the frame yet
                    {
                        break;
                    }

                    if (rxBytes[flen + FRAME_HEADER_LEN - 1] == 0x3) //end of frame present
                    {
                        //received whole frame: the checksum covers DATA plus the two CRC bytes
                        pckt_crc = check_crc16(reinterpret_cast<uint8_t *>(&rxBytes[FRAME_DATA_IDX]), (flen + 2));
                        break;
                    }

                    //no end of frame in the expected place - start byte was not real start
                    rxBytes.erase(0, 1);
                    length -= 1;
                }
                else //remove the byte and check if next is a start of frame
                {
                    rxBytes.erase(0, 1);
                    length -= 1;
                }
            }

            //check if we have a frame
            if (((flen + FRAME_HEADER_LEN) > length) || (length <= FRAME_HEADER_LEN))
            {
                break;
            }

            if (pckt_crc != CRC_OKAY)
            {
                //frame packet fully received but it's CRC bad
                DBG_PRINTF(DBG_DEFAULT, "ACHTUNG ACHTUNG : incompatible software without CRC or network CRC error\n");
                //let incompatible software proceed at the moment
            }

            const uint8_t fcode = rxBytes[FRAME_DATA_IDX]; //get function code

            typename std::map<uint8_t, std::pair<size_t, handler> >::iterator it = handlers.find(fcode);
            if (it == handlers.end()) //no handler for this function code
            {
                rxBytes.clear();
                length = 0;
            }
            else if (flen >= it->second.first)
            {
                handler f = it->second.second;
                size_t n = (static_cast<Child*>(this)->*f)(rxBytes.c_str() + FRAME_DATA_IDX, flen);
                if (n > 0)
                {
                    msgProcess = true;

                    // Never remove more than is buffered: std::string::erase clamps its
                    // count, so an unclamped subtraction would wrap `length` around and
                    // later indexing would run past the end of rxBytes.
                    // (length > FRAME_HEADER_LEN is guaranteed by the frame check above.)
                    const unsigned int consumed = (n > length - FRAME_HEADER_LEN)
                        ? length
                        : static_cast<unsigned int>(n + FRAME_HEADER_LEN);
                    rxBytes.erase(0, consumed);
                    length -= consumed;
                }
                else
                {
                    //Unexpected Message
                    rxBytes.clear();
                    length = 0;
                }
            }
            else
            {
                // Frame is shorter than the minimum registered for this handler.
                // Drop just this frame; leaving it at the head of the buffer would make
                // every later pass find and reject it again, stalling the stream while
                // rxBytes keeps growing.
                rxBytes.erase(0, flen + FRAME_HEADER_LEN);
                length -= flen + FRAME_HEADER_LEN;
                msgProcess = true;
            }
        } while((length > 0) && msgProcess);
    }

    socket->Close();
}
zhuo.quan
  • 63
  • 7
  • 1
    I'd recommend compiling with `-fsanitize=address`. Ideally, the moment you have a bad memory access, it will stop and tell you. – chris Oct 13 '20 at 08:40
  • 2
    You can turn on a few more `-fsanitize=undefined -fsanitize=address -fsanitize=leak -fstack-check`. Note: `-lasan -lubsan` must be first in the list of libraries that you link with. – Ted Lyngmo Oct 13 '20 at 08:47
  • In `HandleCCPTxReport(const char *rxBytes, size_t length)` you don't use `rxBytes` at all. That looks suspicious. Also, try to avoid sending data and the size of the data as separate entities. A `const std::vector&` or `const std::string&` would probably be better. – Ted Lyngmo Oct 13 '20 at 08:51
  • Btw, `"0rl\206%M/\004\205O\263\002\216\356\003\002f"` looks like 17 `char`s, not 12 as `length` indicates in your crash log. – Ted Lyngmo Oct 13 '20 at 08:58
  • Is the data you handle received from sockets or pipes? If so, mind showing us the `read`ing part? – Ted Lyngmo Oct 13 '20 at 09:52
  • @TedLyngmo this is binary data, not a null-terminated string; the crash reporter doesn't know that, so it prints more data than is needed. – Marek R Oct 13 '20 at 09:52
  • @MarekR Could be. In debug mode I think it has a pretty good understanding of where a buffer ends though - but I'm not sure. I'd like to see how the reading is done. – Ted Lyngmo Oct 13 '20 at 09:54
  • @TedLyngmo I have posted the code that receive data over socket. Could you please help to check, Thank you. – zhuo.quan Oct 14 '20 at 03:39
  • @zhuo.quan Good. It looks fairly complicated. Why are you doing `(rxBytes.c_str())[0]` instead of `rxBytes[0]` ? Why are you using checksums? Aren't you using TCP? If you get out of sync (for some reason) it seems like you'll be missing frames. Why not remove the frame information and send a `std::string` with the data to the handler? Some of the buffer handling seems spread out (in the socket class and in this class so it's hard to get an overview. I'd try to make the responsibilities for each class clearer. Instead of magic numbers, use proper constants (not `define`s). – Ted Lyngmo Oct 14 '20 at 06:00

1 Answers1

1

In the push_back case you have

str = 0x7ffff6f54548 "malloc(): memory corruption (fast)"

This suggests that you have memory corruption elsewhere, most likely shortly before calling HandleCCPRxReport.

Surt
  • 15,501
  • 3
  • 23
  • 39
  • ... and `length=88` which doesn't seem to match the data `"1{\212\211D\021\006\061+\b\250\256i\227@ԺF(ú\002\005\003o\200@"`. – Ted Lyngmo Oct 13 '20 at 09:24
  • @TedLyngmo it's at the allocation of the linked list node itself that it fails, if I see it correctly. Aren't there 88 chars there (assuming those \006 are 4 chars)? And the rxBytes are not used. – Surt Oct 13 '20 at 09:52
  • @TedLyngmo again this is binary data not null terminated string and tool assumes it is null terminated string. – Marek R Oct 13 '20 at 09:54
  • @Surt The `\006` values are usually octal values, so I'd guess that `\006` is one byte. – Ted Lyngmo Oct 13 '20 at 09:58
  • @MarekR I think that depends on the tool and it's displaying the buffer as binary data (including `\000`) so to me it doesn't look like it's printing up to a null terminator but rather according to a length. – Ted Lyngmo Oct 13 '20 at 10:06