I have a long-running Linux daemon that creates a non-blocking socket bound to port 65445 and waits for UDP packets. It works most of the time.
Recently I ran into an issue where the process enters the "D" (uninterruptible sleep) state after a while. I'm not sure which message triggers it; the daemon definitely processes such messages correctly most of the time and only fails at some random point.
In this state the process doesn't respond to any signal, so I can't kill it. Here is the kernel stack dump:
Kernel Status:
[<ffffffff80385ff3>] number.isra.2+0x2d3/0x300
[<ffffffff802e0228>] address_space_init_once+0x88/0x120
[<ffffffff802e0200>] address_space_init_once+0x60/0x120
[<ffffffff802df880>] inode_wait+0x0/0x10
[<ffffffff802df889>] inode_wait+0x9/0x10
[<ffffffff802df880>] inode_wait+0x0/0x10
[<ffffffff80275dc0>] wake_bit_function+0x0/0x30
[<ffffffff802e0abd>] iget_locked+0x11d/0x180
[<ffffffff8030e1f0>] proc_get_inode+0x10/0xf0
[<ffffffff80313555>] proc_lookup_de+0x75/0xf0
[<ffffffff802d31bc>] d_alloc_and_lookup+0x3c/0x90
[<ffffffff802deeee>] d_lookup+0x2e/0x60
[<ffffffff802d3ea6>] do_lookup+0x296/0x3a0
[<ffffffff802ddcee>] dput+0x1e/0x190
[<ffffffff802d49eb>] link_path_walk+0x12b/0x850
[<ffffffff802dddb2>] dput+0xe2/0x190
[<ffffffff802d7437>] path_openat+0xb7/0x370
[<ffffffff802b4cfe>] tlb_finish_mmu+0xe/0x50
[<ffffffff802d7824>] do_filp_open+0x44/0xb0
[<ffffffff802e2905>] alloc_fd+0x45/0x130
[<ffffffff802c8a5c>] do_sys_open+0xec/0x1d0
[<ffffffff805a8afb>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
This indicates that something is wrong in procfs. After further investigation, I found that the net directory in this process's procfs entry was corrupted: I can't even run ls /proc/*pid*/net
because bash hangs there as well.
I narrowed it down to the process possibly hanging in 'recvfrom', which I can't understand since it's a non-blocking socket. Part of my code follows:
/*
 * Create the UDP socket, tie it to the DEV interface, size its send and
 * receive buffers, switch it to non-blocking mode, and bind it to PORT on
 * any local address. Once socket() has succeeded, every failure leaves
 * through the Exit label (presumably where fd is released -- confirm).
 */
fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
if (fd < 0) {
    return ret;
}

/*
 * NOTE(review): sizeof(DEV) is the interface-name length only if DEV is an
 * array or string literal, not a pointer -- confirm.
 */
if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, DEV, sizeof(DEV)) < 0) {
    goto Exit;
}

/* The same sock_buf value is requested for both directions. */
rt = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (char *)&sock_buf, sizeof(sock_buf));
if (rt < 0) {
    goto Exit;
}
rt = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (char *)&sock_buf, sizeof(sock_buf));
if (rt < 0) {
    goto Exit;
}

/*
 * Make the descriptor non-blocking. A failing fcntl() takes the Exit path
 * like every other step, so the descriptor is not left behind by a bare
 * return. If O_NONBLOCK is already set there is nothing to change, and
 * execution still continues to bind().
 */
val = fcntl(fd, F_GETFL, 0);
if (val < 0) {
    goto Exit;
}
if (!(val & O_NONBLOCK)) {
    val |= O_NONBLOCK;
    /* An unchecked failure here would silently leave the socket blocking. */
    if (fcntl(fd, F_SETFL, val) < 0) {
        goto Exit;
    }
}

memset(&addr, 0, sizeof(addr));
addr.sin_family = AF_INET;
/*
 * NOTE(review): sin_port must be in network byte order -- confirm PORT is
 * already htons()-converted.
 */
addr.sin_port = PORT;
addr.sin_addr.s_addr = INADDR_ANY;
if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
    goto Exit;
}
I add this socket to epoll:
struct epoll_event e;
e.events = EPOLLIN;
e.data.ptr = comm_handle;
rc = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &e);
and receive the packet when there is an event:
/* Read one datagram from e->fd into recv_buf; the sender address goes to from. */
socklen_t from_len = sizeof(struct sockaddr_in);
/* Zero the first 65000 bytes of recv_buf before every read. */
memset(recv_buf, 0, 65000);
len = recvfrom(e->fd, recv_buf, 65000, 0, (struct sockaddr *)&from, &from_len);
/*
 * Both a negative result and a zero-length result return -1.
 * NOTE(review): on a non-blocking socket an EAGAIN/EWOULDBLOCK result also
 * ends up here -- confirm callers do not treat -1 as fatal. A zero-length
 * UDP datagram is reported the same way.
 */
if (len <= 0) {
return -1;
}