1

I'm working on custom dahdi driver for hardware phone module. This module connected to pc via usb (userspace libusb D2XX ftdi driver). So driver was splitted into 2 parts: kernel mode dahdi driver and userspace service.

Userspace daemon detects usb cards and ask to kernel dahdi driver to add/remove dahdi spans using netlink. After adding new span it communicate device, send/receive audio and commands.

I successfully tested it via single phone card (hotplug device, dial, speak, hotremove without any bugs), started multiple card tests and got "BUG: scheduling while atomic" at netlink_broadcast() call:

[ 5322.363190]  [<ffffffffae709113>] __schedule_bug+0x64/0x72
[ 5322.363196]  [<ffffffffae713fdb>] __schedule+0x9fb/0xa20
[ 5322.363202]  [<ffffffffae0cc1d6>] __cond_resched+0x26/0x30
[ 5322.363205]  [<ffffffffae7142ca>] _cond_resched+0x3a/0x50
[ 5322.363209]  [<ffffffffae1f7a35>] kmem_cache_alloc+0x35/0x1f0
[ 5322.363216]  [<ffffffffae5d7ad9>] ? skb_clone+0x49/0xb0
[ 5322.363218]  [<ffffffffae5d7ad9>] skb_clone+0x49/0xb0
[ 5322.363225]  [<ffffffffae623ca1>] netlink_broadcast_filtered+0x331/0x3e0
[ 5322.363227]  [<ffffffffae623d6d>] netlink_broadcast+0x1d/0x20
[ 5322.363231]  [<ffffffffc0c113ca>] nl_send_cmd+0x15a/0x260 [mydev]
[ 5322.363234]  [<ffffffffc0c11531>] mydev_hooksig+0x61/0x80 [mydev]
[ 5322.363240]  [<ffffffffc082ecdb>] dahdi_rbs_sethook+0x9b/0x220 [dahdi]
[ 5322.363244]  [<ffffffffc0833566>] _dahdi_transmit+0x4c6/0x5b0 [dahdi]
[ 5322.363248]  [<ffffffffc08368f5>] ? _dahdi_receive+0x235/0x3a0 [dahdi]
[ 5322.363250]  [<ffffffffc0c10436>] ? queue_write+0x66/0xd0 [mydev]
[ 5322.363252]  [<ffffffffc0c10963>] mydev_tick+0x1e3/0x2b0 [mydev]
[ 5322.363256]  [<ffffffffc08365fe>] _process_masterspan+0x5be/0x680 [dahdi]
[ 5322.363259]  [<ffffffffc0836a1c>] _dahdi_receive+0x35c/0x3a0 [dahdi]
[ 5322.363263]  [<ffffffffc07b28b7>] g4_interrupt+0x3b7/0xc7b [opvxg4xx]
[ 5322.363266]  [<ffffffffc08332da>] ? _dahdi_transmit+0x23a/0x5b0 [dahdi]
[ 5322.363270]  [<ffffffffc08368f5>] ? _dahdi_receive+0x235/0x3a0 [dahdi]
[ 5322.363274]  [<ffffffffae141284>] __handle_irq_event_percpu+0x44/0x1c0
[ 5322.363276]  [<ffffffffae141432>] handle_irq_event_percpu+0x32/0x80
[ 5322.363277]  [<ffffffffae1414bc>] handle_irq_event+0x3c/0x60
[ 5322.363281]  [<ffffffffae144ab9>] handle_fasteoi_irq+0x59/0x110
[ 5322.363285]  [<ffffffffae02d504>] handle_irq+0xe4/0x1a0
[ 5322.363290]  [<ffffffffae1029fc>] ? tick_check_idle+0x8c/0xd0

The kernel driver structure is very simple. It register dahdi device with list of calbacks:

static const struct dahdi_span_ops mydev_span_ops = {
   .owner = THIS_MODULE,
   .hooksig = mydev_hooksig,
   .spanconfig = mydev_spanconfig,
   .chanconfig = mydev_chanconfig,
   .startup = mydev_startup,
   .shutdown = mydev_shutdown,
   .open = mydev_open,
   .close = mydev_close,
   .ioctl = mydev_ioctl,
   .sync_tick = mydev_tick,
};

The hooksig operation is called from dahdi module on handoff/hangon and it just send netlink message to userspace daemon:

static int mydev_hooksig(struct dahdi_chan *chan, enum dahdi_txsig txsig)
{
   struct mydevp *wc = NULL;
   int reg=0;

   if (chan == NULL) return 0;
   wc = chan->pvt;

   switch(txsig) {
   case DAHDI_TXSIG_START:
   case DAHDI_TXSIG_OFFHOOK:
      nl_send_cmd(chan->chanpos, wc->serial, "off", OP_OFFHOOK);
      break;

   case DAHDI_TXSIG_ONHOOK:
      nl_send_cmd(chan->chanpos, wc->serial, "on", OP_ONHOOK);
      break;

   default:
      printk(KERN_NOTICE "dahdi_mydev: Can't set tx state to unknown %d\n", txsig);
   }

   printk(KERN_DEBUG "dahdi_mydev: Setting hook state to %d (%02x)\n", txsig, reg);
   return 0;
}

The dahdi_rbs_sethook()->mydev_hooksig() function called at several places inside dahdi module and always wrapped with spin_lock_irqsave()/spin_unlock_irqrestore(), but I don't understand why problem is disappeared when only one card connected.

So I just can't figure out how to send netlink message correctly from this callback. Is there any non-sleeping variation of netlink_broadcast ?

add1: Tried to spin_unlock send spin_lock - didn't help.

add2: This is the code of my nl_send_cmd() function

int nl_send_cmd(int chan, char *serial, char *dial, int op) {
   int rc = 0;
   struct sk_buff *skb;
   void *msg_head;

   pr_debug("dahdi_mydev: trying to sent dial string to userspace\n");
   skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
   if (!skb) {
      pr_err("dahdi_mydev: genlmsg_new() failed.\n");
      return -ENOMEM;
   }

   msg_head = genlmsg_put(skb, 0, 0, &span_gnl_family, 0, SPAN_DIAL_CMD);
   if (!msg_head) {
      pr_err("dahdi_mydev: genlmsg_put() failed.\n");
      kfree_skb(skb);
      return -ENOMEM;
   }

   rc = nla_put_string(skb, ATTR1_STRING, serial);
   if (rc) {
      pr_err("dahdi_mydev: nla_put_string() failed for serial: %d\n", rc);
      kfree_skb(skb);
      return -ENOMEM;
   }

   rc = nla_put_string(skb, ATTR4_STRING, dial);
   if (rc) {
      pr_err("dahdi_mydev: nla_put_string() failed for dial: %d\n", rc);
      kfree_skb(skb);
      return -ENOMEM;
   }

   rc = nla_put_u32(skb, ATTR2_SINT32, chan);
   if (rc) {
      pr_err("dahdi_mydev: nla_put_sint32() failed for dial: %d\n", rc);
      kfree_skb(skb);
      return -ENOMEM;
   }

   rc = nla_put_u32(skb, ATTR5_SINT32, op);
   if (rc) {
      pr_err("dahdi_mydev: nla_put_sint32() failed for dial: %d\n", rc);
      kfree_skb(skb);
      return -ENOMEM;
   }

   genlmsg_end(skb, msg_head);
   rc = genlmsg_multicast(&span_gnl_family, skb, 0, 0, GFP_KERNEL);
   if (rc) {
      pr_info("dahdi_mydev: Dial message didn't sent - no listeners ?\n");
      return -ENOTCONN;
   }

   pr_debug("dahdi_mydev: NL msg for %s sent with '%s' op %d\n", serial, dial, op);
   return 0;
}
Alexandr Shutko
  • 1,857
  • 2
  • 20
  • 27
  • In your answer you describe fixing the code which you don't show us. Please, add corresponded code to the question post. Or simply delete the question, as resolving "BUG: scheduling while atomic" with known backtrace is just a common debug procedure. – Tsyvarev Jul 22 '19 at 09:23
  • Added code. The place of bug was known from trace. The question was how to send netlink message inside atomic function. – Alexandr Shutko Jul 23 '19 at 00:08

1 Answers1

1

Ok. I spend some time and found an answer. The solution is to replace GFP_KERNEL by GFP_ATOMIC in genlmsg_new() and genlmsg_multicast() calls because resulted netlink_broadcast() function have this inside:

if (info.delivered) {
    if (info.congested && gfpflags_allow_blocking(allocation))
        yield();
    return 0;
}

I think in single usb card mode congested flag was false and there was no sleeping at all. After connecting another cards there was alot of netlink messages -> congested flag rised + allocation flags = GFP_KERNEL -> sleep inside locked section.

Alexandr Shutko
  • 1,857
  • 2
  • 20
  • 27