Pileus Git - ~andy/linux/blobdiff - drivers/net/tun.c
tuntap: multiqueue support
[~andy/linux] / drivers / net / tun.c
index bdbb526eca7b201fbd9927f14883b179ee03143a..2762c55aeb6692d5adb746b92beb0c160c19f37c 100644 (file)
@@ -109,6 +109,12 @@ struct tap_filter {
        unsigned char   addr[FLT_EXACT_COUNT][ETH_ALEN];
 };
 
+/* 1024 is probably a high enough limit: modern hypervisors seem to support on
+ * the order of 100-200 CPUs so this leaves us some breathing space if we want
+ * to match a queue per guest CPU.
+ */
+#define MAX_TAP_QUEUES 1024
+
 /* A tun_file connects an open character device to a tuntap netdevice. It
  * also contains all socket related strctures (except sock_fprog and tap_filter)
  * to serve as one transmit queue for tuntap device. The sock_fprog and
@@ -129,6 +135,7 @@ struct tun_file {
        struct fasync_struct *fasync;
        /* only used for fasnyc */
        unsigned int flags;
+       u16 queue_index;
 };
 
 /* Since the socket were moved to tun_file, to preserve the behavior of persist
@@ -136,7 +143,8 @@ struct tun_file {
  * file were attached to a persist device.
  */
 struct tun_struct {
-       struct tun_file __rcu   *tfile;
+       struct tun_file __rcu   *tfiles[MAX_TAP_QUEUES];
+       unsigned int            numqueues;
        unsigned int            flags;
        kuid_t                  owner;
        kgid_t                  group;
@@ -157,56 +165,157 @@ struct tun_struct {
 #endif
 };
 
+/* We try to identify a flow through its rxhash first. The reason that
+ * we do not check the rxq no. is because some cards (e.g. 82599) choose
+ * the rxq based on the txq where the last packet of the flow comes. As
+ * the userspace application moves between processors, we may get a
+ * different rxq no. here. If we could not get the rxhash, then we would
+ * hope the rxq no. may help here.
+ */
+static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+       struct tun_struct *tun = netdev_priv(dev);
+       u32 txq = 0;
+       u32 numqueues = 0;
+
+       rcu_read_lock();
+       numqueues = tun->numqueues;
+
+       txq = skb_get_rxhash(skb);
+       if (txq) {
+               /* use multiply and shift instead of expensive divide */
+               txq = ((u64)txq * numqueues) >> 32;
+       } else if (likely(skb_rx_queue_recorded(skb))) {
+               txq = skb_get_rx_queue(skb);
+               while (unlikely(txq >= numqueues))
+                       txq -= numqueues;
+       }
+
+       rcu_read_unlock();
+       return txq;
+}
+
+static void tun_set_real_num_queues(struct tun_struct *tun)
+{
+       netif_set_real_num_tx_queues(tun->dev, tun->numqueues);
+       netif_set_real_num_rx_queues(tun->dev, tun->numqueues);
+}
+
+static void __tun_detach(struct tun_file *tfile, bool clean)
+{
+       struct tun_file *ntfile;
+       struct tun_struct *tun;
+       struct net_device *dev;
+
+       tun = rcu_dereference_protected(tfile->tun,
+                                       lockdep_rtnl_is_held());
+       if (tun) {
+               u16 index = tfile->queue_index;
+               BUG_ON(index >= tun->numqueues);
+               dev = tun->dev;
+
+               rcu_assign_pointer(tun->tfiles[index],
+                                  tun->tfiles[tun->numqueues - 1]);
+               rcu_assign_pointer(tfile->tun, NULL);
+               ntfile = rcu_dereference_protected(tun->tfiles[index],
+                                                  lockdep_rtnl_is_held());
+               ntfile->queue_index = index;
+
+               --tun->numqueues;
+               sock_put(&tfile->sk);
+
+               synchronize_net();
+               /* Drop read queue */
+               skb_queue_purge(&tfile->sk.sk_receive_queue);
+               tun_set_real_num_queues(tun);
+
+               if (tun->numqueues == 0 && !(tun->flags & TUN_PERSIST))
+                       if (dev->reg_state == NETREG_REGISTERED)
+                               unregister_netdevice(dev);
+       }
+
+       if (clean) {
+               BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED,
+                                &tfile->socket.flags));
+               sk_release_kernel(&tfile->sk);
+       }
+}
+
+static void tun_detach(struct tun_file *tfile, bool clean)
+{
+       rtnl_lock();
+       __tun_detach(tfile, clean);
+       rtnl_unlock();
+}
+
+static void tun_detach_all(struct net_device *dev)
+{
+       struct tun_struct *tun = netdev_priv(dev);
+       struct tun_file *tfile;
+       int i, n = tun->numqueues;
+
+       for (i = 0; i < n; i++) {
+               tfile = rcu_dereference_protected(tun->tfiles[i],
+                                                 lockdep_rtnl_is_held());
+               BUG_ON(!tfile);
+               wake_up_all(&tfile->wq.wait);
+               rcu_assign_pointer(tfile->tun, NULL);
+               --tun->numqueues;
+       }
+       BUG_ON(tun->numqueues != 0);
+
+       synchronize_net();
+       for (i = 0; i < n; i++) {
+               tfile = rcu_dereference_protected(tun->tfiles[i],
+                                                 lockdep_rtnl_is_held());
+               /* Drop read queue */
+               skb_queue_purge(&tfile->sk.sk_receive_queue);
+               sock_put(&tfile->sk);
+       }
+}
+
 static int tun_attach(struct tun_struct *tun, struct file *file)
 {
        struct tun_file *tfile = file->private_data;
        int err;
 
-       ASSERT_RTNL();
-
-       netif_tx_lock_bh(tun->dev);
-
        err = -EINVAL;
-       if (tfile->tun)
+       if (rcu_dereference_protected(tfile->tun, lockdep_rtnl_is_held()))
                goto out;
 
        err = -EBUSY;
-       if (tun->tfile)
+       if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1)
+               goto out;
+
+       err = -E2BIG;
+       if (tun->numqueues == MAX_TAP_QUEUES)
                goto out;
 
        err = 0;
 
-       /* Re-attach filter when attaching to a persist device */
+       /* Re-attach the filter to a persistent device */
        if (tun->filter_attached == true) {
                err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
                if (!err)
                        goto out;
        }
+       tfile->queue_index = tun->numqueues;
        rcu_assign_pointer(tfile->tun, tun);
-       tfile->socket.sk->sk_sndbuf = tun->sndbuf;
-       rcu_assign_pointer(tun->tfile, tfile);
-       netif_carrier_on(tun->dev);
+       rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
        sock_hold(&tfile->sk);
+       tun->numqueues++;
 
-out:
-       netif_tx_unlock_bh(tun->dev);
-       return err;
-}
+       tun_set_real_num_queues(tun);
 
-static void __tun_detach(struct tun_struct *tun)
-{
-       struct tun_file *tfile = rcu_dereference_protected(tun->tfile,
-                                                       lockdep_rtnl_is_held());
-       /* Detach from net device */
-       netif_carrier_off(tun->dev);
-       rcu_assign_pointer(tun->tfile, NULL);
-       if (tfile) {
-               rcu_assign_pointer(tfile->tun, NULL);
+       if (tun->numqueues == 1)
+               netif_carrier_on(tun->dev);
 
-               synchronize_net();
-               /* Drop read queue */
-               skb_queue_purge(&tfile->socket.sk->sk_receive_queue);
-       }
+       /* device is allowed to go away first, so no need to hold extra
+        * refcnt.
+        */
+
+out:
+       return err;
 }
 
 static struct tun_struct *__tun_get(struct tun_file *tfile)
@@ -349,30 +458,20 @@ static const struct ethtool_ops tun_ethtool_ops;
 /* Net device detach from fd. */
 static void tun_net_uninit(struct net_device *dev)
 {
-       struct tun_struct *tun = netdev_priv(dev);
-       struct tun_file *tfile = rcu_dereference_protected(tun->tfile,
-                                                       lockdep_rtnl_is_held());
-
-       /* Inform the methods they need to stop using the dev.
-        */
-       if (tfile) {
-               wake_up_all(&tfile->wq.wait);
-               __tun_detach(tun);
-               synchronize_net();
-       }
+       tun_detach_all(dev);
 }
 
 /* Net device open. */
 static int tun_net_open(struct net_device *dev)
 {
-       netif_start_queue(dev);
+       netif_tx_start_all_queues(dev);
        return 0;
 }
 
 /* Net device close. */
 static int tun_net_close(struct net_device *dev)
 {
-       netif_stop_queue(dev);
+       netif_tx_stop_all_queues(dev);
        return 0;
 }
 
@@ -380,16 +479,20 @@ static int tun_net_close(struct net_device *dev)
 static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        struct tun_struct *tun = netdev_priv(dev);
+       int txq = skb->queue_mapping;
        struct tun_file *tfile;
 
        rcu_read_lock();
-       tfile = rcu_dereference(tun->tfile);
+       tfile = rcu_dereference(tun->tfiles[txq]);
+
        /* Drop packet if interface is not attached */
-       if (!tfile)
+       if (txq >= tun->numqueues)
                goto drop;
 
        tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
 
+       BUG_ON(!tfile);
+
        /* Drop if the filter does not like it.
         * This is a noop if the filter is disabled.
         * Filter can be enabled only for the TAP devices. */
@@ -400,12 +503,15 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
            sk_filter(tfile->socket.sk, skb))
                goto drop;
 
+       /* Limit the number of packets queued by dividing the txq length by
+        * the number of queues.
+        */
        if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
-           >= dev->tx_queue_len) {
+                         >= dev->tx_queue_len / tun->numqueues){
                if (!(tun->flags & TUN_ONE_QUEUE)) {
                        /* Normal queueing mode. */
                        /* Packet scheduler handles dropping of further packets. */
-                       netif_stop_queue(dev);
+                       netif_stop_subqueue(dev, txq);
 
                        /* We won't see all dropped packets individually, so overrun
                         * error is more appropriate. */
@@ -494,6 +600,7 @@ static const struct net_device_ops tun_netdev_ops = {
        .ndo_start_xmit         = tun_net_xmit,
        .ndo_change_mtu         = tun_net_change_mtu,
        .ndo_fix_features       = tun_net_fix_features,
+       .ndo_select_queue       = tun_select_queue,
 #ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller    = tun_poll_controller,
 #endif
@@ -509,6 +616,7 @@ static const struct net_device_ops tap_netdev_ops = {
        .ndo_set_rx_mode        = tun_net_mclist,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
+       .ndo_select_queue       = tun_select_queue,
 #ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller    = tun_poll_controller,
 #endif
@@ -550,7 +658,7 @@ static void tun_net_init(struct net_device *dev)
 /* Character device part */
 
 /* Poll */
-static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
+static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
 {
        struct tun_file *tfile = file->private_data;
        struct tun_struct *tun = __tun_get(tfile);
@@ -995,7 +1103,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
                        schedule();
                        continue;
                }
-               netif_wake_queue(tun->dev);
+               netif_wake_subqueue(tun->dev, tfile->queue_index);
 
                ret = tun_put_user(tun, tfile, skb, iv, len);
                kfree_skb(skb);
@@ -1156,6 +1264,9 @@ static int tun_flags(struct tun_struct *tun)
        if (tun->flags & TUN_VNET_HDR)
                flags |= IFF_VNET_HDR;
 
+       if (tun->flags & TUN_TAP_MQ)
+               flags |= IFF_MULTI_QUEUE;
+
        return flags;
 }
 
@@ -1247,8 +1358,9 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
                if (*ifr->ifr_name)
                        name = ifr->ifr_name;
 
-               dev = alloc_netdev(sizeof(struct tun_struct), name,
-                                  tun_setup);
+               dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
+                                      tun_setup,
+                                      MAX_TAP_QUEUES, MAX_TAP_QUEUES);
                if (!dev)
                        return -ENOMEM;
 
@@ -1283,7 +1395,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 
                err = tun_attach(tun, file);
                if (err < 0)
-                       goto failed;
+                       goto err_free_dev;
        }
 
        tun_debug(KERN_INFO, tun, "tun_set_iff\n");
@@ -1303,18 +1415,22 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
        else
                tun->flags &= ~TUN_VNET_HDR;
 
+       if (ifr->ifr_flags & IFF_MULTI_QUEUE)
+               tun->flags |= TUN_TAP_MQ;
+       else
+               tun->flags &= ~TUN_TAP_MQ;
+
        /* Make sure persistent devices do not get stuck in
         * xoff state.
         */
        if (netif_running(tun->dev))
-               netif_wake_queue(tun->dev);
+               netif_tx_wake_all_queues(tun->dev);
 
        strcpy(ifr->ifr_name, tun->dev->name);
        return 0;
 
  err_free_dev:
        free_netdev(dev);
- failed:
        return err;
 }
 
@@ -1369,6 +1485,51 @@ static int set_offload(struct tun_struct *tun, unsigned long arg)
        return 0;
 }
 
+static void tun_detach_filter(struct tun_struct *tun, int n)
+{
+       int i;
+       struct tun_file *tfile;
+
+       for (i = 0; i < n; i++) {
+               tfile = rcu_dereference_protected(tun->tfiles[i],
+                                                 lockdep_rtnl_is_held());
+               sk_detach_filter(tfile->socket.sk);
+       }
+
+       tun->filter_attached = false;
+}
+
+static int tun_attach_filter(struct tun_struct *tun)
+{
+       int i, ret = 0;
+       struct tun_file *tfile;
+
+       for (i = 0; i < tun->numqueues; i++) {
+               tfile = rcu_dereference_protected(tun->tfiles[i],
+                                                 lockdep_rtnl_is_held());
+               ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
+               if (ret) {
+                       tun_detach_filter(tun, i);
+                       return ret;
+               }
+       }
+
+       tun->filter_attached = true;
+       return ret;
+}
+
+static void tun_set_sndbuf(struct tun_struct *tun)
+{
+       struct tun_file *tfile;
+       int i;
+
+       for (i = 0; i < tun->numqueues; i++) {
+               tfile = rcu_dereference_protected(tun->tfiles[i],
+                                               lockdep_rtnl_is_held());
+               tfile->socket.sk->sk_sndbuf = tun->sndbuf;
+       }
+}
+
 static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg, int ifreq_len)
 {
@@ -1397,6 +1558,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                                (unsigned int __user*)argp);
        }
 
+       ret = 0;
        rtnl_lock();
 
        tun = __tun_get(tfile);
@@ -1537,7 +1699,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                        break;
                }
 
-               tun->sndbuf = tfile->socket.sk->sk_sndbuf = sndbuf;
+               tun->sndbuf = sndbuf;
+               tun_set_sndbuf(tun);
                break;
 
        case TUNGETVNETHDRSZ:
@@ -1568,9 +1731,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
                        break;
 
-               ret = sk_attach_filter(&tun->fprog, tfile->socket.sk);
-               if (!ret)
-                       tun->filter_attached = true;
+               ret = tun_attach_filter(tun);
                break;
 
        case TUNDETACHFILTER:
@@ -1578,9 +1739,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                ret = -EINVAL;
                if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
                        break;
-               ret = sk_detach_filter(tfile->socket.sk);
-               if (!ret)
-                       tun->filter_attached = false;
+               ret = 0;
+               tun_detach_filter(tun, tun->numqueues);
                break;
 
        default:
@@ -1685,37 +1845,9 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 static int tun_chr_close(struct inode *inode, struct file *file)
 {
        struct tun_file *tfile = file->private_data;
-       struct tun_struct *tun;
        struct net *net = tfile->net;
 
-       rtnl_lock();
-
-       tun = rcu_dereference_protected(tfile->tun, lockdep_rtnl_is_held());
-       if (tun) {
-               struct net_device *dev = tun->dev;
-
-               tun_debug(KERN_INFO, tun, "tun_chr_close\n");
-
-               __tun_detach(tun);
-
-               synchronize_net();
-
-               /* If desirable, unregister the netdevice. */
-               if (!(tun->flags & TUN_PERSIST)) {
-                       if (dev->reg_state == NETREG_REGISTERED)
-                               unregister_netdevice(dev);
-               }
-
-               /* drop the reference that netdevice holds */
-               sock_put(&tfile->sk);
-       }
-
-       rtnl_unlock();
-
-       /* drop the reference that file holds */
-       BUG_ON(!test_bit(SOCK_EXTERNALLY_ALLOCATED,
-                        &tfile->socket.flags));
-       sk_release_kernel(&tfile->sk);
+       tun_detach(tfile, true);
        put_net(net);
 
        return 0;