vxlan: Factor out vxlan send api.

[~andy/linux] / drivers / net / vxlan.c
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c

index a5ba8dd7e6bea33d060e9d5306b9eab6bb2095da..f3496e9953025d7d358b0dd62d8cc695b8350aeb 100644 (file)
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -41,6 +41,7 @@
  #include <net/inet_ecn.h>
  #include <net/net_namespace.h>
  #include <net/netns/generic.h>
+#include <net/vxlan.h>
  
  #define VXLAN_VERSION  "0.1"
  
@@ -57,6 +58,7 @@
  #define VXLAN_VID_MASK (VXLAN_N_VID - 1)
  /* IP header + UDP + VXLAN + Ethernet header */
  #define VXLAN_HEADROOM (20 + 8 + 8 + 14)
+#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
  
  #define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */
  
@@ -82,16 +84,6 @@ static int vxlan_net_id;
  
  static const u8 all_zeros_mac[ETH_ALEN];
  
-/* per UDP socket information */
-struct vxlan_sock {
-       struct hlist_node hlist;
-       struct rcu_head   rcu;
-       struct work_struct del_work;
-       atomic_t          refcnt;
-       struct socket     *sock;
-       struct hlist_head vni_list[VNI_HASH_SIZE];
-};
-
  /* per-network namespace private data for this module */
  struct vxlan_net {
         struct list_head  vxlan_list;
@@ -136,7 +128,8 @@ struct vxlan_dev {
         u32               flags;        /* VXLAN_F_* below */
  
         struct work_struct sock_work;
-       struct work_struct igmp_work;
+       struct work_struct igmp_join;
+       struct work_struct igmp_leave;
  
         unsigned long     age_interval;
         struct timer_list age_timer;
@@ -176,13 +169,18 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port)
  /* First remote destination for a forwarding entry.
   * Guaranteed to be non-NULL because remotes are never deleted.
   */
-static inline struct vxlan_rdst *first_remote(struct vxlan_fdb *fdb)
+static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
  {
-       return list_first_or_null_rcu(&fdb->remotes, struct vxlan_rdst, list);
+       return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
+}
+
+static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
+{
+       return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
  }
  
  /* Find VXLAN socket based on network namespace and UDP port */
-static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port)
+static struct vxlan_sock *vxlan_find_sock(struct net *net, __be16 port)
  {
         struct vxlan_sock *vs;
  
@@ -193,16 +191,10 @@ static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port)
         return NULL;
  }
  
-/* Look up VNI in a per net namespace table */
-static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port)
+static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id)
  {
-       struct vxlan_sock *vs;
         struct vxlan_dev *vxlan;
  
-       vs = vxlan_find_port(net, port);
-       if (!vs)
-               return NULL;
-
         hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) {
                 if (vxlan->default_dst.remote_vni == id)
                         return vxlan;
@@ -211,6 +203,18 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port)
         return NULL;
  }
  
+/* Look up VNI in a per net namespace table */
+static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port)
+{
+       struct vxlan_sock *vs;
+
+       vs = vxlan_find_sock(net, port);
+       if (!vs)
+               return NULL;
+
+       return vxlan_vs_find_vni(vs, id);
+}
+
  /* Fill in neighbour message in skbuff. */
  static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
                           const struct vxlan_fdb *fdb,
@@ -296,7 +300,8 @@ static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
         if (skb == NULL)
                 goto errout;
  
-       err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, first_remote(fdb));
+       err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0,
+                            first_remote_rtnl(fdb));
         if (err < 0) {
                 /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
                 WARN_ON(err == -EMSGSIZE);
@@ -407,6 +412,26 @@ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
         return NULL;
  }
  
+/* Replace destination of unicast mac */
+static int vxlan_fdb_replace(struct vxlan_fdb *f,
+                           __be32 ip, __be16 port, __u32 vni, __u32 ifindex)
+{
+       struct vxlan_rdst *rd;
+
+       rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
+       if (rd)
+               return 0;
+
+       rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
+       if (!rd)
+               return 0;
+       rd->remote_ip = ip;
+       rd->remote_port = port;
+       rd->remote_vni = vni;
+       rd->remote_ifindex = ifindex;
+       return 1;
+}
+
  /* Add/update destinations for multicast */
  static int vxlan_fdb_append(struct vxlan_fdb *f,
                             __be32 ip, __be16 port, __u32 vni, __u32 ifindex)
@@ -457,6 +482,19 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan,
                         f->updated = jiffies;
                         notify = 1;
                 }
+               if ((flags & NLM_F_REPLACE)) {
+                       /* Only change unicasts */
+                       if (!(is_multicast_ether_addr(f->eth_addr) ||
+                            is_zero_ether_addr(f->eth_addr))) {
+                               int rc = vxlan_fdb_replace(f, ip, port, vni,
+                                                          ifindex);
+
+                               if (rc < 0)
+                                       return rc;
+                               notify |= rc;
+                       } else
+                               return -EOPNOTSUPP;
+               }
                 if ((flags & NLM_F_APPEND) &&
                     (is_multicast_ether_addr(f->eth_addr) ||
                      is_zero_ether_addr(f->eth_addr))) {
@@ -473,6 +511,11 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan,
                 if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax)
                         return -ENOSPC;
  
+               /* Disallow replace to add a multicast entry */
+               if ((flags & NLM_F_REPLACE) &&
+                   (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
+                       return -EOPNOTSUPP;
+
                 netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
                 f = kmalloc(sizeof(*f), GFP_ATOMIC);
                 if (!f)
@@ -701,7 +744,7 @@ static bool vxlan_snoop(struct net_device *dev,
  
         f = vxlan_find_mac(vxlan, src_mac);
         if (likely(f)) {
-               struct vxlan_rdst *rdst = first_remote(f);
+               struct vxlan_rdst *rdst = first_remote_rcu(f);
  
                 if (likely(rdst->remote_ip == src_ip))
                         return false;
@@ -736,7 +779,6 @@ static bool vxlan_snoop(struct net_device *dev,
         return false;
  }
  
-
  /* See if multicast group is already in use by other ID */
  static bool vxlan_group_used(struct vxlan_net *vn, __be32 remote_ip)
  {
@@ -758,8 +800,10 @@ static void vxlan_sock_hold(struct vxlan_sock *vs)
         atomic_inc(&vs->refcnt);
  }
  
-static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs)
+void vxlan_sock_release(struct vxlan_sock *vs)
  {
+       struct vxlan_net *vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
+
         if (!atomic_dec_and_test(&vs->refcnt))
                 return;
  
@@ -769,14 +813,15 @@ static void vxlan_sock_release(struct vxlan_net *vn, struct vxlan_sock *vs)
  
         queue_work(vxlan_wq, &vs->del_work);
  }
+EXPORT_SYMBOL_GPL(vxlan_sock_release);
  
-/* Callback to update multicast group membership.
- * Scheduled when vxlan goes up/down.
+/* Callback to update multicast group membership when first VNI on
+ * multicast asddress is brought up
+ * Done as workqueue because ip_mc_join_group acquires RTNL.
   */
-static void vxlan_igmp_work(struct work_struct *work)
+static void vxlan_igmp_join(struct work_struct *work)
  {
-       struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_work);
-       struct vxlan_net *vn = net_generic(dev_net(vxlan->dev), vxlan_net_id);
+       struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join);
         struct vxlan_sock *vs = vxlan->vn_sock;
         struct sock *sk = vs->sock->sk;
         struct ip_mreqn mreq = {
@@ -785,36 +830,45 @@ static void vxlan_igmp_work(struct work_struct *work)
         };
  
         lock_sock(sk);
-       if (vxlan_group_used(vn, vxlan->default_dst.remote_ip))
-               ip_mc_join_group(sk, &mreq);
-       else
-               ip_mc_leave_group(sk, &mreq);
+       ip_mc_join_group(sk, &mreq);
         release_sock(sk);
  
-       vxlan_sock_release(vn, vs);
+       vxlan_sock_release(vs);
+       dev_put(vxlan->dev);
+}
+
+/* Inverse of vxlan_igmp_join when last VNI is brought down */
+static void vxlan_igmp_leave(struct work_struct *work)
+{
+       struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave);
+       struct vxlan_sock *vs = vxlan->vn_sock;
+       struct sock *sk = vs->sock->sk;
+       struct ip_mreqn mreq = {
+               .imr_multiaddr.s_addr   = vxlan->default_dst.remote_ip,
+               .imr_ifindex            = vxlan->default_dst.remote_ifindex,
+       };
+
+       lock_sock(sk);
+       ip_mc_leave_group(sk, &mreq);
+       release_sock(sk);
+
+       vxlan_sock_release(vs);
         dev_put(vxlan->dev);
  }
  
  /* Callback from net/ipv4/udp.c to receive packets */
  static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
  {
-       struct iphdr *oip;
+       struct vxlan_sock *vs;
         struct vxlanhdr *vxh;
-       struct vxlan_dev *vxlan;
-       struct pcpu_tstats *stats;
         __be16 port;
-       __u32 vni;
-       int err;
-
-       /* pop off outer UDP header */
-       __skb_pull(skb, sizeof(struct udphdr));
  
         /* Need Vxlan and inner Ethernet header to be present */
-       if (!pskb_may_pull(skb, sizeof(struct vxlanhdr)))
+       if (!pskb_may_pull(skb, VXLAN_HLEN))
                 goto error;
  
-       /* Drop packets with reserved bits set */
-       vxh = (struct vxlanhdr *) skb->data;
+       /* Return packets with reserved bits set */
+       vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
         if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
             (vxh->vx_vni & htonl(0xff))) {
                 netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
@@ -822,28 +876,44 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                 goto error;
         }
  
-       __skb_pull(skb, sizeof(struct vxlanhdr));
+       if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
+               goto drop;
  
-       /* Is this VNI defined? */
-       vni = ntohl(vxh->vx_vni) >> 8;
         port = inet_sk(sk)->inet_sport;
-       vxlan = vxlan_find_vni(sock_net(sk), vni, port);
-       if (!vxlan) {
-               netdev_dbg(skb->dev, "unknown vni %d port %u\n",
-                          vni, ntohs(port));
+
+       vs = vxlan_find_sock(sock_net(sk), port);
+       if (!vs)
                 goto drop;
-       }
  
-       if (!pskb_may_pull(skb, ETH_HLEN)) {
-               vxlan->dev->stats.rx_length_errors++;
-               vxlan->dev->stats.rx_errors++;
+       vs->rcv(vs, skb, vxh->vx_vni);
+       return 0;
+
+drop:
+       /* Consume bad packet */
+       kfree_skb(skb);
+       return 0;
+
+error:
+       /* Return non vxlan pkt */
+       return 1;
+}
+
+static void vxlan_rcv(struct vxlan_sock *vs,
+                     struct sk_buff *skb, __be32 vx_vni)
+{
+       struct iphdr *oip;
+       struct vxlan_dev *vxlan;
+       struct pcpu_tstats *stats;
+       __u32 vni;
+       int err;
+
+       vni = ntohl(vx_vni) >> 8;
+       /* Is this VNI defined? */
+       vxlan = vxlan_vs_find_vni(vs, vni);
+       if (!vxlan)
                 goto drop;
-       }
  
         skb_reset_mac_header(skb);
-
-       /* Re-examine inner Ethernet packet */
-       oip = ip_hdr(skb);
         skb->protocol = eth_type_trans(skb, vxlan->dev);
  
         /* Ignore packet loops (and multicast echo) */
@@ -851,11 +921,12 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
                                vxlan->dev->dev_addr) == 0)
                 goto drop;
  
+       /* Re-examine inner Ethernet packet */
+       oip = ip_hdr(skb);
         if ((vxlan->flags & VXLAN_F_LEARN) &&
             vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source))
                 goto drop;
  
-       __skb_tunnel_rx(skb, vxlan->dev);
         skb_reset_network_header(skb);
  
         /* If the NIC driver gave us an encapsulated packet with
@@ -889,16 +960,10 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
  
         netif_rx(skb);
  
-       return 0;
-error:
-       /* Put UDP header back */
-       __skb_push(skb, sizeof(struct udphdr));
-
-       return 1;
+       return;
  drop:
         /* Consume bad packet */
         kfree_skb(skb);
-       return 0;
  }
  
  static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
@@ -949,7 +1014,7 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
                 }
  
                 f = vxlan_find_mac(vxlan, n->ha);
-               if (f && first_remote(f)->remote_ip == htonl(INADDR_ANY)) {
+               if (f && first_remote_rcu(f)->remote_ip == htonl(INADDR_ANY)) {
                         /* bridge-local neighbor */
                         neigh_release(n);
                         goto out;
@@ -1017,11 +1082,8 @@ static void vxlan_sock_put(struct sk_buff *skb)
  }
  
  /* On transmit, associate with the tunnel socket */
-static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb)
+static void vxlan_set_owner(struct sock *sk, struct sk_buff *skb)
  {
-       struct vxlan_dev *vxlan = netdev_priv(dev);
-       struct sock *sk = vxlan->vn_sock->sock->sk;
-
         skb_orphan(skb);
         sock_hold(sk);
         skb->sk = sk;
@@ -1033,9 +1095,9 @@ static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb)
   *     better and maybe available from hardware
   *   secondary choice is to use jhash on the Ethernet header
   */
-static __be16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb)
+__be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb)
  {
-       unsigned int range = (vxlan->port_max - vxlan->port_min) + 1;
+       unsigned int range = (port_max - port_min) + 1;
         u32 hash;
  
         hash = skb_get_rxhash(skb);
@@ -1043,8 +1105,9 @@ static __be16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb)
                 hash = jhash(skb->data, 2 * ETH_ALEN,
                              (__force u32) skb->protocol);
  
-       return htons((((u64) hash * range) >> 32) + vxlan->port_min);
+       return htons((((u64) hash * range) >> 32) + port_min);
  }
+EXPORT_SYMBOL_GPL(vxlan_src_port);
  
  static int handle_offloads(struct sk_buff *skb)
  {
@@ -1060,6 +1123,45 @@ static int handle_offloads(struct sk_buff *skb)
         return 0;
  }
  
+int vxlan_xmit_skb(struct net *net, struct vxlan_sock *vs,
+                  struct rtable *rt, struct sk_buff *skb,
+                  __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
+                  __be16 src_port, __be16 dst_port, __be32 vni)
+{
+       struct vxlanhdr *vxh;
+       struct udphdr *uh;
+       int err;
+
+       if (!skb->encapsulation) {
+               skb_reset_inner_headers(skb);
+               skb->encapsulation = 1;
+       }
+
+       vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+       vxh->vx_flags = htonl(VXLAN_FLAGS);
+       vxh->vx_vni = vni;
+
+       __skb_push(skb, sizeof(*uh));
+       skb_reset_transport_header(skb);
+       uh = udp_hdr(skb);
+
+       uh->dest = dst_port;
+       uh->source = src_port;
+
+       uh->len = htons(skb->len);
+       uh->check = 0;
+
+       vxlan_set_owner(vs->sock->sk, skb);
+
+       err = handle_offloads(skb);
+       if (err)
+               return err;
+
+       return iptunnel_xmit(net, rt, skb, src, dst,
+                       IPPROTO_UDP, tos, ttl, df);
+}
+EXPORT_SYMBOL_GPL(vxlan_xmit_skb);
+
  /* Bypass encapsulation if the destination is local */
  static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
                                struct vxlan_dev *dst_vxlan)
@@ -1097,8 +1199,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
         struct vxlan_dev *vxlan = netdev_priv(dev);
         struct rtable *rt;
         const struct iphdr *old_iph;
-       struct vxlanhdr *vxh;
-       struct udphdr *uh;
         struct flowi4 fl4;
         __be32 dst;
         __be16 src_port, dst_port;
@@ -1120,11 +1220,6 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                 goto drop;
         }
  
-       if (!skb->encapsulation) {
-               skb_reset_inner_headers(skb);
-               skb->encapsulation = 1;
-       }
-
         /* Need space for new headers (invalidates iph ptr) */
         if (skb_cow_head(skb, VXLAN_HEADROOM))
                 goto drop;
@@ -1139,7 +1234,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
         if (tos == 1)
                 tos = ip_tunnel_get_dsfield(old_iph, skb);
  
-       src_port = vxlan_src_port(vxlan, skb);
+       src_port = vxlan_src_port(vxlan->port_min, vxlan->port_max, skb);
  
         memset(&fl4, 0, sizeof(fl4));
         fl4.flowi4_oif = rdst->remote_ifindex;
@@ -1156,9 +1251,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
  
         if (rt->dst.dev == dev) {
                 netdev_dbg(dev, "circular route to %pI4\n", &dst);
-               ip_rt_put(rt);
                 dev->stats.collisions++;
-               goto tx_error;
+               goto rt_tx_error;
         }
  
         /* Bypass encapsulation if the destination is local */
@@ -1173,30 +1267,16 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                 vxlan_encap_bypass(skb, vxlan, dst_vxlan);
                 return;
         }
-       vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
-       vxh->vx_flags = htonl(VXLAN_FLAGS);
-       vxh->vx_vni = htonl(vni << 8);
-
-       __skb_push(skb, sizeof(*uh));
-       skb_reset_transport_header(skb);
-       uh = udp_hdr(skb);
-
-       uh->dest = dst_port;
-       uh->source = src_port;
-
-       uh->len = htons(skb->len);
-       uh->check = 0;
-
-       vxlan_set_owner(dev, skb);
-
-       if (handle_offloads(skb))
-               goto drop;
  
         tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
         ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
  
-       err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, dst,
-                           IPPROTO_UDP, tos, ttl, df);
+       err = vxlan_xmit_skb(dev_net(dev), vxlan->vn_sock, rt, skb,
+                            fl4.saddr, dst, tos, ttl, df,
+                            src_port, dst_port, htonl(vni << 8));
+
+       if (err < 0)
+               goto rt_tx_error;
         iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
  
         return;
@@ -1205,6 +1285,8 @@ drop:
         dev->stats.tx_dropped++;
         goto tx_free;
  
+rt_tx_error:
+       ip_rt_put(rt);
  tx_error:
         dev->stats.tx_errors++;
  tx_free:
@@ -1303,25 +1385,31 @@ static void vxlan_cleanup(unsigned long arg)
         mod_timer(&vxlan->age_timer, next_timer);
  }
  
+static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan)
+{
+       __u32 vni = vxlan->default_dst.remote_vni;
+
+       vxlan->vn_sock = vs;
+       hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
+}
+
  /* Setup stats when device is created */
  static int vxlan_init(struct net_device *dev)
  {
         struct vxlan_dev *vxlan = netdev_priv(dev);
         struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
         struct vxlan_sock *vs;
-       __u32 vni = vxlan->default_dst.remote_vni;
  
         dev->tstats = alloc_percpu(struct pcpu_tstats);
         if (!dev->tstats)
                 return -ENOMEM;
  
         spin_lock(&vn->sock_lock);
-       vs = vxlan_find_port(dev_net(dev), vxlan->dst_port);
+       vs = vxlan_find_sock(dev_net(dev), vxlan->dst_port);
         if (vs) {
                 /* If we have a socket with same port already, reuse it */
                 atomic_inc(&vs->refcnt);
-               vxlan->vn_sock = vs;
-               hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
+               vxlan_vs_add_dev(vs, vxlan);
         } else {
                 /* otherwise make new socket outside of RTNL */
                 dev_hold(dev);
@@ -1346,19 +1434,19 @@ static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan)
  static void vxlan_uninit(struct net_device *dev)
  {
         struct vxlan_dev *vxlan = netdev_priv(dev);
-       struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
         struct vxlan_sock *vs = vxlan->vn_sock;
  
         vxlan_fdb_delete_default(vxlan);
  
         if (vs)
-               vxlan_sock_release(vn, vs);
+               vxlan_sock_release(vs);
         free_percpu(dev->tstats);
  }
  
  /* Start ageing timer and join group when device is brought up */
  static int vxlan_open(struct net_device *dev)
  {
+       struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
         struct vxlan_dev *vxlan = netdev_priv(dev);
         struct vxlan_sock *vs = vxlan->vn_sock;
  
@@ -1366,10 +1454,11 @@ static int vxlan_open(struct net_device *dev)
         if (!vs)
                 return -ENOTCONN;
  
-       if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
+       if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) &&
+           vxlan_group_used(vn, vxlan->default_dst.remote_ip)) {
                 vxlan_sock_hold(vs);
                 dev_hold(dev);
-               queue_work(vxlan_wq, &vxlan->igmp_work);
+               queue_work(vxlan_wq, &vxlan->igmp_join);
         }
  
         if (vxlan->age_interval)
@@ -1400,13 +1489,15 @@ static void vxlan_flush(struct vxlan_dev *vxlan)
  /* Cleanup timer and forwarding table on shutdown */
  static int vxlan_stop(struct net_device *dev)
  {
+       struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
         struct vxlan_dev *vxlan = netdev_priv(dev);
         struct vxlan_sock *vs = vxlan->vn_sock;
  
-       if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
+       if (vs && IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)) &&
+           ! vxlan_group_used(vn, vxlan->default_dst.remote_ip)) {
                 vxlan_sock_hold(vs);
                 dev_hold(dev);
-               queue_work(vxlan_wq, &vxlan->igmp_work);
+               queue_work(vxlan_wq, &vxlan->igmp_leave);
         }
  
         del_timer_sync(&vxlan->age_timer);
@@ -1471,7 +1562,8 @@ static void vxlan_setup(struct net_device *dev)
  
         INIT_LIST_HEAD(&vxlan->next);
         spin_lock_init(&vxlan->hash_lock);
-       INIT_WORK(&vxlan->igmp_work, vxlan_igmp_work);
+       INIT_WORK(&vxlan->igmp_join, vxlan_igmp_join);
+       INIT_WORK(&vxlan->igmp_leave, vxlan_igmp_leave);
         INIT_WORK(&vxlan->sock_work, vxlan_sock_work);
  
         init_timer_deferrable(&vxlan->age_timer);
@@ -1564,8 +1656,10 @@ static void vxlan_del_work(struct work_struct *work)
         kfree_rcu(vs, rcu);
  }
  
-static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port)
+static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
+                                             vxlan_rcv_t *rcv, void *data)
  {
+       struct vxlan_net *vn = net_generic(net, vxlan_net_id);
         struct vxlan_sock *vs;
         struct sock *sk;
         struct sockaddr_in vxlan_addr = {
@@ -1577,8 +1671,10 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port)
         unsigned int h;
  
         vs = kmalloc(sizeof(*vs), GFP_KERNEL);
-       if (!vs)
+       if (!vs) {
+               pr_debug("memory alocation failure\n");
                 return ERR_PTR(-ENOMEM);
+       }
  
         for (h = 0; h < VNI_HASH_SIZE; ++h)
                 INIT_HLIST_HEAD(&vs->vni_list[h]);
@@ -1606,57 +1702,70 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port)
                 kfree(vs);
                 return ERR_PTR(rc);
         }
+       atomic_set(&vs->refcnt, 1);
+       vs->rcv = rcv;
+       vs->data = data;
  
         /* Disable multicast loopback */
         inet_sk(sk)->mc_loop = 0;
+       spin_lock(&vn->sock_lock);
+       hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
+       spin_unlock(&vn->sock_lock);
  
         /* Mark socket as an encapsulation socket. */
         udp_sk(sk)->encap_type = 1;
         udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
         udp_encap_enable();
-       atomic_set(&vs->refcnt, 1);
+       return vs;
+}
+
+struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
+                                 vxlan_rcv_t *rcv, void *data,
+                                 bool no_share)
+{
+       struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+       struct vxlan_sock *vs;
+
+       vs = vxlan_socket_create(net, port, rcv, data);
+       if (!IS_ERR(vs))
+               return vs;
+
+       if (no_share)   /* Return error if sharing is not allowed. */
+               return vs;
+
+       spin_lock(&vn->sock_lock);
+       vs = vxlan_find_sock(net, port);
+       if (vs) {
+               if (vs->rcv == rcv)
+                       atomic_inc(&vs->refcnt);
+               else
+                       vs = ERR_PTR(-EBUSY);
+       }
+       spin_unlock(&vn->sock_lock);
+
+       if (!vs)
+               vs = ERR_PTR(-EINVAL);
  
         return vs;
  }
+EXPORT_SYMBOL_GPL(vxlan_sock_add);
  
  /* Scheduled at device creation to bind to a socket */
  static void vxlan_sock_work(struct work_struct *work)
  {
-       struct vxlan_dev *vxlan
-               = container_of(work, struct vxlan_dev, sock_work);
-       struct net_device *dev = vxlan->dev;
-       struct net *net = dev_net(dev);
-       __u32 vni = vxlan->default_dst.remote_vni;
-       __be16 port = vxlan->dst_port;
+       struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, sock_work);
+       struct net *net = dev_net(vxlan->dev);
         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
-       struct vxlan_sock *nvs, *ovs;
-
-       nvs = vxlan_socket_create(net, port);
-       if (IS_ERR(nvs)) {
-               netdev_err(vxlan->dev, "Can not create UDP socket, %ld\n",
-                          PTR_ERR(nvs));
-               goto out;
-       }
+       __be16 port = vxlan->dst_port;
+       struct vxlan_sock *nvs;
  
+       nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false);
         spin_lock(&vn->sock_lock);
-       /* Look again to see if can reuse socket */
-       ovs = vxlan_find_port(net, port);
-       if (ovs) {
-               atomic_inc(&ovs->refcnt);
-               vxlan->vn_sock = ovs;
-               hlist_add_head_rcu(&vxlan->hlist, vni_head(ovs, vni));
-               spin_unlock(&vn->sock_lock);
-
-               sk_release_kernel(nvs->sock->sk);
-               kfree(nvs);
-       } else {
-               vxlan->vn_sock = nvs;
-               hlist_add_head_rcu(&nvs->hlist, vs_head(net, port));
-               hlist_add_head_rcu(&vxlan->hlist, vni_head(nvs, vni));
-               spin_unlock(&vn->sock_lock);
-       }
-out:
-       dev_put(dev);
+       if (!IS_ERR(nvs))
+               vxlan_vs_add_dev(nvs, vxlan);
+       spin_unlock(&vn->sock_lock);
+
+       dev_put(vxlan->dev);
  }
  
  static int vxlan_newlink(struct net *net, struct net_device *dev,
@@ -1770,10 +1879,9 @@ static void vxlan_dellink(struct net_device *dev, struct list_head *head)
         struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
         struct vxlan_dev *vxlan = netdev_priv(dev);
  
-       flush_workqueue(vxlan_wq);
-
         spin_lock(&vn->sock_lock);
-       hlist_del_rcu(&vxlan->hlist);
+       if (!hlist_unhashed(&vxlan->hlist))
+               hlist_del_rcu(&vxlan->hlist);
         spin_unlock(&vn->sock_lock);
  
         list_del(&vxlan->next);
@@ -1878,10 +1986,12 @@ static __net_exit void vxlan_exit_net(struct net *net)
  {
         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
         struct vxlan_dev *vxlan;
+       LIST_HEAD(list);
  
         rtnl_lock();
         list_for_each_entry(vxlan, &vn->vxlan_list, next)
-               dev_close(vxlan->dev);
+               unregister_netdevice_queue(vxlan->dev, &list);
+       unregister_netdevice_many(&list);
         rtnl_unlock();
  }