ipip: add x-netns support
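
Summary (reconstructed from the diff below, not the original changelog):
the tunnel now remembers the netns it is bound to in tunnel->net and uses
it, instead of dev_net(dev), for route and device lookups, so the tunnel
device can live in a different namespace than its underlay. Packets that
cross the namespace boundary are scrubbed, the per-netns fallback device
is marked NETIF_F_NETNS_LOCAL so it cannot be moved, and
ip_tunnel_delete_net() now takes the caller's rtnl_link_ops so tunnels
whose device was moved to another netns are still cleaned up on netns
teardown.

The core x-netns pattern, as a minimal sketch (the same test appears in
ip_tunnel_rcv() and ip_tunnel_xmit() below; "tunnel" is a struct
ip_tunnel *):

	/* tunnel->net is the namespace the tunnel transmits in (the
	 * "link" netns); dev_net(tunnel->dev) is where the net_device
	 * itself lives.  When they differ, drop namespace-local state
	 * from the skb before handing it over.
	 */
	if (!net_eq(tunnel->net, dev_net(tunnel->dev)))
		skb_scrub_packet(skb);
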
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 7fa8f08fa7ae08220bfdb3a9bd4ab7077a1c5a9e..a4d9126c7b514a7f8e1dead0335023a4afc8e8b5 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -304,6 +304,7 @@ static struct net_device *__ip_tunnel_create(struct net *net,
 
        tunnel = netdev_priv(dev);
        tunnel->parms = *parms;
+       tunnel->net = net;
 
        err = register_netdevice(dev);
        if (err)
@@ -349,7 +350,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
                struct flowi4 fl4;
                struct rtable *rt;
 
-               rt = ip_route_output_tunnel(dev_net(dev), &fl4,
+               rt = ip_route_output_tunnel(tunnel->net, &fl4,
                                            tunnel->parms.iph.protocol,
                                            iph->daddr, iph->saddr,
                                            tunnel->parms.o_key,
@@ -364,7 +365,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
        }
 
        if (!tdev && tunnel->parms.link)
-               tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
+               tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
 
        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
@@ -408,13 +409,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
        const struct iphdr *iph = ip_hdr(skb);
        int err;
 
-       secpath_reset(skb);
-
-       skb->protocol = tpi->proto;
-
-       skb->mac_header = skb->network_header;
-       __pskb_pull(skb, tunnel->hlen);
-       skb_postpull_rcsum(skb, skb_transport_header(skb), tunnel->hlen);
 #ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                /* Looped back packet, drop it! */
@@ -442,23 +436,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }
 
-       /* Warning: All skb pointers will be invalidated! */
-       if (tunnel->dev->type == ARPHRD_ETHER) {
-               if (!pskb_may_pull(skb, ETH_HLEN)) {
-                       tunnel->dev->stats.rx_length_errors++;
-                       tunnel->dev->stats.rx_errors++;
-                       goto drop;
-               }
-
-               iph = ip_hdr(skb);
-               skb->protocol = eth_type_trans(skb, tunnel->dev);
-               skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
-       }
-
-       skb->pkt_type = PACKET_HOST;
-       __skb_tunnel_rx(skb, tunnel->dev);
-
-       skb_reset_network_header(skb);
        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
@@ -477,6 +454,16 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);
 
+       if (tunnel->dev->type == ARPHRD_ETHER) {
+               skb->protocol = eth_type_trans(skb, tunnel->dev);
+               skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+       } else {
+               skb->dev = tunnel->dev;
+       }
+
+       if (!net_eq(tunnel->net, dev_net(tunnel->dev)))
+               skb_scrub_packet(skb);
+
        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;
 
@@ -486,24 +473,69 @@ drop:
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
 
+static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
+                           struct rtable *rt, __be16 df)
+{
+       struct ip_tunnel *tunnel = netdev_priv(dev);
+       int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
+       int mtu;
+
+       if (df)
+               mtu = dst_mtu(&rt->dst) - dev->hard_header_len
+                                       - sizeof(struct iphdr) - tunnel->hlen;
+       else
+               mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+
+       if (skb_dst(skb))
+               skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+
+       if (skb->protocol == htons(ETH_P_IP)) {
+               if (!skb_is_gso(skb) &&
+                   (df & htons(IP_DF)) && mtu < pkt_size) {
+                       memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+                       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+                       return -E2BIG;
+               }
+       }
+#if IS_ENABLED(CONFIG_IPV6)
+       else if (skb->protocol == htons(ETH_P_IPV6)) {
+               struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+
+               if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
+                          mtu >= IPV6_MIN_MTU) {
+                       if ((tunnel->parms.iph.daddr &&
+                           !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
+                           rt6->rt6i_dst.plen == 128) {
+                               rt6->rt6i_flags |= RTF_MODIFIED;
+                               dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
+                       }
+               }
+
+               if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
+                                       mtu < pkt_size) {
+                       icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+                       return -E2BIG;
+               }
+       }
+#endif
+       return 0;
+}
+
 void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
-                   const struct iphdr *tnl_params)
+                   const struct iphdr *tnl_params, const u8 protocol)
 {
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *inner_iph;
-       struct iphdr *iph;
        struct flowi4 fl4;
        u8     tos, ttl;
        __be16 df;
        struct rtable *rt;              /* Route to the other host */
-       struct net_device *tdev;        /* Device to other host */
        unsigned int max_headroom;      /* The extra header space needed */
        __be32 dst;
-       int mtu;
+       int err;
 
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
 
-       memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */
@@ -561,8 +593,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
        }
 
-       rt = ip_route_output_tunnel(dev_net(dev), &fl4,
-                                   tunnel->parms.iph.protocol,
+       rt = ip_route_output_tunnel(tunnel->net, &fl4,
+                                   protocol,
                                    dst, tnl_params->saddr,
                                    tunnel->parms.o_key,
                                    RT_TOS(tos),
@@ -571,58 +603,19 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                dev->stats.tx_carrier_errors++;
                goto tx_error;
        }
-       tdev = rt->dst.dev;
-
-       if (tdev == dev) {
+       if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }
 
-       df = tnl_params->frag_off;
-
-       if (df)
-               mtu = dst_mtu(&rt->dst) - dev->hard_header_len
-                                       - sizeof(struct iphdr);
-       else
-               mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
-
-       if (skb_dst(skb))
-               skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
-
-       if (skb->protocol == htons(ETH_P_IP)) {
-               df |= (inner_iph->frag_off&htons(IP_DF));
-
-               if (!skb_is_gso(skb) &&
-                   (inner_iph->frag_off&htons(IP_DF)) &&
-                    mtu < ntohs(inner_iph->tot_len)) {
-                       icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
-                       ip_rt_put(rt);
-                       goto tx_error;
-               }
+       if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
+               ip_rt_put(rt);
+               goto tx_error;
        }
-#if IS_ENABLED(CONFIG_IPV6)
-       else if (skb->protocol == htons(ETH_P_IPV6)) {
-               struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
-
-               if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
-                   mtu >= IPV6_MIN_MTU) {
-                       if ((tunnel->parms.iph.daddr &&
-                           !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
-                           rt6->rt6i_dst.plen == 128) {
-                               rt6->rt6i_flags |= RTF_MODIFIED;
-                               dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
-                       }
-               }
 
-               if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
-                   mtu < skb->len) {
-                       icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-                       ip_rt_put(rt);
-                       goto tx_error;
-               }
-       }
-#endif
+       if (!net_eq(tunnel->net, dev_net(dev)))
+               skb_scrub_packet(skb);
 
        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
@@ -646,8 +639,12 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }
 
-       max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr)
-                                              + rt->dst.header_len;
+       df = tnl_params->frag_off;
+       if (skb->protocol == htons(ETH_P_IP))
+               df |= (inner_iph->frag_off&htons(IP_DF));
+
+       max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+                       + rt->dst.header_len;
        if (max_headroom > dev->needed_headroom) {
                dev->needed_headroom = max_headroom;
                if (skb_cow_head(skb, dev->needed_headroom)) {
@@ -657,27 +654,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                }
        }
 
-       skb_dst_drop(skb);
-       skb_dst_set(skb, &rt->dst);
-
-       /* Push down and install the IP header. */
-       skb_push(skb, sizeof(struct iphdr));
-       skb_reset_network_header(skb);
+       err = iptunnel_xmit(tunnel->net, rt, skb,
+                           fl4.saddr, fl4.daddr, protocol,
+                           ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df);
+       iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
 
-       iph = ip_hdr(skb);
-       inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
-
-       iph->version    =       4;
-       iph->ihl        =       sizeof(struct iphdr) >> 2;
-       iph->frag_off   =       df;
-       iph->protocol   =       tnl_params->protocol;
-       iph->tos        =       ip_tunnel_ecn_encap(tos, inner_iph, skb);
-       iph->daddr      =       fl4.daddr;
-       iph->saddr      =       fl4.saddr;
-       iph->ttl        =       ttl;
-       tunnel_ip_select_ident(skb, inner_iph, &rt->dst);
-
-       iptunnel_xmit(skb, dev);
        return;
 
 #if IS_ENABLED(CONFIG_IPV6)
@@ -840,11 +821,10 @@ static void ip_tunnel_dev_free(struct net_device *dev)
 
 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 {
-       struct net *net = dev_net(dev);
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn;
 
-       itn = net_generic(net, tunnel->ip_tnl_net_id);
+       itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
 
        if (itn->fb_tunnel_dev != dev) {
                ip_tunnel_del(netdev_priv(dev));
@@ -858,56 +838,70 @@ int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 {
        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
        struct ip_tunnel_parm parms;
+       unsigned int i;
 
-       itn->tunnels = kzalloc(IP_TNL_HASH_SIZE * sizeof(struct hlist_head), GFP_KERNEL);
-       if (!itn->tunnels)
-               return -ENOMEM;
+       for (i = 0; i < IP_TNL_HASH_SIZE; i++)
+               INIT_HLIST_HEAD(&itn->tunnels[i]);
 
        if (!ops) {
                itn->fb_tunnel_dev = NULL;
                return 0;
        }
+
        memset(&parms, 0, sizeof(parms));
        if (devname)
                strlcpy(parms.name, devname, IFNAMSIZ);
 
        rtnl_lock();
        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
+       /* FB netdevice is special: we have one, and only one per netns.
+        * Allowing to move it to another netns is clearly unsafe.
+        */
+       itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
        rtnl_unlock();
-       if (IS_ERR(itn->fb_tunnel_dev)) {
-               kfree(itn->tunnels);
+
+       if (IS_ERR(itn->fb_tunnel_dev))
                return PTR_ERR(itn->fb_tunnel_dev);
-       }
 
        return 0;
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
 
-static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head)
+static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
+                             struct rtnl_link_ops *ops)
 {
+       struct net *net = dev_net(itn->fb_tunnel_dev);
+       struct net_device *dev, *aux;
        int h;
 
+       for_each_netdev_safe(net, dev, aux)
+               if (dev->rtnl_link_ops == ops)
+                       unregister_netdevice_queue(dev, head);
+
        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
                struct ip_tunnel *t;
                struct hlist_node *n;
                struct hlist_head *thead = &itn->tunnels[h];
 
                hlist_for_each_entry_safe(t, n, thead, hash_node)
-                       unregister_netdevice_queue(t->dev, head);
+                       /* If dev is in the same netns, it has already
+                        * been added to the list by the previous loop.
+                        */
+                       if (!net_eq(dev_net(t->dev), net))
+                               unregister_netdevice_queue(t->dev, head);
        }
        if (itn->fb_tunnel_dev)
                unregister_netdevice_queue(itn->fb_tunnel_dev, head);
 }
 
-void ip_tunnel_delete_net(struct ip_tunnel_net *itn)
+void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
 {
        LIST_HEAD(list);
 
        rtnl_lock();
-       ip_tunnel_destroy(itn, &list);
+       ip_tunnel_destroy(itn, &list, ops);
        unregister_netdevice_many(&list);
        rtnl_unlock();
-       kfree(itn->tunnels);
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
 
@@ -926,6 +920,7 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
        if (ip_tunnel_find(itn, p, dev->type))
                return -EEXIST;
 
+       nt->net = net;
        nt->parms = *p;
        err = register_netdevice(dev);
        if (err)
@@ -948,23 +943,21 @@ EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
 int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p)
 {
-       struct ip_tunnel *t, *nt;
-       struct net *net = dev_net(dev);
+       struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
+       struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);
 
        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;
 
-       nt = netdev_priv(dev);
-
        t = ip_tunnel_find(itn, p, dev->type);
 
        if (t) {
                if (t->dev != dev)
                        return -EEXIST;
        } else {
-               t = nt;
+               t = tunnel;
 
                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;
@@ -1003,6 +996,7 @@ int ip_tunnel_init(struct net_device *dev)
        }
 
        tunnel->dev = dev;
+       tunnel->net = dev_net(dev);
        strcpy(tunnel->parms.name, dev->name);
        iph->version            = 4;
        iph->ihl                = 5;
@@ -1013,8 +1007,8 @@ EXPORT_SYMBOL_GPL(ip_tunnel_init);
 
 void ip_tunnel_uninit(struct net_device *dev)
 {
-       struct net *net = dev_net(dev);
        struct ip_tunnel *tunnel = netdev_priv(dev);
+       struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;
 
        itn = net_generic(net, tunnel->ip_tnl_net_id);
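
How a protocol module is expected to adapt to the new
ip_tunnel_delete_net() signature, as an illustrative sketch only (the ipip
names below, ipip_net_id and ipip_link_ops, belong to net/ipv4/ipip.c and
are assumptions, not part of this file):

	static void __net_exit ipip_exit_net(struct net *net)
	{
		struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);

		/* Pass the link ops so tunnels created via this netns but
		 * since moved to another one are found and unregistered too.
		 */
		ip_tunnel_delete_net(itn, &ipip_link_ops);
	}
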