Merge branch 'ip_forward_pmtu'

author David S. Miller <davem@davemloft.net>

Mon, 13 Jan 2014 19:23:02 +0000 (11:23 -0800)

committer David S. Miller <davem@davemloft.net>

Mon, 13 Jan 2014 19:23:02 +0000 (11:23 -0800)
author David S. Miller <davem@davemloft.net>
Mon, 13 Jan 2014 19:23:02 +0000 (11:23 -0800)
committer David S. Miller <davem@davemloft.net>
Mon, 13 Jan 2014 19:23:02 +0000 (11:23 -0800)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt

index 7373115407e489592d4717f814866c5c531e62fa..c97932c88ea3a39b8547d67df667061c9b677a30 100644 (file)
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -26,12 +26,36 @@ ip_no_pmtu_disc - INTEGER
         discarded. Outgoing frames are handled the same as in mode 1,
         implicitly setting IP_PMTUDISC_DONT on every created socket.
  
-       Possible values: 0-2
+       Mode 3 is a hardened pmtu discovery mode. The kernel will only
+       accept fragmentation-needed errors if the underlying protocol
+       can verify them beyond a plain socket lookup. Current
+       protocols for which pmtu events will be honored are TCP, SCTP
+       and DCCP as they verify e.g. the sequence number or the
+       association. This mode should not be enabled globally but is
+       only intended to secure e.g. name servers in namespaces where
+       TCP path MTU discovery must still work but path MTU information
+       of other protocols should be discarded. If enabled globally this
+       mode could break other protocols.
+
+       Possible values: 0-3
         Default: FALSE
  
  min_pmtu - INTEGER
         default 552 - minimum discovered Path MTU
  
+ip_forward_use_pmtu - BOOLEAN
+       By default we don't trust protocol path MTUs while forwarding
+       because they could be easily forged and can lead to unwanted
+       fragmentation by the router.
+       You only need to enable this if you have user-space software
+       which tries to discover path MTUs by itself and depends on the
+       kernel honoring this information. This is normally not the
+       case.
+       Default: 0 (disabled)
+       Possible values:
+       0 - disabled
+       1 - enabled
+
  route/max_size - INTEGER
         Maximum number of routes allowed in the kernel.  Increase
         this when using large numbers of interfaces and/or routes.
diff --git a/include/net/ip.h b/include/net/ip.h

index 535664477c4a0d339f6ba92051aa3bd9cd1f7dd5..0dab95c2e4d59f096f481456581b931718216fff 100644 (file)
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -263,6 +263,39 @@ int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
                  !(dst_metric_locked(dst, RTAX_MTU)));
  }
  
+/* Report whether path MTU updates are accepted for @sk: true for every
+ * pmtudisc setting except IP_PMTUDISC_INTERFACE.
+ */
+static inline bool ip_sk_accept_pmtu(const struct sock *sk)
+{
+       if (inet_sk(sk)->pmtudisc == IP_PMTUDISC_INTERFACE)
+               return false;
+
+       return true;
+}
+
+/* Report whether the path MTU is used for @sk: true only while the
+ * socket's pmtudisc setting is below IP_PMTUDISC_PROBE.
+ */
+static inline bool ip_sk_use_pmtu(const struct sock *sk)
+{
+       if (inet_sk(sk)->pmtudisc >= IP_PMTUDISC_PROBE)
+               return false;
+
+       return true;
+}
+
+/* Choose the MTU to apply to traffic leaving through @dst.
+ *
+ * dst_mtu(dst) is returned when any of the following holds:
+ *   - sysctl_ip_fwd_use_pmtu is set in the namespace of the dst device,
+ *   - the RTAX_MTU metric of @dst is locked,
+ *   - @forwarding is false.
+ * Otherwise the device MTU is returned, capped at IP_MAX_MTU.
+ */
+static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
+                                                   bool forwarding)
+{
+       bool use_route_mtu;
+
+       use_route_mtu = dev_net(dst->dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
+                       dst_metric_locked(dst, RTAX_MTU) ||
+                       !forwarding;
+       if (!use_route_mtu)
+               return min(dst->dev->mtu, IP_MAX_MTU);
+
+       return dst_mtu(dst);
+}
+
+/* Return the MTU that applies to @skb on its attached dst.
+ *
+ * When @skb has an owning socket for which ip_sk_use_pmtu() is false,
+ * the device MTU capped at IP_MAX_MTU is returned. In every other case
+ * the decision is left to ip_dst_mtu_maybe_forward(), which is told
+ * whether IPSKB_FORWARDED is set in the skb control block.
+ */
+static inline unsigned int ip_skb_dst_mtu(const struct sk_buff *skb)
+{
+       bool fwd;
+
+       if (skb->sk && !ip_sk_use_pmtu(skb->sk))
+               return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU);
+
+       fwd = IPCB(skb)->flags & IPSKB_FORWARDED;
+       return ip_dst_mtu_maybe_forward(skb_dst(skb), fwd);
+}
+
  void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more);
  
  static inline void ip_select_ident(struct sk_buff *skb, struct dst_entry *dst, struct sock *sk)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h

index 929a668e91a9a8b42acba7a1148cdf21c872f0e8..80f500a29498e1fc9b8892e5c66be6bd02362eaa 100644 (file)
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -70,6 +70,7 @@ struct netns_ipv4 {
  
         int sysctl_tcp_ecn;
         int sysctl_ip_no_pmtu_disc;
+       int sysctl_ip_fwd_use_pmtu;
  
         kgid_t sysctl_ping_group_range[2];
  
diff --git a/include/net/protocol.h b/include/net/protocol.h

index fbf7676c9a02e352890b66d0bc3caf1775dbb199..0e5f8665d7fbf651746ba46727eeda0365be6a47 100644 (file)
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -43,7 +43,12 @@ struct net_protocol {
         int                     (*handler)(struct sk_buff *skb);
         void                    (*err_handler)(struct sk_buff *skb, u32 info);
         unsigned int            no_policy:1,
-                               netns_ok:1;
+                               netns_ok:1,
+                               /* does the protocol do more stringent
+                                * icmp tag validation than simple
+                                * socket lookup?
+                                */
+                               icmp_strict_tag_validation:1;
  };
  
  #if IS_ENABLED(CONFIG_IPV6)
diff --git a/include/net/route.h b/include/net/route.h

index 638e3ebf76f33443ac3c218912c7b7646a4f7846..9d1f423d5944bc3908bf68e64aae7f259ff7fdb0 100644 (file)
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -36,6 +36,9 @@
  #include <linux/cache.h>
  #include <linux/security.h>
  
+/* IPv4 datagram length is stored in a 16-bit field (tot_len) */
+#define IP_MAX_MTU     0xFFFFU
+
  #define RTO_ONLINK     0x01
  
  #define RT_CONN_FLAGS(sk)   (RT_TOS(inet_sk(sk)->tos) | sock_flag(sk, SOCK_LOCALROUTE))
@@ -311,20 +314,4 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
         return hoplimit;
  }
  
-static inline bool ip_sk_accept_pmtu(const struct sock *sk)
-{
-       return inet_sk(sk)->pmtudisc != IP_PMTUDISC_INTERFACE;
-}
-
-static inline bool ip_sk_use_pmtu(const struct sock *sk)
-{
-       return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE;
-}
-
-static inline int ip_skb_dst_mtu(const struct sk_buff *skb)
-{
-       return (!skb->sk || ip_sk_use_pmtu(skb->sk)) ?
-              dst_mtu(skb_dst(skb)) : skb_dst(skb)->dev->mtu;
-}
-
  #endif /* _ROUTE_H */
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c

index 88299c29101db7f8b52dba800ec19087b495f7dd..22b5d818b2001b177b765cbb67eb2551e87502ce 100644 (file)
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -989,6 +989,7 @@ static const struct net_protocol dccp_v4_protocol = {
         .err_handler    = dccp_v4_err,
         .no_policy      = 1,
         .netns_ok       = 1,
+       .icmp_strict_tag_validation = 1,
  };
  
  static const struct proto_ops inet_dccp_ops = {
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c

index 6268a4751e641a764701d22ca9e717bbe31fae2c..ecd2c3f245ce2b2e0b79f17417c5e6ad8c70abf6 100644 (file)
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1545,6 +1545,7 @@ static const struct net_protocol tcp_protocol = {
         .err_handler    =       tcp_v4_err,
         .no_policy      =       1,
         .netns_ok       =       1,
+       .icmp_strict_tag_validation = 1,
  };
  
  static const struct net_protocol udp_protocol = {
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c

index fb3c5637199dbfacd7b7b5c40ae14b1b2880d178..0134663fdbce86f6da39d8f6c9d27ce6c404687c 100644 (file)
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -668,6 +668,16 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
         rcu_read_unlock();
  }
  
+/* Tell whether the protocol registered for @proto sets
+ * icmp_strict_tag_validation, i.e. validates ICMP errors more strictly
+ * than a plain socket lookup.
+ *
+ * The caller passes the protocol field of an IP header taken from a
+ * received packet, so @proto may be a number for which no handler is
+ * registered. Such an inet_protos[] slot is NULL; it is reported as
+ * "no strict validation" (false) instead of being dereferenced.
+ *
+ * The lookup is done under rcu_read_lock(); only the flag value is
+ * carried out of the read-side critical section.
+ */
+static bool icmp_tag_validation(int proto)
+{
+       const struct net_protocol *ipprot;
+       bool ok;
+
+       rcu_read_lock();
+       ipprot = rcu_dereference(inet_protos[proto]);
+       /* Unregistered protocol: nothing can vouch for the error. */
+       ok = ipprot ? ipprot->icmp_strict_tag_validation : false;
+       rcu_read_unlock();
+       return ok;
+}
+
  /*
   *     Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and
   *     ICMP_PARAMETERPROB.
@@ -705,12 +715,22 @@ static void icmp_unreach(struct sk_buff *skb)
                 case ICMP_PORT_UNREACH:
                         break;
                 case ICMP_FRAG_NEEDED:
-                       if (net->ipv4.sysctl_ip_no_pmtu_disc == 2) {
-                               goto out;
-                       } else if (net->ipv4.sysctl_ip_no_pmtu_disc) {
+                       /* for documentation of the ip_no_pmtu_disc
+                        * values please see
+                        * Documentation/networking/ip-sysctl.txt
+                        */
+                       switch (net->ipv4.sysctl_ip_no_pmtu_disc) {
+                       default:
                                 LIMIT_NETDEBUG(KERN_INFO pr_fmt("%pI4: fragmentation needed and DF set\n"),
                                                &iph->daddr);
-                       } else {
+                               break;
+                       case 2:
+                               goto out;
+                       case 3:
+                               if (!icmp_tag_validation(iph->protocol))
+                                       goto out;
+                               /* fall through */
+                       case 0:
                                 info = ntohs(icmph->un.frag.mtu);
                                 if (!info)
                                         goto out;
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c

index 694de3b7aebfede6073433201e1200cf72008997..e9f1217a8afdaf2559ce3fd7d134489994faf440 100644 (file)
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -54,6 +54,7 @@ static int ip_forward_finish(struct sk_buff *skb)
  
  int ip_forward(struct sk_buff *skb)
  {
+       u32 mtu;
         struct iphdr *iph;      /* Our header */
         struct rtable *rt;      /* Route we use */
         struct ip_options *opt  = &(IPCB(skb)->opt);
@@ -88,11 +89,13 @@ int ip_forward(struct sk_buff *skb)
         if (opt->is_strictroute && rt->rt_uses_gateway)
                 goto sr_failed;
  
-       if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
+       IPCB(skb)->flags |= IPSKB_FORWARDED;
+       mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
+       if (unlikely(skb->len > mtu && !skb_is_gso(skb) &&
                      (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
                 IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-                         htonl(dst_mtu(&rt->dst)));
+                         htonl(mtu));
                 goto drop;
         }
  
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c

index df184616493f707ea8a0ea353a33d3a5aa981fc7..9a78804cfe9c457a94c3b4c5f2e7d8fe0eb604dd 100644 (file)
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -449,6 +449,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
         __be16 not_last_frag;
         struct rtable *rt = skb_rtable(skb);
         int err = 0;
+       bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED;
  
         dev = rt->dst.dev;
  
@@ -458,12 +459,13 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
  
         iph = ip_hdr(skb);
  
+       mtu = ip_dst_mtu_maybe_forward(&rt->dst, forwarding);
         if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->local_df) ||
                      (IPCB(skb)->frag_max_size &&
-                     IPCB(skb)->frag_max_size > dst_mtu(&rt->dst)))) {
+                     IPCB(skb)->frag_max_size > mtu))) {
                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
-                         htonl(ip_skb_dst_mtu(skb)));
+                         htonl(mtu));
                 kfree_skb(skb);
                 return -EMSGSIZE;
         }
@@ -473,7 +475,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
          */
  
         hlen = iph->ihl * 4;
-       mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
+       mtu = mtu - hlen;       /* Size of data space */
  #ifdef CONFIG_BRIDGE_NETFILTER
         if (skb->nf_bridge)
                 mtu -= nf_bridge_mtu_reduction(skb);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c

index f8da28278014ee2c99b2e530a694151d329bc5c2..25071b48921cebc4788a1f4b0b5fa118832f5910 100644 (file)
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,9 +112,6 @@
  #define RT_FL_TOS(oldflp4) \
         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
  
-/* IPv4 datagram length is stored into 16bit field (tot_len) */
-#define IP_MAX_MTU     0xFFFF
-
  #define RT_GC_TIMEOUT (300*HZ)
  
  static int ip_rt_max_size;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c

index 1d2480ac2bb6979921f954e476464f481ea94449..44eba052b43d3ab49ba7630bcd82e73e5b094472 100644 (file)
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -831,6 +831,13 @@ static struct ctl_table ipv4_net_table[] = {
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec
         },
+       {
+               .procname       = "ip_forward_use_pmtu",
+               .data           = &init_net.ipv4.sysctl_ip_fwd_use_pmtu,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
         { }
  };
  
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c

index d1de9560c4217925b19666d811bb17378e550a39..ef02b26ccf812e57e794e9b151746600bcb9a8f0 100644 (file)
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -321,6 +321,27 @@ static inline int ip6_forward_finish(struct sk_buff *skb)
         return dst_output(skb);
  }
  
+/* Return the MTU to use when forwarding through @dst.
+ *
+ * A locked RTAX_MTU metric wins when its raw value is non-zero.
+ * Otherwise the mtu6 setting of the inet6_dev attached to the dst
+ * device is used, falling back to IPV6_MIN_MTU when the device has no
+ * inet6_dev. The inet6_dev is looked up and read under rcu_read_lock().
+ */
+static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
+{
+       struct inet6_dev *in6_dev;
+       unsigned int result;
+
+       if (dst_metric_locked(dst, RTAX_MTU)) {
+               unsigned int locked_mtu = dst_metric_raw(dst, RTAX_MTU);
+
+               if (locked_mtu != 0)
+                       return locked_mtu;
+       }
+
+       rcu_read_lock();
+       in6_dev = __in6_dev_get(dst->dev);
+       result = in6_dev ? in6_dev->cnf.mtu6 : IPV6_MIN_MTU;
+       rcu_read_unlock();
+
+       return result;
+}
+
  int ip6_forward(struct sk_buff *skb)
  {
         struct dst_entry *dst = skb_dst(skb);
@@ -441,7 +462,7 @@ int ip6_forward(struct sk_buff *skb)
                 }
         }
  
-       mtu = dst_mtu(dst);
+       mtu = ip6_dst_mtu_forward(dst);
         if (mtu < IPV6_MIN_MTU)
                 mtu = IPV6_MIN_MTU;
  
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c

index 34b7726bcd7ff472da4603fea6010bc39f467440..7c161084f2414b61abbf10a35937d15e460d9f66 100644 (file)
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1030,6 +1030,7 @@ static const struct net_protocol sctp_protocol = {
         .err_handler = sctp_v4_err,
         .no_policy   = 1,
         .netns_ok    = 1,
+       .icmp_strict_tag_validation = 1,
  };
  
  /* IPv4 address related functions.  */
author	David S. Miller <davem@davemloft.net>
	Mon, 13 Jan 2014 19:23:02 +0000 (11:23 -0800)
committer	David S. Miller <davem@davemloft.net>
	Mon, 13 Jan 2014 19:23:02 +0000 (11:23 -0800)
Documentation/networking/ip-sysctl.txt		patch \| blob \| history
include/net/ip.h		patch \| blob \| history
include/net/netns/ipv4.h		patch \| blob \| history
include/net/protocol.h		patch \| blob \| history
include/net/route.h		patch \| blob \| history
net/dccp/ipv4.c		patch \| blob \| history
net/ipv4/af_inet.c		patch \| blob \| history
net/ipv4/icmp.c		patch \| blob \| history
net/ipv4/ip_forward.c		patch \| blob \| history
net/ipv4/ip_output.c		patch \| blob \| history
net/ipv4/route.c		patch \| blob \| history
net/ipv4/sysctl_net_ipv4.c		patch \| blob \| history
net/ipv6/ip6_output.c		patch \| blob \| history
net/sctp/protocol.c		patch \| blob \| history