/* net/ipv4/tcp_metrics.c (Linux 3.14) */
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/tcp.h>
#include <linux/hash.h>
#include <linux/tcp_metrics.h>
#include <linux/vmalloc.h>

#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/dst.h>
#include <net/tcp.h>
#include <net/genetlink.h>

int sysctl_tcp_nometrics_save __read_mostly;

static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
                                                   const struct inetpeer_addr *daddr,
                                                   struct net *net, unsigned int hash);

struct tcp_fastopen_metrics {
        u16     mss;
        u16     syn_loss:10;            /* Recurring Fast Open SYN losses */
        unsigned long   last_syn_loss;  /* Last Fast Open SYN loss */
        struct  tcp_fastopen_cookie     cookie;
};

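/* A tcp_metrics_block caches per-destination TCP state (RTT, RTTVAR,
 * ssthresh, cwnd, reordering, timestamps and Fast Open data).  Entries are
 * keyed by (source, destination) address, chained per hash bucket via
 * tcpm_next and looked up under RCU; tcpm_lock is a bitmask of metrics that
 * are locked in the route and must not be overwritten.
 */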
struct tcp_metrics_block {
        struct tcp_metrics_block __rcu  *tcpm_next;
        struct inetpeer_addr            tcpm_saddr;
        struct inetpeer_addr            tcpm_daddr;
        unsigned long                   tcpm_stamp;
        u32                             tcpm_ts;
        u32                             tcpm_ts_stamp;
        u32                             tcpm_lock;
        u32                             tcpm_vals[TCP_METRIC_MAX + 1];
        struct tcp_fastopen_metrics     tcpm_fastopen;

        struct rcu_head                 rcu_head;
};

static bool tcp_metric_locked(struct tcp_metrics_block *tm,
                              enum tcp_metric_index idx)
{
        return tm->tcpm_lock & (1 << idx);
}

static u32 tcp_metric_get(struct tcp_metrics_block *tm,
                          enum tcp_metric_index idx)
{
        return tm->tcpm_vals[idx];
}

static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
                                  enum tcp_metric_index idx)
{
        return msecs_to_jiffies(tm->tcpm_vals[idx]);
}

static void tcp_metric_set(struct tcp_metrics_block *tm,
                           enum tcp_metric_index idx,
                           u32 val)
{
        tm->tcpm_vals[idx] = val;
}

static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
                                 enum tcp_metric_index idx,
                                 u32 val)
{
        tm->tcpm_vals[idx] = jiffies_to_msecs(val);
}

static bool addr_same(const struct inetpeer_addr *a,
                      const struct inetpeer_addr *b)
{
        const struct in6_addr *a6, *b6;

        if (a->family != b->family)
                return false;
        if (a->family == AF_INET)
                return a->addr.a4 == b->addr.a4;

        a6 = (const struct in6_addr *) &a->addr.a6[0];
        b6 = (const struct in6_addr *) &b->addr.a6[0];

        return ipv6_addr_equal(a6, b6);
}

struct tcpm_hash_bucket {
        struct tcp_metrics_block __rcu  *chain;
};

static DEFINE_SPINLOCK(tcp_metrics_lock);

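/* (Re)initialise a cache entry from the route: copy the raw metrics from the
 * dst, note which of them the route has locked, and clear the cached
 * timestamps (and, optionally, the Fast Open state).
 */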
static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
                          bool fastopen_clear)
{
        u32 val;

        tm->tcpm_stamp = jiffies;

        val = 0;
        if (dst_metric_locked(dst, RTAX_RTT))
                val |= 1 << TCP_METRIC_RTT;
        if (dst_metric_locked(dst, RTAX_RTTVAR))
                val |= 1 << TCP_METRIC_RTTVAR;
        if (dst_metric_locked(dst, RTAX_SSTHRESH))
                val |= 1 << TCP_METRIC_SSTHRESH;
        if (dst_metric_locked(dst, RTAX_CWND))
                val |= 1 << TCP_METRIC_CWND;
        if (dst_metric_locked(dst, RTAX_REORDERING))
                val |= 1 << TCP_METRIC_REORDERING;
        tm->tcpm_lock = val;

        tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
        tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
        tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
        tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
        tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
        tm->tcpm_ts = 0;
        tm->tcpm_ts_stamp = 0;
        if (fastopen_clear) {
                tm->tcpm_fastopen.mss = 0;
                tm->tcpm_fastopen.syn_loss = 0;
                tm->tcpm_fastopen.cookie.len = 0;
        }
}

#define TCP_METRICS_TIMEOUT             (60 * 60 * HZ)

static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
{
        if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
                tcpm_suck_dst(tm, dst, false);
}

#define TCP_METRICS_RECLAIM_DEPTH       5
#define TCP_METRICS_RECLAIM_PTR         (struct tcp_metrics_block *) 0x1UL
static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
                                          struct inetpeer_addr *saddr,
                                          struct inetpeer_addr *daddr,
                                          unsigned int hash)
{
        struct tcp_metrics_block *tm;
        struct net *net;
        bool reclaim = false;

        spin_lock_bh(&tcp_metrics_lock);
        net = dev_net(dst->dev);

        /* While waiting for the spin-lock the cache might have been populated
         * with this entry and so we have to check again.
         */
        tm = __tcp_get_metrics(saddr, daddr, net, hash);
        if (tm == TCP_METRICS_RECLAIM_PTR) {
                reclaim = true;
                tm = NULL;
        }
        if (tm) {
                tcpm_check_stamp(tm, dst);
                goto out_unlock;
        }

        if (unlikely(reclaim)) {
                struct tcp_metrics_block *oldest;

                oldest = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain);
                for (tm = rcu_dereference(oldest->tcpm_next); tm;
                     tm = rcu_dereference(tm->tcpm_next)) {
                        if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
                                oldest = tm;
                }
                tm = oldest;
        } else {
                tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
                if (!tm)
                        goto out_unlock;
        }
        tm->tcpm_saddr = *saddr;
        tm->tcpm_daddr = *daddr;

        tcpm_suck_dst(tm, dst, true);

        if (likely(!reclaim)) {
                tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
                rcu_assign_pointer(net->ipv4.tcp_metrics_hash[hash].chain, tm);
        }

out_unlock:
        spin_unlock_bh(&tcp_metrics_lock);
        return tm;
}

static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth)
{
        if (tm)
                return tm;
        if (depth > TCP_METRICS_RECLAIM_DEPTH)
                return TCP_METRICS_RECLAIM_PTR;
        return NULL;
}

static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr,
                                                   const struct inetpeer_addr *daddr,
                                                   struct net *net, unsigned int hash)
{
        struct tcp_metrics_block *tm;
        int depth = 0;

        for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_saddr, saddr) &&
                    addr_same(&tm->tcpm_daddr, daddr))
                        break;
                depth++;
        }
        return tcp_get_encode(tm, depth);
}

static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
                                                       struct dst_entry *dst)
{
        struct tcp_metrics_block *tm;
        struct inetpeer_addr saddr, daddr;
        unsigned int hash;
        struct net *net;

        saddr.family = req->rsk_ops->family;
        daddr.family = req->rsk_ops->family;
        switch (daddr.family) {
        case AF_INET:
                saddr.addr.a4 = inet_rsk(req)->ir_loc_addr;
                daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr;
                hash = (__force unsigned int) daddr.addr.a4;
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                *(struct in6_addr *)saddr.addr.a6 = inet_rsk(req)->ir_v6_loc_addr;
                *(struct in6_addr *)daddr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr;
                hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr);
                break;
#endif
        default:
                return NULL;
        }

        net = dev_net(dst->dev);
        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

        for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_saddr, &saddr) &&
                    addr_same(&tm->tcpm_daddr, &daddr))
                        break;
        }
        tcpm_check_stamp(tm, dst);
        return tm;
}

static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw)
{
        struct tcp_metrics_block *tm;
        struct inetpeer_addr saddr, daddr;
        unsigned int hash;
        struct net *net;

        if (tw->tw_family == AF_INET) {
                saddr.family = AF_INET;
                saddr.addr.a4 = tw->tw_rcv_saddr;
                daddr.family = AF_INET;
                daddr.addr.a4 = tw->tw_daddr;
                hash = (__force unsigned int) daddr.addr.a4;
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (tw->tw_family == AF_INET6) {
                if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) {
                        saddr.family = AF_INET;
                        saddr.addr.a4 = tw->tw_rcv_saddr;
                        daddr.family = AF_INET;
                        daddr.addr.a4 = tw->tw_daddr;
                        hash = (__force unsigned int) daddr.addr.a4;
                } else {
                        saddr.family = AF_INET6;
                        *(struct in6_addr *)saddr.addr.a6 = tw->tw_v6_rcv_saddr;
                        daddr.family = AF_INET6;
                        *(struct in6_addr *)daddr.addr.a6 = tw->tw_v6_daddr;
                        hash = ipv6_addr_hash(&tw->tw_v6_daddr);
                }
        }
#endif
        else
                return NULL;

        net = twsk_net(tw);
        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

        for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_saddr, &saddr) &&
                    addr_same(&tm->tcpm_daddr, &daddr))
                        break;
        }
        return tm;
}

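/* Main lookup used on full sockets: hash on the destination address, walk the
 * bucket chain matching both source and destination, and optionally create
 * (or reclaim) an entry when none is found.
 */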
static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
                                                 struct dst_entry *dst,
                                                 bool create)
{
        struct tcp_metrics_block *tm;
        struct inetpeer_addr saddr, daddr;
        unsigned int hash;
        struct net *net;

        if (sk->sk_family == AF_INET) {
                saddr.family = AF_INET;
                saddr.addr.a4 = inet_sk(sk)->inet_saddr;
                daddr.family = AF_INET;
                daddr.addr.a4 = inet_sk(sk)->inet_daddr;
                hash = (__force unsigned int) daddr.addr.a4;
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (sk->sk_family == AF_INET6) {
                if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) {
                        saddr.family = AF_INET;
                        saddr.addr.a4 = inet_sk(sk)->inet_saddr;
                        daddr.family = AF_INET;
                        daddr.addr.a4 = inet_sk(sk)->inet_daddr;
                        hash = (__force unsigned int) daddr.addr.a4;
                } else {
                        saddr.family = AF_INET6;
                        *(struct in6_addr *)saddr.addr.a6 = sk->sk_v6_rcv_saddr;
                        daddr.family = AF_INET6;
                        *(struct in6_addr *)daddr.addr.a6 = sk->sk_v6_daddr;
                        hash = ipv6_addr_hash(&sk->sk_v6_daddr);
                }
        }
#endif
        else
                return NULL;

        net = dev_net(dst->dev);
        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);

        tm = __tcp_get_metrics(&saddr, &daddr, net, hash);
        if (tm == TCP_METRICS_RECLAIM_PTR)
                tm = NULL;
        if (!tm && create)
                tm = tcpm_new(dst, &saddr, &daddr, hash);
        else
                tcpm_check_stamp(tm, dst);

        return tm;
}

/* Save metrics learned by this TCP session.  This function is called
 * only when TCP finishes successfully, i.e. when it enters TIME-WAIT
 * or goes from LAST-ACK to CLOSE.
 */
void tcp_update_metrics(struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_metrics_block *tm;
        unsigned long rtt;
        u32 val;
        int m;

        if (sysctl_tcp_nometrics_save || !dst)
                return;

        if (dst->flags & DST_HOST)
                dst_confirm(dst);

        rcu_read_lock();
        if (icsk->icsk_backoff || !tp->srtt) {
                /* This session failed to estimate rtt. Why?
                 * Probably, no packets returned in time.  Reset our
                 * results.
                 */
                tm = tcp_get_metrics(sk, dst, false);
                if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT))
                        tcp_metric_set(tm, TCP_METRIC_RTT, 0);
                goto out_unlock;
        } else
                tm = tcp_get_metrics(sk, dst, true);

        if (!tm)
                goto out_unlock;

        rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
        m = rtt - tp->srtt;

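        /* Both tp->srtt and the cached TCP_METRIC_RTT are kept scaled by 8,
         * so the m >> 3 below applies an EWMA gain of 1/8 to the cached value.
         */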
        /* If the newly calculated rtt is larger than the stored one, store the
         * new one. Otherwise, use EWMA. Remember, rtt overestimation is
         * always better than underestimation.
         */
        if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
                if (m <= 0)
                        rtt = tp->srtt;
                else
                        rtt -= (m >> 3);
                tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
        }

        if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
                unsigned long var;

                if (m < 0)
                        m = -m;

                /* Scale deviation to rttvar fixed point */
                m >>= 1;
                if (m < tp->mdev)
                        m = tp->mdev;

                var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
                if (m >= var)
                        var = m;
                else
                        var -= (var - m) >> 2;

                tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
        }

        if (tcp_in_initial_slowstart(tp)) {
                /* Slow start still did not finish. */
                if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
                        val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
                        if (val && (tp->snd_cwnd >> 1) > val)
                                tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                                               tp->snd_cwnd >> 1);
                }
                if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
                        val = tcp_metric_get(tm, TCP_METRIC_CWND);
                        if (tp->snd_cwnd > val)
                                tcp_metric_set(tm, TCP_METRIC_CWND,
                                               tp->snd_cwnd);
                }
        } else if (tp->snd_cwnd > tp->snd_ssthresh &&
                   icsk->icsk_ca_state == TCP_CA_Open) {
                /* Cong. avoidance phase, cwnd is reliable. */
                if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
                        tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                                       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
                if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
                        val = tcp_metric_get(tm, TCP_METRIC_CWND);
                        tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
                }
        } else {
                /* Else slow start did not finish, cwnd is non-sense,
                 * ssthresh may also be invalid.
                 */
                if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
                        val = tcp_metric_get(tm, TCP_METRIC_CWND);
                        tcp_metric_set(tm, TCP_METRIC_CWND,
                                       (val + tp->snd_ssthresh) >> 1);
                }
                if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
                        val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
                        if (val && tp->snd_ssthresh > val)
                                tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
                                               tp->snd_ssthresh);
                }
                if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
                        val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
                        if (val < tp->reordering &&
                            tp->reordering != sysctl_tcp_reordering)
                                tcp_metric_set(tm, TCP_METRIC_REORDERING,
                                               tp->reordering);
                }
        }
        tm->tcpm_stamp = jiffies;
out_unlock:
        rcu_read_unlock();
}

/* Initialize metrics on socket. */

void tcp_init_metrics(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_metrics_block *tm;
        u32 val, crtt = 0; /* cached RTT scaled by 8 */

        if (dst == NULL)
                goto reset;

        dst_confirm(dst);

        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, true);
        if (!tm) {
                rcu_read_unlock();
                goto reset;
        }

        if (tcp_metric_locked(tm, TCP_METRIC_CWND))
                tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);

        val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
        if (val) {
                tp->snd_ssthresh = val;
                if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
                        tp->snd_ssthresh = tp->snd_cwnd_clamp;
        } else {
                /* ssthresh may have been reduced unnecessarily during the
                 * 3WHS. Restore it back to its initial default.
                 */
                tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
        }
        val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
        if (val && tp->reordering != val) {
                tcp_disable_fack(tp);
                tcp_disable_early_retrans(tp);
                tp->reordering = val;
        }

        crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
        rcu_read_unlock();
reset:
        /* The initial RTT measurement from the SYN/SYN-ACK is not ideal
         * to seed the RTO for later data packets because SYN packets are
         * small. Use the per-dst cached values to seed the RTO but keep
         * the RTT estimator variables intact (e.g., srtt, mdev, rttvar).
         * Later the RTO will be updated immediately upon obtaining the first
         * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only
         * influences the first RTO but not later RTT estimation.
         *
         * But if RTT is not available from the SYN (due to retransmits or
         * syn cookies) or the cache, force a conservative 3secs timeout.
         *
         * A bit of theory. RTT is the time that passes between sending a
         * "normal" sized packet and receiving its ACK. In normal circumstances
         * sending small packets forces the peer to delay ACKs and the
         * calculation is still correct. The algorithm is adaptive and,
         * provided we follow specs, it NEVER underestimates RTT. BUT! If the
         * peer tries clever tricks, sort of "quick acks", for long enough to
         * drive the RTT down to a low value, and then abruptly stops doing so
         * and starts delaying ACKs, expect trouble.
         */
        if (crtt > tp->srtt) {
                /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
                crtt >>= 3;
                inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
        } else if (tp->srtt == 0) {
                /* RFC6298: 5.7 We've failed to get a valid RTT sample from
                 * the 3WHS. This is most likely due to retransmission,
                 * including spurious ones. Reset the RTO back to 3 secs
                 * from the more aggressive 1 sec to avoid more spurious
                 * retransmissions.
                 */
                tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
                inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
        }
        /* Cut cwnd down to 1 per RFC5681 if the SYN or SYN-ACK has been
         * retransmitted. In light of RFC6298's more aggressive 1 sec
         * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
         * retransmission has occurred.
         */
        if (tp->total_retrans > 1)
                tp->snd_cwnd = 1;
        else
                tp->snd_cwnd = tcp_init_cwnd(tp, dst);
        tp->snd_cwnd_stamp = tcp_time_stamp;
}

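/* Two uses: with paws_check, report false when the cached timestamp is recent
 * (within TCP_PAWS_MSL) yet ahead of the request's ts_recent by more than
 * TCP_PAWS_WINDOW, i.e. a likely PAWS problem; without it, report the peer as
 * proven only when both a cached RTT and a cached timestamp exist.
 */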
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, bool paws_check)
{
        struct tcp_metrics_block *tm;
        bool ret;

        if (!dst)
                return false;

        rcu_read_lock();
        tm = __tcp_get_metrics_req(req, dst);
        if (paws_check) {
                if (tm &&
                    (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL &&
                    (s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW)
                        ret = false;
                else
                        ret = true;
        } else {
                if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp)
                        ret = true;
                else
                        ret = false;
        }
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(tcp_peer_is_proven);

void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst)
{
        struct tcp_metrics_block *tm;

        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, true);
        if (tm) {
                struct tcp_sock *tp = tcp_sk(sk);

                if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) {
                        tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp;
                        tp->rx_opt.ts_recent = tm->tcpm_ts;
                }
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp);

/* VJ's idea. Save the last timestamp seen from this destination and hold
 * it at least for the normal timewait interval, to use for duplicate
 * segment detection in subsequent connections, before they enter
 * synchronized state.
 */
bool tcp_remember_stamp(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        bool ret = false;

        if (dst) {
                struct tcp_metrics_block *tm;

                rcu_read_lock();
                tm = tcp_get_metrics(sk, dst, true);
                if (tm) {
                        struct tcp_sock *tp = tcp_sk(sk);

                        if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 ||
                            ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
                             tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
                                tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
                                tm->tcpm_ts = tp->rx_opt.ts_recent;
                        }
                        ret = true;
                }
                rcu_read_unlock();
        }
        return ret;
}

bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw)
{
        struct tcp_metrics_block *tm;
        bool ret = false;

        rcu_read_lock();
        tm = __tcp_get_metrics_tw(tw);
        if (tm) {
                const struct tcp_timewait_sock *tcptw;
                struct sock *sk = (struct sock *) tw;

                tcptw = tcp_twsk(sk);
                if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 ||
                    ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL &&
                     tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
                        tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
                        tm->tcpm_ts        = tcptw->tw_ts_recent;
                }
                ret = true;
        }
        rcu_read_unlock();

        return ret;
}

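/* The Fast Open fields of an entry are updated and read under this seqlock,
 * so readers get a consistent (mss, cookie, syn_loss) snapshot without
 * blocking writers.
 */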
static DEFINE_SEQLOCK(fastopen_seqlock);

void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
                            struct tcp_fastopen_cookie *cookie,
                            int *syn_loss, unsigned long *last_syn_loss)
{
        struct tcp_metrics_block *tm;

        rcu_read_lock();
        tm = tcp_get_metrics(sk, __sk_dst_get(sk), false);
        if (tm) {
                struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;
                unsigned int seq;

                do {
                        seq = read_seqbegin(&fastopen_seqlock);
                        if (tfom->mss)
                                *mss = tfom->mss;
                        *cookie = tfom->cookie;
                        *syn_loss = tfom->syn_loss;
                        *last_syn_loss = *syn_loss ? tfom->last_syn_loss : 0;
                } while (read_seqretry(&fastopen_seqlock, seq));
        }
        rcu_read_unlock();
}

void tcp_fastopen_cache_set(struct sock *sk, u16 mss,
                            struct tcp_fastopen_cookie *cookie, bool syn_lost)
{
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_metrics_block *tm;

        if (!dst)
                return;
        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, true);
        if (tm) {
                struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen;

                write_seqlock_bh(&fastopen_seqlock);
                if (mss)
                        tfom->mss = mss;
                if (cookie && cookie->len > 0)
                        tfom->cookie = *cookie;
                if (syn_lost) {
                        ++tfom->syn_loss;
                        tfom->last_syn_loss = jiffies;
                } else
                        tfom->syn_loss = 0;
                write_sequnlock_bh(&fastopen_seqlock);
        }
        rcu_read_unlock();
}

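/* Generic netlink interface for inspecting and flushing the cache (userspace
 * tooling such as iproute2's "ip tcp_metrics" talks to this family).
 */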
static struct genl_family tcp_metrics_nl_family = {
        .id             = GENL_ID_GENERATE,
        .hdrsize        = 0,
        .name           = TCP_METRICS_GENL_NAME,
        .version        = TCP_METRICS_GENL_VERSION,
        .maxattr        = TCP_METRICS_ATTR_MAX,
        .netnsok        = true,
};

static struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
        [TCP_METRICS_ATTR_ADDR_IPV4]    = { .type = NLA_U32, },
        [TCP_METRICS_ATTR_ADDR_IPV6]    = { .type = NLA_BINARY,
                                            .len = sizeof(struct in6_addr), },
        /* The following attributes are not received for GET/DEL;
         * we keep them for reference.
         */
#if 0
        [TCP_METRICS_ATTR_AGE]          = { .type = NLA_MSECS, },
        [TCP_METRICS_ATTR_TW_TSVAL]     = { .type = NLA_U32, },
        [TCP_METRICS_ATTR_TW_TS_STAMP]  = { .type = NLA_S32, },
        [TCP_METRICS_ATTR_VALS]         = { .type = NLA_NESTED, },
        [TCP_METRICS_ATTR_FOPEN_MSS]    = { .type = NLA_U16, },
        [TCP_METRICS_ATTR_FOPEN_SYN_DROPS]      = { .type = NLA_U16, },
        [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS]    = { .type = NLA_MSECS, },
        [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY,
                                            .len = TCP_FASTOPEN_COOKIE_MAX, },
#endif
};

/* Add attributes, caller cancels its header on failure */
static int tcp_metrics_fill_info(struct sk_buff *msg,
                                 struct tcp_metrics_block *tm)
{
        struct nlattr *nest;
        int i;

        switch (tm->tcpm_daddr.family) {
        case AF_INET:
                if (nla_put_be32(msg, TCP_METRICS_ATTR_ADDR_IPV4,
                                tm->tcpm_daddr.addr.a4) < 0)
                        goto nla_put_failure;
                if (nla_put_be32(msg, TCP_METRICS_ATTR_SADDR_IPV4,
                                tm->tcpm_saddr.addr.a4) < 0)
                        goto nla_put_failure;
                break;
        case AF_INET6:
                if (nla_put(msg, TCP_METRICS_ATTR_ADDR_IPV6, 16,
                            tm->tcpm_daddr.addr.a6) < 0)
                        goto nla_put_failure;
                if (nla_put(msg, TCP_METRICS_ATTR_SADDR_IPV6, 16,
                            tm->tcpm_saddr.addr.a6) < 0)
                        goto nla_put_failure;
                break;
        default:
                return -EAFNOSUPPORT;
        }

        if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
                          jiffies - tm->tcpm_stamp) < 0)
                goto nla_put_failure;
        if (tm->tcpm_ts_stamp) {
                if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
                                (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0)
                        goto nla_put_failure;
                if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL,
                                tm->tcpm_ts) < 0)
                        goto nla_put_failure;
        }

        {
                int n = 0;

                nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
                if (!nest)
                        goto nla_put_failure;
                for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
                        if (!tm->tcpm_vals[i])
                                continue;
                        if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
                                goto nla_put_failure;
                        n++;
                }
                if (n)
                        nla_nest_end(msg, nest);
                else
                        nla_nest_cancel(msg, nest);
        }

        {
                struct tcp_fastopen_metrics tfom_copy[1], *tfom;
                unsigned int seq;

                do {
                        seq = read_seqbegin(&fastopen_seqlock);
                        tfom_copy[0] = tm->tcpm_fastopen;
                } while (read_seqretry(&fastopen_seqlock, seq));

                tfom = tfom_copy;
                if (tfom->mss &&
                    nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS,
                                tfom->mss) < 0)
                        goto nla_put_failure;
                if (tfom->syn_loss &&
                    (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
                                tfom->syn_loss) < 0 ||
                     nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
                                jiffies - tfom->last_syn_loss) < 0))
                        goto nla_put_failure;
                if (tfom->cookie.len > 0 &&
                    nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
                            tfom->cookie.len, tfom->cookie.val) < 0)
                        goto nla_put_failure;
        }

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

static int tcp_metrics_dump_info(struct sk_buff *skb,
                                 struct netlink_callback *cb,
                                 struct tcp_metrics_block *tm)
{
        void *hdr;

        hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
                          &tcp_metrics_nl_family, NLM_F_MULTI,
                          TCP_METRICS_CMD_GET);
        if (!hdr)
                return -EMSGSIZE;

        if (tcp_metrics_fill_info(skb, tm) < 0)
                goto nla_put_failure;

        return genlmsg_end(skb, hdr);

nla_put_failure:
        genlmsg_cancel(skb, hdr);
        return -EMSGSIZE;
}

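/* Multi-part dump: cb->args[0] and cb->args[1] record the hash row and chain
 * position reached so far, so a subsequent callback resumes where the
 * previous one filled up the skb.
 */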
static int tcp_metrics_nl_dump(struct sk_buff *skb,
                               struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
        unsigned int row, s_row = cb->args[0];
        int s_col = cb->args[1], col = s_col;

        for (row = s_row; row < max_rows; row++, s_col = 0) {
                struct tcp_metrics_block *tm;
                struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash + row;

                rcu_read_lock();
                for (col = 0, tm = rcu_dereference(hb->chain); tm;
                     tm = rcu_dereference(tm->tcpm_next), col++) {
                        if (col < s_col)
                                continue;
                        if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
                                rcu_read_unlock();
                                goto done;
                        }
                }
                rcu_read_unlock();
        }

done:
        cb->args[0] = row;
        cb->args[1] = col;
        return skb->len;
}

static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
                           unsigned int *hash, int optional, int v4, int v6)
{
        struct nlattr *a;

        a = info->attrs[v4];
        if (a) {
                addr->family = AF_INET;
                addr->addr.a4 = nla_get_be32(a);
                if (hash)
                        *hash = (__force unsigned int) addr->addr.a4;
                return 0;
        }
        a = info->attrs[v6];
        if (a) {
                if (nla_len(a) != sizeof(struct in6_addr))
                        return -EINVAL;
                addr->family = AF_INET6;
                memcpy(addr->addr.a6, nla_data(a), sizeof(addr->addr.a6));
                if (hash)
                        *hash = ipv6_addr_hash((struct in6_addr *) addr->addr.a6);
                return 0;
        }
        return optional ? 1 : -EAFNOSUPPORT;
}

static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
                         unsigned int *hash, int optional)
{
        return __parse_nl_addr(info, addr, hash, optional,
                               TCP_METRICS_ATTR_ADDR_IPV4,
                               TCP_METRICS_ATTR_ADDR_IPV6);
}

static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr)
{
        return __parse_nl_addr(info, addr, NULL, 0,
                               TCP_METRICS_ATTR_SADDR_IPV4,
                               TCP_METRICS_ATTR_SADDR_IPV6);
}

static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info)
{
        struct tcp_metrics_block *tm;
        struct inetpeer_addr saddr, daddr;
        unsigned int hash;
        struct sk_buff *msg;
        struct net *net = genl_info_net(info);
        void *reply;
        int ret;
        bool src = true;

        ret = parse_nl_addr(info, &daddr, &hash, 0);
        if (ret < 0)
                return ret;

        ret = parse_nl_saddr(info, &saddr);
        if (ret < 0)
                src = false;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg)
                return -ENOMEM;

        reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0,
                                  info->genlhdr->cmd);
        if (!reply)
                goto nla_put_failure;

        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
        ret = -ESRCH;
        rcu_read_lock();
        for (tm = rcu_dereference(net->ipv4.tcp_metrics_hash[hash].chain); tm;
             tm = rcu_dereference(tm->tcpm_next)) {
                if (addr_same(&tm->tcpm_daddr, &daddr) &&
                    (!src || addr_same(&tm->tcpm_saddr, &saddr))) {
                        ret = tcp_metrics_fill_info(msg, tm);
                        break;
                }
        }
        rcu_read_unlock();
        if (ret < 0)
                goto out_free;

        genlmsg_end(msg, reply);
        return genlmsg_reply(msg, info);

nla_put_failure:
        ret = -EMSGSIZE;

out_free:
        nlmsg_free(msg);
        return ret;
}

#define deref_locked_genl(p)    \
        rcu_dereference_protected(p, lockdep_genl_is_held() && \
                                     lockdep_is_held(&tcp_metrics_lock))

#define deref_genl(p)   rcu_dereference_protected(p, lockdep_genl_is_held())

static int tcp_metrics_flush_all(struct net *net)
{
        unsigned int max_rows = 1U << net->ipv4.tcp_metrics_hash_log;
        struct tcpm_hash_bucket *hb = net->ipv4.tcp_metrics_hash;
        struct tcp_metrics_block *tm;
        unsigned int row;

        for (row = 0; row < max_rows; row++, hb++) {
                spin_lock_bh(&tcp_metrics_lock);
                tm = deref_locked_genl(hb->chain);
                if (tm)
                        hb->chain = NULL;
                spin_unlock_bh(&tcp_metrics_lock);
                while (tm) {
                        struct tcp_metrics_block *next;

                        next = deref_genl(tm->tcpm_next);
                        kfree_rcu(tm, rcu_head);
                        tm = next;
                }
        }
        return 0;
}

static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
        struct tcpm_hash_bucket *hb;
        struct tcp_metrics_block *tm;
        struct tcp_metrics_block __rcu **pp;
        struct inetpeer_addr saddr, daddr;
        unsigned int hash;
        struct net *net = genl_info_net(info);
        int ret;
        bool src = true, found = false;

        ret = parse_nl_addr(info, &daddr, &hash, 1);
        if (ret < 0)
                return ret;
        if (ret > 0)
                return tcp_metrics_flush_all(net);
        ret = parse_nl_saddr(info, &saddr);
        if (ret < 0)
                src = false;

        hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log);
        hb = net->ipv4.tcp_metrics_hash + hash;
        pp = &hb->chain;
        spin_lock_bh(&tcp_metrics_lock);
        for (tm = deref_locked_genl(*pp); tm; tm = deref_locked_genl(*pp)) {
                if (addr_same(&tm->tcpm_daddr, &daddr) &&
                    (!src || addr_same(&tm->tcpm_saddr, &saddr))) {
                        *pp = tm->tcpm_next;
                        kfree_rcu(tm, rcu_head);
                        found = true;
                } else {
                        pp = &tm->tcpm_next;
                }
        }
        spin_unlock_bh(&tcp_metrics_lock);
        if (!found)
                return -ESRCH;
        return 0;
}

static const struct genl_ops tcp_metrics_nl_ops[] = {
        {
                .cmd = TCP_METRICS_CMD_GET,
                .doit = tcp_metrics_nl_cmd_get,
                .dumpit = tcp_metrics_nl_dump,
                .policy = tcp_metrics_nl_policy,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = TCP_METRICS_CMD_DEL,
                .doit = tcp_metrics_nl_cmd_del,
                .policy = tcp_metrics_nl_policy,
                .flags = GENL_ADMIN_PERM,
        },
};

static unsigned int tcpmhash_entries;
static int __init set_tcpmhash_entries(char *str)
{
        ssize_t ret;

        if (!str)
                return 0;

        ret = kstrtouint(str, 0, &tcpmhash_entries);
        if (ret)
                return 0;

        return 1;
}
__setup("tcpmhash_entries=", set_tcpmhash_entries);

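/* Per-namespace hash table setup: the number of buckets defaults to 16K on
 * machines with at least 128K pages of RAM (512 MB with 4 KB pages) and 8K
 * otherwise, and can be overridden with the tcpmhash_entries= boot parameter.
 */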
static int __net_init tcp_net_metrics_init(struct net *net)
{
        size_t size;
        unsigned int slots;

        slots = tcpmhash_entries;
        if (!slots) {
                if (totalram_pages >= 128 * 1024)
                        slots = 16 * 1024;
                else
                        slots = 8 * 1024;
        }

        net->ipv4.tcp_metrics_hash_log = order_base_2(slots);
        size = sizeof(struct tcpm_hash_bucket) << net->ipv4.tcp_metrics_hash_log;

        net->ipv4.tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
        if (!net->ipv4.tcp_metrics_hash)
                net->ipv4.tcp_metrics_hash = vzalloc(size);

        if (!net->ipv4.tcp_metrics_hash)
                return -ENOMEM;

        return 0;
}

static void __net_exit tcp_net_metrics_exit(struct net *net)
{
        unsigned int i;

        for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) {
                struct tcp_metrics_block *tm, *next;

                tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
                while (tm) {
                        next = rcu_dereference_protected(tm->tcpm_next, 1);
                        kfree(tm);
                        tm = next;
                }
        }
        if (is_vmalloc_addr(net->ipv4.tcp_metrics_hash))
                vfree(net->ipv4.tcp_metrics_hash);
        else
                kfree(net->ipv4.tcp_metrics_hash);
}

static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
        .init   =       tcp_net_metrics_init,
        .exit   =       tcp_net_metrics_exit,
};

void __init tcp_metrics_init(void)
{
        int ret;

        ret = register_pernet_subsys(&tcp_net_metrics_ops);
        if (ret < 0)
                goto cleanup;
        ret = genl_register_family_with_ops(&tcp_metrics_nl_family,
                                            tcp_metrics_nl_ops);
        if (ret < 0)
                goto cleanup_subsys;
        return;

cleanup_subsys:
        unregister_pernet_subsys(&tcp_net_metrics_ops);

cleanup:
        return;
}