diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3e18f611a5aa6d15e41c2e32186da7587e386ef7..9af5af979a1344ec8ddd4bab9f392238e8c00670 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -262,6 +262,9 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
        return grp->my_q;
 }
 
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+                                      int force_update);
+
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
        if (!cfs_rq->on_list) {
@@ -281,6 +284,8 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
                }
 
                cfs_rq->on_list = 1;
+               /* We should have no load, but we need to update last_decay. */
+               update_cfs_rq_blocked_load(cfs_rq, 0);
        }
 }
 
@@ -656,9 +661,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
        return calc_delta_fair(sched_slice(cfs_rq, se), se);
 }
 
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq);
-
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -678,10 +680,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 
        curr->vruntime += delta_exec_weighted;
        update_min_vruntime(cfs_rq);
-
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-       cfs_rq->load_unacc_exec_time += delta_exec;
-#endif
 }
 
 static void update_curr(struct cfs_rq *cfs_rq)
@@ -1025,72 +1023,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/* we need this in update_cfs_load and load-balance functions below */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 # ifdef CONFIG_SMP
-static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
-                                           int global_update)
-{
-       struct task_group *tg = cfs_rq->tg;
-       long load_avg;
-
-       load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
-       load_avg -= cfs_rq->load_contribution;
-
-       if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
-               atomic_add(load_avg, &tg->load_weight);
-               cfs_rq->load_contribution += load_avg;
-       }
-}
-
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-       u64 period = sysctl_sched_shares_window;
-       u64 now, delta;
-       unsigned long load = cfs_rq->load.weight;
-
-       if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
-               return;
-
-       now = rq_of(cfs_rq)->clock_task;
-       delta = now - cfs_rq->load_stamp;
-
-       /* truncate load history at 4 idle periods */
-       if (cfs_rq->load_stamp > cfs_rq->load_last &&
-           now - cfs_rq->load_last > 4 * period) {
-               cfs_rq->load_period = 0;
-               cfs_rq->load_avg = 0;
-               delta = period - 1;
-       }
-
-       cfs_rq->load_stamp = now;
-       cfs_rq->load_unacc_exec_time = 0;
-       cfs_rq->load_period += delta;
-       if (load) {
-               cfs_rq->load_last = now;
-               cfs_rq->load_avg += delta * load;
-       }
-
-       /* consider updating load contribution on each fold or truncate */
-       if (global_update || cfs_rq->load_period > period
-           || !cfs_rq->load_period)
-               update_cfs_rq_load_contribution(cfs_rq, global_update);
-
-       while (cfs_rq->load_period > period) {
-               /*
-                * Inline assembly required to prevent the compiler
-                * optimising this loop into a divmod call.
-                * See __iter_div_u64_rem() for another example of this.
-                */
-               asm("" : "+rm" (cfs_rq->load_period));
-               cfs_rq->load_period /= 2;
-               cfs_rq->load_avg /= 2;
-       }
-
-       if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
-               list_del_leaf_cfs_rq(cfs_rq);
-}
-
 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
 {
        long tg_weight;
@@ -1100,8 +1033,8 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
         * to gain a more accurate current total weight. See
         * update_cfs_rq_load_contribution().
         */
-       tg_weight = atomic_read(&tg->load_weight);
-       tg_weight -= cfs_rq->load_contribution;
+       tg_weight = atomic64_read(&tg->load_avg);
+       tg_weight -= cfs_rq->tg_load_contrib;
        tg_weight += cfs_rq->load.weight;
 
        return tg_weight;
@@ -1125,27 +1058,11 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 
        return shares;
 }
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-       if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
-               update_cfs_load(cfs_rq, 0);
-               update_cfs_shares(cfs_rq);
-       }
-}
 # else /* CONFIG_SMP */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
-{
-}
-
 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
        return tg->shares;
 }
-
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-}
 # endif /* CONFIG_SMP */
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                            unsigned long weight)
@@ -1163,6 +1080,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                account_entity_enqueue(cfs_rq, se);
 }
 
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
 static void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
        struct task_group *tg;
@@ -1182,18 +1101,477 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
        reweight_entity(cfs_rq_of(se), se, shares);
 }
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 }
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+/*
+ * Load tracking only depends on SMP; the FAIR_GROUP_SCHED dependency may be
+ * removed once it is useful in load-balance.
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+/*
+ * We choose a half-life close to 1 scheduling period.
+ * Note: The tables below are dependent on this value.
+ */
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
+
+/* Precomputed fixed inverse multiplies for multiplication by y^n */
+static const u32 runnable_avg_yN_inv[] = {
+       0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+       0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+       0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+       0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+       0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+       0x85aac367, 0x82cd8698,
+};
+
+/*
+ * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
+ * over-estimates when re-combining.
+ */
+static const u32 runnable_avg_yN_sum[] = {
+           0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
+        9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
+       17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
+};
+
+/*
+ * Approximate:
+ *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
+ */
+static __always_inline u64 decay_load(u64 val, u64 n)
 {
+       unsigned int local_n;
+
+       if (!n)
+               return val;
+       else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+               return 0;
+
+       /* after bounds checking we can collapse to 32-bit */
+       local_n = n;
+
+       /*
+        * As y^PERIOD = 1/2, we can combine
+        *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
+        * with a look-up table which covers y^n (n < PERIOD)
+        * to achieve a constant time decay_load.
+        */
+       if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
+               val >>= local_n / LOAD_AVG_PERIOD;
+               local_n %= LOAD_AVG_PERIOD;
+       }
+
+       val *= runnable_avg_yN_inv[local_n];
+       /* We don't use SRR (shift-round-right) here since we always want to round down. */
+       return val >> 32;
 }
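
To sanity-check the table and the shift/multiply split above, here is a minimal
userspace sketch of the same routine (illustrative only; it assumes val fits in
32 bits so the 64-bit multiply cannot overflow):

	#include <stdint.h>
	#include <stdio.h>

	#define LOAD_AVG_PERIOD 32

	/* runnable_avg_yN_inv[n] ~= y^n * 2^32; same table as in the patch */
	static const uint32_t runnable_avg_yN_inv[] = {
		0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
		0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
		0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
		0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
		0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
		0x85aac367, 0x82cd8698,
	};

	static uint64_t decay_load(uint64_t val, uint64_t n)
	{
		if (!n)
			return val;
		if (n > LOAD_AVG_PERIOD * 63)
			return 0;
		/* whole half-lives are a shift; the remainder is one multiply */
		val >>= n / LOAD_AVG_PERIOD;
		return (val * runnable_avg_yN_inv[n % LOAD_AVG_PERIOD]) >> 32;
	}

	int main(void)
	{
		/* one full half-life: prints 511, the floor of 512, since y^0
		 * is stored as 2^32 - 1 */
		printf("%llu\n", (unsigned long long)decay_load(1024, 32));
		/* y^16 = 1/sqrt(2): prints 724 */
		printf("%llu\n", (unsigned long long)decay_load(1024, 16));
		return 0;
	}
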
 
-static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+/*
+ * For updates fully spanning n periods, the contribution to runnable
+ * average will be: \Sum 1024*y^i { 1<=i<=n }
+ *
+ * We can compute this reasonably efficiently by combining:
+ *   y^PERIOD = 1/2 with the precomputed \Sum 1024*y^i { 1<=i<PERIOD }
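+ *
+ * E.g. an update spanning n = 64 periods folds through the loop below once:
+ * contrib becomes runnable_avg_yN_sum[32] = 23371 with n reduced to 32, and
+ * the result decay_load(23371, 32) + 23371 = 11684 + 23371 = 35055 matches
+ * the closed form S * (1 + y^32) = 1.5 * S up to rounding.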
+ */
+static u32 __compute_runnable_contrib(u64 n)
 {
+       u32 contrib = 0;
+
+       if (likely(n <= LOAD_AVG_PERIOD))
+               return runnable_avg_yN_sum[n];
+       else if (unlikely(n >= LOAD_AVG_MAX_N))
+               return LOAD_AVG_MAX;
+
+       /* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */
+       do {
+               contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
+               contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
+
+               n -= LOAD_AVG_PERIOD;
+       } while (n > LOAD_AVG_PERIOD);
+
+       contrib = decay_load(contrib, n);
+       return contrib + runnable_avg_yN_sum[n];
 }
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series.  To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ *      p0            p1           p2
+ *     (now)       (~1ms ago)  (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our coefficients, yielding the
+ * following representation of historical load:
+ *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the width of a reasonable scheduling period, fixing:
+ *   y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have a new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int __update_entity_runnable_avg(u64 now,
+                                                       struct sched_avg *sa,
+                                                       int runnable)
+{
+       u64 delta, periods;
+       u32 runnable_contrib;
+       int delta_w, decayed = 0;
+
+       delta = now - sa->last_runnable_update;
+       /*
+        * This should only happen when time goes backwards, which it
+        * unfortunately does during sched clock init when we swap over to TSC.
+        */
+       if ((s64)delta < 0) {
+               sa->last_runnable_update = now;
+               return 0;
+       }
+
+       /*
+        * Use 1024ns as the unit of measurement since it's a reasonable
+        * approximation of 1us and fast to compute.
+        */
+       delta >>= 10;
+       if (!delta)
+               return 0;
+       sa->last_runnable_update = now;
+
+       /* delta_w is the amount already accumulated against our next period */
+       delta_w = sa->runnable_avg_period % 1024;
+       if (delta + delta_w >= 1024) {
+               /* period roll-over */
+               decayed = 1;
+
+               /*
+                * Now that we know we're crossing a period boundary, figure
+                * out how much from delta we need to complete the current
+                * period and accrue it.
+                */
+               delta_w = 1024 - delta_w;
+               if (runnable)
+                       sa->runnable_avg_sum += delta_w;
+               sa->runnable_avg_period += delta_w;
+
+               delta -= delta_w;
+
+               /* Figure out how many additional periods this update spans */
+               periods = delta / 1024;
+               delta %= 1024;
+
+               sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
+                                                 periods + 1);
+               sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
+                                                    periods + 1);
+
+               /* Efficiently calculate \Sum 1024*y^i { 1<=i<=periods } */
+               runnable_contrib = __compute_runnable_contrib(periods);
+               if (runnable)
+                       sa->runnable_avg_sum += runnable_contrib;
+               sa->runnable_avg_period += runnable_contrib;
+       }
+
+       /* Remainder of delta accrued against u_0` */
+       if (runnable)
+               sa->runnable_avg_sum += delta;
+       sa->runnable_avg_period += delta;
+
+       return decayed;
+}
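
For intuition, a floating-point model of the same series (a sketch only, not
the kernel's fixed-point arithmetic) shows a task with a 50% duty cycle
settling near one half:

	#include <math.h>
	#include <stdio.h>

	int main(void)
	{
		const double y = pow(0.5, 1.0 / 32);	/* y^32 = 0.5, as above */
		double sum = 0.0, period = 0.0;
		int ms;

		/* one ~1ms segment per step, runnable every other segment */
		for (ms = 0; ms < 1000; ms++) {
			sum = y * sum + (ms % 2 ? 0.0 : 1024.0);
			period = y * period + 1024.0;
		}
		/* sum/period oscillates around 0.5; period approaches
		 * 1024/(1-y) ~= 47788, the continuous analogue of the
		 * fixed-point LOAD_AVG_MAX (47742) */
		printf("avg = %.3f, period = %.0f\n", sum / period, period);
		return 0;
	}
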
+
+/* Synchronize an entity's decay with its parenting cfs_rq. */
+static inline u64 __synchronize_entity_decay(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       u64 decays = atomic64_read(&cfs_rq->decay_counter);
+
+       decays -= se->avg.decay_count;
+       if (!decays)
+               return 0;
+
+       se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+       se->avg.decay_count = 0;
+
+       return decays;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+                                                int force_update)
+{
+       struct task_group *tg = cfs_rq->tg;
+       s64 tg_contrib;
+
+       tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
+       tg_contrib -= cfs_rq->tg_load_contrib;
+
+       if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
+               atomic64_add(tg_contrib, &tg->load_avg);
+               cfs_rq->tg_load_contrib += tg_contrib;
+       }
+}
+
+/*
+ * Aggregate cfs_rq runnable averages into an equivalent task_group
+ * representation for computing load contributions.
+ */
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+                                                 struct cfs_rq *cfs_rq)
+{
+       struct task_group *tg = cfs_rq->tg;
+       long contrib;
+
+       /* The fraction of a cpu used by this cfs_rq */
+       contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+                         sa->runnable_avg_period + 1);
+       contrib -= cfs_rq->tg_runnable_contrib;
+
+       if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+               atomic_add(contrib, &tg->runnable_avg);
+               cfs_rq->tg_runnable_contrib += contrib;
+       }
+}
+
+static inline void __update_group_entity_contrib(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = group_cfs_rq(se);
+       struct task_group *tg = cfs_rq->tg;
+       int runnable_avg;
+
+       u64 contrib;
+
+       contrib = cfs_rq->tg_load_contrib * tg->shares;
+       se->avg.load_avg_contrib = div64_u64(contrib,
+                                            atomic64_read(&tg->load_avg) + 1);
+
+       /*
+        * For group entities we need to compute a correction term in the case
+        * that they are consuming <1 cpu so that we would contribute the same
+        * load as a task of equal weight.
+        *
+        * Explicitly co-ordinating this measurement would be expensive, but
+        * fortunately the sum of each cpu's contribution forms a usable
+        * lower-bound on the true value.
+        *
+        * Consider the aggregate of 2 contributions.  Either they are disjoint
+        * (and the sum represents the true value) or they overlap and we are
+        * understating by the aggregate of their overlap.
+        *
+        * Extending this to N cpus, for a given overlap, the maximum amount we
+        * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
+        * cpus that overlap for this interval and w_i is the interval width.
+        *
+        * On a small machine, the first term is well-bounded, which bounds the
+        * total error since w_i is a subset of the period.  Whereas on a
+        * larger machine, while this first term can be larger, if w_i is of
+        * consequential size it is guaranteed that n_i*w_i quickly converges
+        * to our upper bound of 1-cpu.
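+        *
+        * E.g. with tg->shares = 2048, a cfs_rq contributing half the group's
+        * total load_avg computes load_avg_contrib = 1024 above; if the group
+        * as a whole consumes only a quarter of a cpu (runnable_avg ==
+        * NICE_0_LOAD / 4), the correction below scales that down to 256.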
+        */
+       runnable_avg = atomic_read(&tg->runnable_avg);
+       if (runnable_avg < NICE_0_LOAD) {
+               se->avg.load_avg_contrib *= runnable_avg;
+               se->avg.load_avg_contrib >>= NICE_0_SHIFT;
+       }
+}
+#else
+static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
+                                                int force_update) {}
+static inline void __update_tg_runnable_avg(struct sched_avg *sa,
+                                                 struct cfs_rq *cfs_rq) {}
+static inline void __update_group_entity_contrib(struct sched_entity *se) {}
+#endif
+
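+/*
+ * E.g. a nice-0 task (weight 1024) that has been runnable for half of its
+ * observed period ends up with load_avg_contrib ~= 512.
+ */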
+static inline void __update_task_entity_contrib(struct sched_entity *se)
+{
+       u32 contrib;
+
+       /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
+       contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
+       contrib /= (se->avg.runnable_avg_period + 1);
+       se->avg.load_avg_contrib = scale_load(contrib);
+}
+
+/* Compute the current contribution to load_avg by se, return any delta */
+static long __update_entity_load_avg_contrib(struct sched_entity *se)
+{
+       long old_contrib = se->avg.load_avg_contrib;
+
+       if (entity_is_task(se)) {
+               __update_task_entity_contrib(se);
+       } else {
+               __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
+               __update_group_entity_contrib(se);
+       }
+
+       return se->avg.load_avg_contrib - old_contrib;
+}
+
+static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
+                                                long load_contrib)
+{
+       if (likely(load_contrib < cfs_rq->blocked_load_avg))
+               cfs_rq->blocked_load_avg -= load_contrib;
+       else
+               cfs_rq->blocked_load_avg = 0;
+}
+
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+
+/* Update a sched_entity's runnable average */
+static inline void update_entity_load_avg(struct sched_entity *se,
+                                         int update_cfs_rq)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       long contrib_delta;
+       u64 now;
+
+       /*
+        * For a group entity we need to use the clock of its owned cfs_rq
+        * (group_cfs_rq()'s cfs_rq_clock_task()) in case it is the parent of
+        * a throttled hierarchy.
+        */
+       if (entity_is_task(se))
+               now = cfs_rq_clock_task(cfs_rq);
+       else
+               now = cfs_rq_clock_task(group_cfs_rq(se));
+
+       if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+               return;
+
+       contrib_delta = __update_entity_load_avg_contrib(se);
+
+       if (!update_cfs_rq)
+               return;
+
+       if (se->on_rq)
+               cfs_rq->runnable_load_avg += contrib_delta;
+       else
+               subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+}
+
+/*
+ * Decay the load contributed by all blocked children and account this so that
+ * their contribution may be appropriately discounted when they wake up.
+ */
+static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
+{
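+       /* one decay period is 2^20 ns, the same ~1ms unit used above */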
+       u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
+       u64 decays;
+
+       decays = now - cfs_rq->last_decay;
+       if (!decays && !force_update)
+               return;
+
+       if (atomic64_read(&cfs_rq->removed_load)) {
+               u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
+               subtract_blocked_load_contrib(cfs_rq, removed_load);
+       }
+
+       if (decays) {
+               cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
+                                                     decays);
+               atomic64_add(decays, &cfs_rq->decay_counter);
+               cfs_rq->last_decay = now;
+       }
+
+       __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
+}
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+       __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+       __update_tg_runnable_avg(&rq->avg, &rq->cfs);
+}
+
+/* Add the load generated by se into cfs_rq's child load-average */
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+                                                 struct sched_entity *se,
+                                                 int wakeup)
+{
+       /*
+        * We track migrations using entity decay_count <= 0; on a wake-up
+        * migration we use a negative decay count to track the remote decays
+        * accumulated while sleeping.
+        */
+       if (unlikely(se->avg.decay_count <= 0)) {
+               se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
+               if (se->avg.decay_count) {
+                       /*
+                        * In a wake-up migration we have to approximate the
+                        * time sleeping.  This is because we can't synchronize
+                        * clock_task between the two cpus, and it is not
+                        * guaranteed to be read-safe.  Instead, we can
+                        * approximate this using our carried decays, which are
+                        * explicitly atomically readable.
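+                        *
+                        * E.g. decay_count == -5 winds the clock back by
+                        * 5 << 20 ns, so the update below observes five ~1ms
+                        * non-runnable periods and applies the same five
+                        * decays the remote cfs_rq already counted.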
+                        */
+                       se->avg.last_runnable_update -= (-se->avg.decay_count)
+                                                       << 20;
+                       update_entity_load_avg(se, 0);
+                       /* Indicate that we're now synchronized and on-rq */
+                       se->avg.decay_count = 0;
+               }
+               wakeup = 0;
+       } else {
+               __synchronize_entity_decay(se);
+       }
+
+       /* migrated tasks did not contribute to our blocked load */
+       if (wakeup) {
+               subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
+               update_entity_load_avg(se, 0);
+       }
+
+       cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+       /* we force update consideration on load-balancer moves */
+       update_cfs_rq_blocked_load(cfs_rq, !wakeup);
+}
+
+/*
+ * Remove se's load from this cfs_rq child load-average; if the entity is
+ * transitioning to a blocked state we track its projected decay using
+ * blocked_load_avg.
+ */
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+                                                 struct sched_entity *se,
+                                                 int sleep)
+{
+       update_entity_load_avg(se, 1);
+       /* we force update consideration on load-balancer moves */
+       update_cfs_rq_blocked_load(cfs_rq, !sleep);
+
+       cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+       if (sleep) {
+               cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
+               se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+       } /* migrations, e.g. sleep=0 leave decay_count == 0 */
+}
+#else
+static inline void update_entity_load_avg(struct sched_entity *se,
+                                         int update_cfs_rq) {}
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
+                                          struct sched_entity *se,
+                                          int wakeup) {}
+static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
+                                          struct sched_entity *se,
+                                          int sleep) {}
+static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
+                                             int force_update) {}
+#endif
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -1320,7 +1698,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
-       update_cfs_load(cfs_rq, 0);
+       enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
        account_entity_enqueue(cfs_rq, se);
        update_cfs_shares(cfs_rq);
 
@@ -1395,6 +1773,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
+       dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
 
        update_stats_dequeue(cfs_rq, se);
        if (flags & DEQUEUE_SLEEP) {
@@ -1415,7 +1794,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
        if (se != cfs_rq->curr)
                __dequeue_entity(cfs_rq, se);
        se->on_rq = 0;
-       update_cfs_load(cfs_rq, 0);
        account_entity_dequeue(cfs_rq, se);
 
        /*
@@ -1564,6 +1942,8 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
                update_stats_wait_start(cfs_rq, prev);
                /* Put 'current' back into the tree. */
                __enqueue_entity(cfs_rq, prev);
+               /* in !on_rq case, update occurred at dequeue */
+               update_entity_load_avg(prev, 1);
        }
        cfs_rq->curr = NULL;
 }
@@ -1577,9 +1957,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
        update_curr(cfs_rq);
 
        /*
-        * Update share accounting for long-running entities.
+        * Ensure that runnable average is periodically updated.
         */
-       update_entity_shares_tick(cfs_rq);
+       update_entity_load_avg(curr, 1);
+       update_cfs_rq_blocked_load(cfs_rq, 1);
 
 #ifdef CONFIG_SCHED_HRTICK
        /*
@@ -1672,6 +2053,15 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
        return &tg->cfs_bandwidth;
 }
 
+/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
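+/*
+ * E.g. a cfs_rq throttled at clock_task == 100ms reads a constant 100ms until
+ * unthrottled; tg_unthrottle_up() then folds the (say) 20ms spent throttled
+ * into throttled_clock_task_time, so the clock resumes from 100ms without a
+ * jump.
+ */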
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+       if (unlikely(cfs_rq->throttle_count))
+               return cfs_rq->throttled_clock_task;
+
+       return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
+}
+
 /* returns 0 on failure to allocate runtime */
 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
@@ -1816,14 +2206,9 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
        cfs_rq->throttle_count--;
 #ifdef CONFIG_SMP
        if (!cfs_rq->throttle_count) {
-               u64 delta = rq->clock_task - cfs_rq->load_stamp;
-
-               /* leaving throttled state, advance shares averaging windows */
-               cfs_rq->load_stamp += delta;
-               cfs_rq->load_last += delta;
-
-               /* update entity weight now that we are on_rq again */
-               update_cfs_shares(cfs_rq);
+               /* adjust cfs_rq_clock_task() */
+               cfs_rq->throttled_clock_task_time += rq->clock_task -
+                                            cfs_rq->throttled_clock_task;
        }
 #endif
 
@@ -1835,9 +2220,9 @@ static int tg_throttle_down(struct task_group *tg, void *data)
        struct rq *rq = data;
        struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
 
-       /* group is entering throttled state, record last load */
+       /* group is entering throttled state, stop time */
        if (!cfs_rq->throttle_count)
-               update_cfs_load(cfs_rq, 0);
+               cfs_rq->throttled_clock_task = rq->clock_task;
        cfs_rq->throttle_count++;
 
        return 0;
@@ -1852,7 +2237,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
        se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
-       /* account load preceding throttle */
+       /* freeze hierarchy runnable averages while throttled */
        rcu_read_lock();
        walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
        rcu_read_unlock();
@@ -1876,7 +2261,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
                rq->nr_running -= task_delta;
 
        cfs_rq->throttled = 1;
-       cfs_rq->throttled_timestamp = rq->clock;
+       cfs_rq->throttled_clock = rq->clock;
        raw_spin_lock(&cfs_b->lock);
        list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
        raw_spin_unlock(&cfs_b->lock);
@@ -1894,10 +2279,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 
        cfs_rq->throttled = 0;
        raw_spin_lock(&cfs_b->lock);
-       cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+       cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
        list_del_rcu(&cfs_rq->throttled_list);
        raw_spin_unlock(&cfs_b->lock);
-       cfs_rq->throttled_timestamp = 0;
 
        update_rq_clock(rq);
        /* update hierarchical throttle state */
@@ -2297,8 +2681,13 @@ static void unthrottle_offline_cfs_rqs(struct rq *rq)
 }
 
 #else /* CONFIG_CFS_BANDWIDTH */
-static __always_inline
-void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
+{
+       return rq_of(cfs_rq)->clock_task;
+}
+
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+                                    unsigned long delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -2431,12 +2820,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
 
-               update_cfs_load(cfs_rq, 0);
                update_cfs_shares(cfs_rq);
+               update_entity_load_avg(se, 1);
        }
 
-       if (!se)
+       if (!se) {
+               update_rq_runnable_avg(rq, rq->nr_running);
                inc_nr_running(rq);
+       }
        hrtick_update(rq);
 }
 
@@ -2490,12 +2881,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
 
-               update_cfs_load(cfs_rq, 0);
                update_cfs_shares(cfs_rq);
+               update_entity_load_avg(se, 1);
        }
 
-       if (!se)
+       if (!se) {
                dec_nr_running(rq);
+               update_rq_runnable_avg(rq, 1);
+       }
        hrtick_update(rq);
 }
 
@@ -3005,6 +3398,37 @@ unlock:
 
        return new_cpu;
 }
+
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * cfs_rq_of(p) references at time of call are still valid and identify the
+ * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
+ * other assumptions, including the state of rq->lock, should be made.
+ */
+static void
+migrate_task_rq_fair(struct task_struct *p, int next_cpu)
+{
+       struct sched_entity *se = &p->se;
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+       /*
+        * Load tracking: accumulate removed load so that it can be processed
+        * when we next update owning cfs_rq under rq->lock.  Tasks contribute
+        * to blocked load iff they have a positive decay-count.  It can never
+        * be negative here since on-rq tasks have decay-count == 0.
+        */
+       if (se->avg.decay_count) {
+               se->avg.decay_count = -__synchronize_entity_decay(se);
+               atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
+       }
+}
+#endif
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -3131,7 +3555,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
         * Batch and idle tasks do not preempt non-idle tasks (their preemption
         * is driven by the tick):
         */
-       if (unlikely(p->policy != SCHED_NORMAL))
+       if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
                return;
 
        find_matching_se(&se, &pse);
@@ -3257,8 +3681,122 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 
 #ifdef CONFIG_SMP
 /**************************************************
- * Fair scheduling class load-balancing methods:
- */
+ * Fair scheduling class load-balancing methods.
+ *
+ * BASICS
+ *
+ * The purpose of load-balancing is to achieve the same basic fairness the
+ * per-cpu scheduler provides, namely to provide a proportional amount of
+ * compute time to each task. This is expressed in the following equation:
+ *
+ *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
+ *
+ * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
+ * W_i,0 is defined as:
+ *
+ *   W_i,0 = \Sum_j w_i,j                                             (2)
+ *
+ * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
+ * is derived from the nice value as per prio_to_weight[].
+ *
+ * The weight average is an exponential decay average of the instantaneous
+ * weight:
+ *
+ *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
+ *
+ * P_i is the cpu power (or compute capacity) of cpu i; typically it is the
+ * fraction of 'recent' time available for SCHED_OTHER task execution. But it
+ * can also include other factors [XXX].
+ *
+ * To achieve this balance we define a measure of imbalance which follows
+ * directly from (1):
+ *
+ *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
+ *
+ * We then move tasks around to minimize the imbalance. In the continuous
+ * function space it is obvious this converges, in the discrete case we get
+ * a few fun cases generally called infeasible weight scenarios.
+ *
+ * [XXX expand on:
+ *     - infeasible weights;
+ *     - local vs global optima in the discrete case. ]
+ *
+ *
+ * SCHED DOMAINS
+ *
+ * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
+ * for all i,j solution, we create a tree of cpus that follows the hardware
+ * topology where each level pairs two lower groups (or better). This results
+ * in O(log n) layers. Furthermore we reduce the number of cpus going up the
+ * tree to only the first of the previous level and we decrease the frequency
+ * of load-balance at each level in inverse proportion to the number of cpus
+ * in the groups.
+ *
+ * This yields:
+ *
+ *     log_2 n     1     n
+ *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
+ *     i = 0      2^i   2^i
+ *                               `- size of each group
+ *         |         |     `- number of cpus doing load-balance
+ *         |         `- freq
+ *         `- sum over all levels
+ *
+ * Coupled with a limit on how many tasks we can migrate every balance pass,
+ * this makes (5) the runtime complexity of the balancer.
+ *
+ * An important property here is that each CPU is still (indirectly) connected
+ * to every other cpu in at most O(log n) steps:
+ *
+ * The adjacency matrix of the resulting graph is given by:
+ *
+ *             log_2 n     
+ *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
+ *             k = 0
+ *
+ * And you'll find that:
+ *
+ *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
+ *
+ * Showing there's indeed a path between every cpu in at most O(log n) steps.
+ * The task movement gives a factor of O(m), giving a convergence complexity
+ * of:
+ *
+ *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
+ *
+ *
+ * WORK CONSERVING
+ *
+ * In order to avoid CPUs going idle while there's still work to do, new idle
+ * balancing is more aggressive and has the newly idle cpu iterate up the domain
+ * tree itself instead of relying on other CPUs to bring it work.
+ *
+ * This adds some complexity to both (5) and (8) but it reduces the total idle
+ * time.
+ *
+ * [XXX more?]
+ *
+ *
+ * CGROUPS
+ *
+ * Cgroups make a horror show out of (2): instead of a simple sum we get:
+ *
+ *                                s_k,i
+ *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
+ *                                 S_k
+ *
+ * Where
+ *
+ *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
+ *
+ * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
+ *
+ * The big problem is S_k: it's a global sum needed to compute a local (W_i)
+ * property.
+ *
+ * [XXX write more on how we solve this.. _after_ merging pjt's patches that
+ *      rewrite all of this once again.]
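+ *
+ * As a worked instance of (5): for n = 64 cpus the per-level terms n/2^i sum
+ * to 64 + 32 + 16 + 8 + 4 + 2 + 1 = 127 < 2n, i.e. halving the balance
+ * frequency at each level is what keeps the total work linear despite the
+ * O(log n) levels.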
+ */
 
 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
@@ -3524,52 +4062,58 @@ next:
 /*
 * update tg's load averages by folding this cpu's blocked load
  */
-static int update_shares_cpu(struct task_group *tg, int cpu)
+static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
 {
-       struct cfs_rq *cfs_rq;
-       unsigned long flags;
-       struct rq *rq;
-
-       if (!tg->se[cpu])
-               return 0;
-
-       rq = cpu_rq(cpu);
-       cfs_rq = tg->cfs_rq[cpu];
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
-
-       update_rq_clock(rq);
-       update_cfs_load(cfs_rq, 1);
+       struct sched_entity *se = tg->se[cpu];
+       struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
 
-       /*
-        * We need to update shares after updating tg->load_weight in
-        * order to adjust the weight of groups with long running tasks.
-        */
-       update_cfs_shares(cfs_rq);
+       /* throttled entities do not contribute to load */
+       if (throttled_hierarchy(cfs_rq))
+               return;
 
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       update_cfs_rq_blocked_load(cfs_rq, 1);
 
-       return 0;
+       if (se) {
+               update_entity_load_avg(se, 1);
+               /*
+                * We pivot on our runnable average having decayed to zero for
+                * list removal.  This generally implies that all our children
+                * have also been removed (modulo rounding error or bandwidth
+                * control); however, such cases are rare and we can fix these
+                * at enqueue.
+                *
+                * TODO: fix up out-of-order children on enqueue.
+                */
+               if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
+                       list_del_leaf_cfs_rq(cfs_rq);
+       } else {
+               struct rq *rq = rq_of(cfs_rq);
+               update_rq_runnable_avg(rq, rq->nr_running);
+       }
 }
 
-static void update_shares(int cpu)
+static void update_blocked_averages(int cpu)
 {
-       struct cfs_rq *cfs_rq;
        struct rq *rq = cpu_rq(cpu);
+       struct cfs_rq *cfs_rq;
+       unsigned long flags;
 
-       rcu_read_lock();
+       raw_spin_lock_irqsave(&rq->lock, flags);
+       update_rq_clock(rq);
        /*
         * Iterates the task_group tree in a bottom up fashion, see
         * list_add_leaf_cfs_rq() for details.
         */
        for_each_leaf_cfs_rq(rq, cfs_rq) {
-               /* throttled entities do not contribute to load */
-               if (throttled_hierarchy(cfs_rq))
-                       continue;
-
-               update_shares_cpu(cfs_rq->tg, cpu);
+               /*
+                * Note: We may want to consider periodically releasing
+                * rq->lock about these updates so that creating many task
+                * groups does not result in continually extending hold time.
+                */
+               __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
        }
-       rcu_read_unlock();
+
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 /*
@@ -3621,7 +4165,7 @@ static unsigned long task_h_load(struct task_struct *p)
        return load;
 }
 #else
-static inline void update_shares(int cpu)
+static inline void update_blocked_averages(int cpu)
 {
 }
 
@@ -4681,12 +5225,14 @@ void idle_balance(int this_cpu, struct rq *this_rq)
        if (this_rq->avg_idle < sysctl_sched_migration_cost)
                return;
 
+       update_rq_runnable_avg(this_rq, 1);
+
        /*
         * Drop the rq->lock, but keep IRQ/preempt disabled.
         */
        raw_spin_unlock(&this_rq->lock);
 
-       update_shares(this_cpu);
+       update_blocked_averages(this_cpu);
        rcu_read_lock();
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
@@ -4941,7 +5487,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
        int update_next_balance = 0;
        int need_serialize;
 
-       update_shares(cpu);
+       update_blocked_averages(cpu);
 
        rcu_read_lock();
        for_each_domain(cpu, sd) {
@@ -5181,6 +5727,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 
        if (sched_feat_numa(NUMA))
                task_tick_numa(rq, curr);
+
+       update_rq_runnable_avg(rq, 1);
 }
 
 /*
@@ -5273,6 +5821,20 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
                place_entity(cfs_rq, se, 0);
                se->vruntime -= cfs_rq->min_vruntime;
        }
+
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+       /*
+        * Remove our load from contribution when we leave sched_fair
+        * and ensure we don't carry in an old decay_count if we
+        * switch back.
+        */
+       if (p->se.avg.decay_count) {
+               struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
+               __synchronize_entity_decay(&p->se);
+               subtract_blocked_load_contrib(cfs_rq,
+                               p->se.avg.load_avg_contrib);
+       }
+#endif
 }
 
 /*
@@ -5319,11 +5881,16 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifndef CONFIG_64BIT
        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+       atomic64_set(&cfs_rq->decay_counter, 1);
+       atomic64_set(&cfs_rq->removed_load, 0);
+#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
+       struct cfs_rq *cfs_rq;
        /*
         * If the task was not on the rq at the time of this cgroup movement
         * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5355,8 +5922,19 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
        if (!on_rq)
                p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
        set_task_rq(p, task_cpu(p));
-       if (!on_rq)
-               p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+       if (!on_rq) {
+               cfs_rq = cfs_rq_of(&p->se);
+               p->se.vruntime += cfs_rq->min_vruntime;
+#ifdef CONFIG_SMP
+               /*
+                * migrate_task_rq_fair() will have removed our previous
+                * contribution, but we must synchronize for ongoing future
+                * decay.
+                */
+               p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+               cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+#endif
+       }
 }
 
 void free_fair_sched_group(struct task_group *tg)
@@ -5441,10 +6019,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 
        cfs_rq->tg = tg;
        cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
-       /* allow initial update_cfs_load() to truncate */
-       cfs_rq->load_stamp = 1;
-#endif
        init_cfs_rq_runtime(cfs_rq);
 
        tg->cfs_rq[cpu] = cfs_rq;
@@ -5546,7 +6120,9 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_fair,
-
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       .migrate_task_rq        = migrate_task_rq_fair,
+#endif
        .rq_online              = rq_online_fair,
        .rq_offline             = rq_offline_fair,