__enqueue_entity(cfs_rq, se);
}
-static void update_avg(u64 *avg, u64 sample)
-{
- s64 diff = sample - *avg;
- *avg += diff >> 3;
-}
-
-static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- if (!se->last_wakeup)
- return;
-
- update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
- se->last_wakeup = 0;
-}
-
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
update_stats_dequeue(cfs_rq, se);
if (sleep) {
- update_avg_stats(cfs_rq, se);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
hrtick_start(rq, delta, requeue);
}
}
-#else
+#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
return cpu;
}
-#else
+#else /* !ARCH_HAS_SCHED_WAKE_IDLE */
static inline int wake_idle(int cpu, struct task_struct *p)
{
return cpu;
static const struct sched_class fair_sched_class;
#ifdef CONFIG_FAIR_GROUP_SCHED
-static unsigned long effective_load(struct task_group *tg, long wl, int cpu)
+/*
+ * effective_load() calculates the load change as seen from the root_task_group
+ *
+ * Adding load to a group doesn't make a group heavier, but can cause movement
+ * of group shares between cpus. Assuming the shares were perfectly aligned, one
+ * can calculate the shift in shares.
+ *
+ * The problem is that perfectly aligning the shares is rather expensive, hence
+ * we try to avoid doing that too often - see update_shares(), which ratelimits
+ * this change.
+ *
+ * We compensate for this by not only taking the current delta into account, but
+ * also considering the delta between when the shares were last adjusted and
+ * now.
+ *
+ * We still saw a performance dip; some tracing showed us that when
+ * balancing between cgroup:/ and cgroup:/foo the number of affine
+ * wakeups increased significantly. Therefore try to bias the error
+ * in the direction of failing the affine wakeup.
+ *
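+ * As a rough sketch of the per-level step below (using the
+ * aligned-shares approximation s = S*rw/\Sum rw, with S the group's
+ * shares, s this cpu's share and rw this cpu's runqueue weight):
+ * adding wl on this cpu and wg group-wide moves this cpu's share to
+ *
+ *   s' = S*(rw + wl) / (\Sum rw + wg) = s*(S*(rw + wl)) / (S*rw + s*wg)
+ *
+ * so the load change seen one level up is
+ *
+ *   wl' = s' - s = s*(a - b) / b, with a = S*(rw + wl), b = S*rw + s*wg
+ *
+ * which is what the loop below computes for each hierarchy level.
+ *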
+ */
+static long effective_load(struct task_group *tg, int cpu,
+ long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
- long wg = wl;
+ long more_w;
+
+ if (!tg->parent)
+ return wl;
+
+ /*
+ * By not taking the decrease of shares on the other cpu into
+ * account our error leans towards reducing the affine wakeups.
+ */
+ if (!wl && sched_feat(ASYM_EFF_LOAD))
+ return wl;
+
+ /*
+ * In addition to the requested increment, also add the weight
+ * delta accumulated since the shares were last updated.
+ */
+ more_w = se->my_q->load.weight - se->my_q->rq_weight;
+ wl += more_w;
+ wg += more_w;
for_each_sched_entity(se) {
#define D(n) (likely(n) ? (n) : 1)
S = se->my_q->tg->shares;
s = se->my_q->shares;
- rw = se->my_q->load.weight;
+ rw = se->my_q->rq_weight;
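+ /*
+ * rq_weight is the runqueue weight the shares were last
+ * computed against (cf. more_w above); using it keeps s
+ * and rw consistent in the sums below.
+ */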
a = S*(rw + wl);
b = S*rw + s*wg;
wl = s*(a-b)/D(b);
+ /*
+ * Assume the group is already running and will
+ * thus already be accounted for in the weight.
+ *
+ * That is, moving shares between CPUs does not
+ * alter the group weight.
+ */
wg = 0;
#undef D
}
return wl;
}
-static unsigned long task_load_sub(struct task_struct *p)
-{
- return effective_load(task_group(p), -(long)p->se.load.weight, task_cpu(p));
-}
-
-static unsigned long task_load_add(struct task_struct *p, int cpu)
-{
- return effective_load(task_group(p), p->se.load.weight, cpu);
-}
-
#else
-static unsigned long task_load_sub(struct task_struct *p)
+static inline unsigned long effective_load(struct task_group *tg, int cpu,
+ unsigned long wl, unsigned long wg)
{
- return -p->se.load.weight;
-}
-
-static unsigned long task_load_add(struct task_struct *p, int cpu)
-{
- return p->se.load.weight;
+ return wl;
}
#endif
unsigned int imbalance)
{
struct task_struct *curr = this_rq->curr;
+ struct task_group *tg;
unsigned long tl = this_load;
unsigned long tl_per_task;
+ unsigned long weight;
int balanced;
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
* effect of the currently running task from the load
* of the current CPU:
*/
- if (sync)
- tl += task_load_sub(current);
+ if (sync) {
+ tg = task_group(current);
+ weight = current->se.load.weight;
+ tl += effective_load(tg, this_cpu, -weight, -weight);
+ load += effective_load(tg, prev_cpu, 0, -weight);
+ }
+
+ tg = task_group(p);
+ weight = p->se.load.weight;
+
- balanced = 100*(tl + task_load_add(p, this_cpu)) <= imbalance*load;
+ balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+ imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
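+ /*
+ * The condition above reads: with the wakee's weight added here
+ * (and, for a sync wakeup, the waker's weight removed), this cpu's
+ * load must not exceed imbalance percent of prev_cpu's load, both
+ * sides evaluated through effective_load() so the resulting
+ * group-share shifts are taken into account.
+ */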
/*
* If the currently running task will sleep within
* a reasonable amount of time then attract this newly
* woken task:
*/
- if (sync && balanced && curr->sched_class == &fair_sched_class) {
+ if (sync && balanced) {
if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
- p->se.avg_overlap < sysctl_sched_migration_cost)
+ p->se.avg_overlap < sysctl_sched_migration_cost)
return 1;
}
return;
}
- se->last_wakeup = se->sum_exec_runtime;
if (unlikely(se == pse))
return;
struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
unsigned long busiest_h_load = busiest_cfs_rq->h_load;
unsigned long busiest_weight = busiest_cfs_rq->load.weight;
- long rem_load, moved_load;
+ u64 rem_load, moved_load;
/*
* empty group
if (!busiest_cfs_rq->task_weight)
continue;
- rem_load = rem_load_move * busiest_weight;
- rem_load /= busiest_h_load + 1;
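+ /*
+ * Convert the root-level load left to move into this group's
+ * local weight units; do the product in 64 bits as it can
+ * overflow an unsigned long on 32-bit.
+ */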
+ rem_load = (u64)rem_load_move * busiest_weight;
+ rem_load = div_u64(rem_load, busiest_h_load + 1);
moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
rem_load, sd, idle, all_pinned, this_best_prio,
continue;
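+ /*
+ * Scale the group-local load we actually moved back up into
+ * the root-level load units that rem_load_move is tracked in.
+ */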
moved_load *= busiest_h_load;
- moved_load /= busiest_weight + 1;
+ moved_load = div_u64(moved_load, busiest_weight + 1);
rem_load_move -= moved_load;
if (rem_load_move < 0)
return 0;
}
-#endif
+#endif /* CONFIG_SMP */
/*
* scheduler tick hitting a task of our scheduling class: