sched/numa: Introduce migrate_swap()
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 11cd13667359862c58872a9ce6091391a1d40b86..b1e5061287ab6e935b1b5da9aef1d2f54cae7a14 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -818,11 +818,13 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #ifdef CONFIG_NUMA_BALANCING
 /*
- * numa task sample period in ms
+ * Approximate time to scan a whole NUMA task in ms. The task scan period is
+ * calculated based on the task's virtual memory size and
+ * numa_balancing_scan_size.
  */
-unsigned int sysctl_numa_balancing_scan_period_min = 100;
-unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
-unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+unsigned int sysctl_numa_balancing_scan_period_min = 1000;
+unsigned int sysctl_numa_balancing_scan_period_max = 60000;
+unsigned int sysctl_numa_balancing_scan_period_reset = 60000;
 
 /* Portion of address space to scan in MB */
 unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,41 +832,310 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
-static void task_numa_placement(struct task_struct *p)
+static unsigned int task_nr_scan_windows(struct task_struct *p)
+{
+       unsigned long rss = 0;
+       unsigned long nr_scan_pages;
+
+       /*
+        * Calculations based on RSS as non-present and empty pages are skipped
+        * by the PTE scanner and NUMA hinting faults should be trapped based
+        * on resident pages
+        */
+       nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
+       rss = get_mm_rss(p->mm);
+       if (!rss)
+               rss = nr_scan_pages;
+
+       rss = round_up(rss, nr_scan_pages);
+       return rss / nr_scan_pages;
+}
+
+/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
+#define MAX_SCAN_WINDOW 2560
+
+static unsigned int task_scan_min(struct task_struct *p)
+{
+       unsigned int scan, floor;
+       unsigned int windows = 1;
+
+       if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
+               windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+       floor = 1000 / windows;
+
+       scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
+       return max_t(unsigned int, floor, scan);
+}
+
+static unsigned int task_scan_max(struct task_struct *p)
+{
+       unsigned int smin = task_scan_min(p);
+       unsigned int smax;
+
+       /* Watch for max being lower than min due to floor calculations */
+       smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+       return max(smin, smax);
+}
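For illustration (not part of the patch): with the defaults above, 4KiB pages and 1GiB of resident memory, task_nr_scan_windows() yields four 256MB windows, so the effective minimum scan period becomes max(1000/(2560/256), 1000/4) = 250ms. A minimal user-space sketch of that arithmetic, with the page size and RSS as assumptions:

/* Standalone sketch of the scan-period arithmetic; values are illustrative. */
#include <stdio.h>

int main(void)
{
	unsigned long scan_size_mb = 256;	/* sysctl_numa_balancing_scan_size */
	unsigned long period_min_ms = 1000;	/* sysctl_numa_balancing_scan_period_min */
	unsigned long nr_scan_pages = scan_size_mb << (20 - 12);	/* 4KiB pages */
	unsigned long rss_pages = 262144;	/* assume 1GiB resident */
	unsigned long windows = (rss_pages + nr_scan_pages - 1) / nr_scan_pages;
	unsigned long floor_ms = 1000 / (2560 / scan_size_mb);	/* MAX_SCAN_WINDOW floor */
	unsigned long scan_ms = period_min_ms / windows;

	printf("windows=%lu period=%lums\n", windows,
	       scan_ms > floor_ms ? scan_ms : floor_ms);	/* windows=4 period=250ms */
	return 0;
}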
+
+/*
+ * Once a preferred node is selected, the scheduler will prefer moving a task
+ * to that node for sysctl_numa_balancing_settle_count PTE scans. This gives
+ * the process a chance to accumulate more faults on the preferred node but
+ * still allows the scheduler to move the task again if the node's CPUs are
+ * overloaded.
+ */
+unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
+
+static inline int task_faults_idx(int nid, int priv)
+{
+       return 2 * nid + priv;
+}
+
+static inline unsigned long task_faults(struct task_struct *p, int nid)
+{
+       if (!p->numa_faults)
+               return 0;
+
+       return p->numa_faults[task_faults_idx(nid, 0)] +
+               p->numa_faults[task_faults_idx(nid, 1)];
+}
+
+static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long power_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
+
+struct numa_stats {
+       unsigned long load;
+       s64 eff_load;
+       unsigned long faults;
+};
+
+struct task_numa_env {
+       struct task_struct *p;
+
+       int src_cpu, src_nid;
+       int dst_cpu, dst_nid;
+
+       struct numa_stats src_stats, dst_stats;
+
+       unsigned long best_load;
+       int best_cpu;
+};
+
+static int task_numa_migrate(struct task_struct *p)
+{
+       int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid));
+       struct task_numa_env env = {
+               .p = p,
+               .src_cpu = task_cpu(p),
+               .src_nid = cpu_to_node(task_cpu(p)),
+               .dst_cpu = node_cpu,
+               .dst_nid = p->numa_preferred_nid,
+               .best_load = ULONG_MAX,
+               .best_cpu = task_cpu(p),
+       };
+       struct sched_domain *sd;
+       int cpu;
+       struct task_group *tg = task_group(p);
+       unsigned long weight;
+       bool balanced;
+       int imbalance_pct, idx = -1;
+
+       /*
+        * Find the lowest common scheduling domain that spans both the CPU
+        * the task is currently running on and the target NUMA node.
+        */
+       rcu_read_lock();
+       for_each_domain(env.src_cpu, sd) {
+               if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) {
+                       /*
+                        * busy_idx is used for the load decision as it is the
+                        * same index used by the regular load balancer for an
+                        * active cpu.
+                        */
+                       idx = sd->busy_idx;
+                       imbalance_pct = sd->imbalance_pct;
+                       break;
+               }
+       }
+       rcu_read_unlock();
+
+       if (WARN_ON_ONCE(idx == -1))
+               return 0;
+
+       /*
+        * XXX the below is mostly nicked from wake_affine(); we should
+        * see about sharing a bit if at all possible; also it might want
+        * some per entity weight love.
+        */
+       weight = p->se.load.weight;
+       env.src_stats.load = source_load(env.src_cpu, idx);
+       env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2;
+       env.src_stats.eff_load *= power_of(env.src_cpu);
+       env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight);
+
+       for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) {
+               env.dst_cpu = cpu;
+               env.dst_stats.load = target_load(cpu, idx);
+
+               /* If the CPU is idle, use it */
+               if (!env.dst_stats.load) {
+                       env.best_cpu = cpu;
+                       goto migrate;
+               }
+
+               /* Otherwise check the target CPU load */
+               env.dst_stats.eff_load = 100;
+               env.dst_stats.eff_load *= power_of(cpu);
+               env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight);
+
+               /*
+                * The destination is considered balanced if the destination CPU
+                * is no more loaded than the source CPU. Unfortunately there is a
+                * risk that a task running on a lightly loaded CPU will not
+                * migrate to its preferred node due to load imbalances.
+                */
+               balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load);
+               if (!balanced)
+                       continue;
+
+               if (env.dst_stats.eff_load < env.best_load) {
+                       env.best_load = env.dst_stats.eff_load;
+                       env.best_cpu = cpu;
+               }
+       }
+
+migrate:
+       return migrate_task_to(p, env.best_cpu);
+}
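To make the comparison above concrete, here is a hedged, standalone sketch with made-up numbers, assuming the trivial non-cgroup case where effective_load() simply returns the weight delta: the source side carries an extra margin of (imbalance_pct - 100)/2 percent, so a destination CPU only qualifies when it is clearly less loaded once the task's weight moves with it.

/* Illustrative numbers only; real loads, power and imbalance_pct come from the scheduler. */
#include <stdio.h>

int main(void)
{
	long imbalance_pct = 125, power = 1024, weight = 1024;
	long src_load = 3072, dst_load = 512;	/* assumed per-CPU weighted loads */

	long src_eff = (100 + (imbalance_pct - 100) / 2) * power * (src_load - weight);
	long dst_eff = 100 * power * (dst_load + weight);

	printf("%s\n", dst_eff <= src_eff ?
	       "balanced: dst_cpu is a migration candidate" :
	       "not balanced: skip this dst_cpu");
	return 0;
}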
+
+/* Attempt to migrate a task to a CPU on the preferred node. */
+static void numa_migrate_preferred(struct task_struct *p)
 {
-       int seq;
+       /* Success if the task is already running on the preferred node */
+       p->numa_migrate_retry = 0;
+       if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) {
+               /*
+                * If memory migration is temporarily disabled due to a recent
+                * task migration then re-enable it now, as the task is running
+                * on its preferred node and memory should migrate locally
+                */
+               if (!p->numa_migrate_seq)
+                       p->numa_migrate_seq++;
+               return;
+       }
 
-       if (!p->mm)     /* for example, ksmd faulting in a user's mm */
+       /* This task has no NUMA fault statistics yet */
+       if (unlikely(p->numa_preferred_nid == -1))
                return;
+
+       /* Otherwise, try to migrate to a CPU on the preferred node */
+       if (task_numa_migrate(p) != 0)
+               p->numa_migrate_retry = jiffies + HZ*5;
+}
+
+static void task_numa_placement(struct task_struct *p)
+{
+       int seq, nid, max_nid = -1;
+       unsigned long max_faults = 0;
+
        seq = ACCESS_ONCE(p->mm->numa_scan_seq);
        if (p->numa_scan_seq == seq)
                return;
        p->numa_scan_seq = seq;
+       p->numa_migrate_seq++;
+       p->numa_scan_period_max = task_scan_max(p);
+
+       /* Find the node with the highest number of faults */
+       for_each_online_node(nid) {
+               unsigned long faults;
+               int priv, i;
+
+               for (priv = 0; priv < 2; priv++) {
+                       i = task_faults_idx(nid, priv);
 
-       /* FIXME: Scheduling placement policy hints go here */
+                       /* Decay existing window, copy faults since last scan */
+                       p->numa_faults[i] >>= 1;
+                       p->numa_faults[i] += p->numa_faults_buffer[i];
+                       p->numa_faults_buffer[i] = 0;
+               }
+
+               /* Find maximum private faults */
+               faults = p->numa_faults[task_faults_idx(nid, 1)];
+               if (faults > max_faults) {
+                       max_faults = faults;
+                       max_nid = nid;
+               }
+       }
+
+       /* Set the preferred node to the node with the most faults */
+       if (max_faults && max_nid != p->numa_preferred_nid) {
+               /* Update the preferred nid and migrate task if possible */
+               p->numa_preferred_nid = max_nid;
+               p->numa_migrate_seq = 1;
+               numa_migrate_preferred(p);
+       }
 }
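As an aside, the decay above makes numa_faults an exponentially decayed running sum: a node that stops faulting loses half of its score on every completed scan. A small sketch of that behaviour with invented per-window fault counts:

/* Sketch of the per-node fault decay: halve the old score, add the new window. */
#include <stdio.h>

int main(void)
{
	unsigned long faults = 0;
	unsigned long window[] = { 64, 64, 64, 0, 0, 0 };	/* assumed faults per scan */

	for (int i = 0; i < 6; i++) {
		faults >>= 1;		/* decay existing window */
		faults += window[i];	/* copy faults since last scan */
		printf("scan %d: faults=%lu\n", i, faults);
	}
	return 0;	/* prints 64, 96, 112, 56, 28, 14 */
}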
 
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
-void task_numa_fault(int node, int pages, bool migrated)
+void task_numa_fault(int last_nidpid, int node, int pages, bool migrated)
 {
        struct task_struct *p = current;
+       int priv;
 
        if (!numabalancing_enabled)
                return;
 
-       /* FIXME: Allocate task-specific structure for placement policy here */
+       /* for example, ksmd faulting in a user's mm */
+       if (!p->mm)
+               return;
+
+       /*
+        * First accesses are treated as private; otherwise, consider accesses
+        * to be private if the accessing pid has not changed
+        */
+       if (!nidpid_pid_unset(last_nidpid))
+               priv = ((p->pid & LAST__PID_MASK) == nidpid_to_pid(last_nidpid));
+       else
+               priv = 1;
+
+       /* Allocate buffer to track faults on a per-node basis */
+       if (unlikely(!p->numa_faults)) {
+               int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
+
+               /* numa_faults and numa_faults_buffer share the allocation */
+               p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
+               if (!p->numa_faults)
+                       return;
+
+               BUG_ON(p->numa_faults_buffer);
+               p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+       }
 
        /*
         * If pages are properly placed (did not migrate) then scan slower.
         * This is reset periodically in case of phase changes
         */
-        if (!migrated)
-               p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
-                       p->numa_scan_period + jiffies_to_msecs(10));
+       if (!migrated) {
+               /* Initialise if necessary */
+               if (!p->numa_scan_period_max)
+                       p->numa_scan_period_max = task_scan_max(p);
+
+               p->numa_scan_period = min(p->numa_scan_period_max,
+                       p->numa_scan_period + 10);
+       }
 
        task_numa_placement(p);
+
+       /* Retry migration to the preferred node if it previously failed */
+       if (p->numa_migrate_retry && time_after(jiffies, p->numa_migrate_retry))
+               numa_migrate_preferred(p);
+
+       p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
 }
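For reference, the single allocation above holds 2 * nr_node_ids counters twice over: the decayed statistics first, then the per-scan buffer, both indexed by task_faults_idx(nid, priv). A user-space sketch of that layout, assuming four online nodes:

/* Layout sketch only; the kernel uses kzalloc and p->numa_faults{,_buffer}. */
#include <stdio.h>
#include <stdlib.h>

#define FAULTS_IDX(nid, priv)	(2 * (nid) + (priv))

int main(void)
{
	int nr_node_ids = 4;	/* assumption for the example */
	unsigned long *faults = calloc(4 * nr_node_ids, sizeof(*faults));
	unsigned long *faults_buffer = faults + 2 * nr_node_ids;

	faults_buffer[FAULTS_IDX(1, 1)] += 8;	/* private fault on node 1 */
	printf("buffer slot %d holds %lu pages\n",
	       FAULTS_IDX(1, 1), faults_buffer[FAULTS_IDX(1, 1)]);
	free(faults);
	return 0;
}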
 
 static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1155,7 @@ void task_numa_work(struct callback_head *work)
        struct mm_struct *mm = p->mm;
        struct vm_area_struct *vma;
        unsigned long start, end;
+       unsigned long nr_pte_updates = 0;
        long pages;
 
        WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,22 +1172,11 @@ void task_numa_work(struct callback_head *work)
        if (p->flags & PF_EXITING)
                return;
 
-       /*
-        * We do not care about task placement until a task runs on a node
-        * other than the first one used by the address space. This is
-        * largely because migrations are driven by what CPU the task
-        * is running on. If it's never scheduled on another node, it'll
-        * not migrate so why bother trapping the fault.
-        */
-       if (mm->first_nid == NUMA_PTE_SCAN_INIT)
-               mm->first_nid = numa_node_id();
-       if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
-               /* Are we running on a new node yet? */
-               if (numa_node_id() == mm->first_nid &&
-                   !sched_feat_numa(NUMA_FORCE))
-                       return;
-
-               mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
+       if (!mm->numa_next_reset || !mm->numa_next_scan) {
+               mm->numa_next_scan = now +
+                       msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+               mm->numa_next_reset = now +
+                       msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
        }
 
        /*
@@ -926,7 +1187,7 @@ void task_numa_work(struct callback_head *work)
         */
        migrate = mm->numa_next_reset;
        if (time_after(now, migrate)) {
-               p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+               p->numa_scan_period = task_scan_min(p);
                next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
                xchg(&mm->numa_next_reset, next_scan);
        }
@@ -938,20 +1199,20 @@ void task_numa_work(struct callback_head *work)
        if (time_before(now, migrate))
                return;
 
-       if (p->numa_scan_period == 0)
-               p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+       if (p->numa_scan_period == 0) {
+               p->numa_scan_period_max = task_scan_max(p);
+               p->numa_scan_period = task_scan_min(p);
+       }
 
        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
        if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
                return;
 
        /*
-        * Do not set pte_numa if the current running node is rate-limited.
-        * This loses statistics on the fault but if we are unwilling to
-        * migrate to this node, it is less likely we can do useful work
+        * Delay this task enough that another task of this mm will likely win
+        * the next time around.
         */
-       if (migrate_ratelimited(numa_node_id()))
-               return;
+       p->node_stamp += 2 * TICK_NSEC;
 
        start = mm->numa_scan_offset;
        pages = sysctl_numa_balancing_scan_size;
@@ -967,18 +1228,32 @@ void task_numa_work(struct callback_head *work)
                vma = mm->mmap;
        }
        for (; vma; vma = vma->vm_next) {
-               if (!vma_migratable(vma))
+               if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
                        continue;
 
-               /* Skip small VMAs. They are not likely to be of relevance */
-               if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
+               /*
+                * Shared library pages mapped by multiple processes are not
+                * migrated as it is expected they are cache replicated. Avoid
+                * hinting faults in read-only file-backed mappings or the vdso
+                * as migrating the pages will be of marginal benefit.
+                */
+               if (!vma->vm_mm ||
+                   (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
                        continue;
 
                do {
                        start = max(start, vma->vm_start);
                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
                        end = min(end, vma->vm_end);
-                       pages -= change_prot_numa(vma, start, end);
+                       nr_pte_updates += change_prot_numa(vma, start, end);
+
+                       /*
+                        * Scan sysctl_numa_balancing_scan_size but ensure that
+                        * at least one PTE is updated so that unused virtual
+                        * address space is quickly skipped.
+                        */
+                       if (nr_pte_updates)
+                               pages -= (end - start) >> PAGE_SHIFT;
 
                        start = end;
                        if (pages <= 0)
@@ -988,10 +1263,22 @@ void task_numa_work(struct callback_head *work)
 
 out:
        /*
-        * It is possible to reach the end of the VMA list but the last few VMAs are
-        * not guaranteed to the vma_migratable. If they are not, we would find the
-        * !migratable VMA on the next scan but not reset the scanner to the start
-        * so check it now.
+        * If the whole process was scanned without updates then no NUMA
+        * hinting faults are being recorded and the scan rate should be lower.
+        */
+       if (mm->numa_scan_offset == 0 && !nr_pte_updates) {
+               p->numa_scan_period = min(p->numa_scan_period_max,
+                       p->numa_scan_period << 1);
+
+               next_scan = now + msecs_to_jiffies(p->numa_scan_period);
+               mm->numa_next_scan = next_scan;
+       }
+
+       /*
+        * It is possible to reach the end of the VMA list but the last few
+        * VMAs are not guaranteed to be vma_migratable. If they are not, we
+        * would find the !migratable VMA on the next scan but not reset the
+        * scanner to the start, so check it now.
         */
        if (vma)
                mm->numa_scan_offset = start;
@@ -1025,8 +1312,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
 
        if (now - curr->node_stamp > period) {
                if (!curr->node_stamp)
-                       curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
-               curr->node_stamp = now;
+                       curr->numa_scan_period = task_scan_min(curr);
+               curr->node_stamp += period;
 
                if (!time_before(jiffies, curr->mm->numa_next_scan)) {
                        init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -3113,7 +3400,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
        struct sched_entity *se = tg->se[cpu];
 
-       if (!tg->parent)        /* the trivial, non-cgroup case */
+       if (!tg->parent || !wl) /* the trivial, non-cgroup case */
                return wl;
 
        for_each_sched_entity(se) {
@@ -3166,8 +3453,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 }
 #else
 
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
-               unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
        return wl;
 }
@@ -3420,11 +3706,10 @@ done:
  * preempt must be disabled.
  */
 static int
-select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
        int cpu = smp_processor_id();
-       int prev_cpu = task_cpu(p);
        int new_cpu = cpu;
        int want_affine = 0;
        int sync = wake_flags & WF_SYNC;
@@ -3906,7 +4191,8 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
-#define LBF_SOME_PINNED 0x04
+#define LBF_DST_PINNED  0x04
+#define LBF_SOME_PINNED        0x08
 
 struct lb_env {
        struct sched_domain     *sd;
@@ -3941,6 +4227,20 @@ static void move_task(struct task_struct *p, struct lb_env *env)
        set_task_cpu(p, env->dst_cpu);
        activate_task(env->dst_rq, p, 0);
        check_preempt_curr(env->dst_rq, p, 0);
+#ifdef CONFIG_NUMA_BALANCING
+       if (p->numa_preferred_nid != -1) {
+               int src_nid = cpu_to_node(env->src_cpu);
+               int dst_nid = cpu_to_node(env->dst_cpu);
+
+               /*
+                * If the load balancer has moved the task then limit
+                * further NUMA migrations in the short term in case
+                * this is a short-lived migration.
+                */
+               if (src_nid != dst_nid && dst_nid != p->numa_preferred_nid)
+                       p->numa_migrate_seq = 0;
+       }
+#endif
 }
 
 /*
@@ -3975,6 +4275,69 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
        return delta < (s64)sysctl_sched_migration_cost;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/* Returns true if the destination node is preferred or has incurred more faults */
+static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
+{
+       int src_nid, dst_nid;
+
+       if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+           !(env->sd->flags & SD_NUMA)) {
+               return false;
+       }
+
+       src_nid = cpu_to_node(env->src_cpu);
+       dst_nid = cpu_to_node(env->dst_cpu);
+
+       if (src_nid == dst_nid ||
+           p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+               return false;
+
+       if (dst_nid == p->numa_preferred_nid ||
+           task_faults(p, dst_nid) > task_faults(p, src_nid))
+               return true;
+
+       return false;
+}
+
+
+static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+{
+       int src_nid, dst_nid;
+
+       if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
+               return false;
+
+       if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+               return false;
+
+       src_nid = cpu_to_node(env->src_cpu);
+       dst_nid = cpu_to_node(env->dst_cpu);
+
+       if (src_nid == dst_nid ||
+           p->numa_migrate_seq >= sysctl_numa_balancing_settle_count)
+               return false;
+
+       if (task_faults(p, dst_nid) < task_faults(p, src_nid))
+               return true;
+
+       return false;
+}
+
+#else
+static inline bool migrate_improves_locality(struct task_struct *p,
+                                            struct lb_env *env)
+{
+       return false;
+}
+
+static inline bool migrate_degrades_locality(struct task_struct *p,
+                                            struct lb_env *env)
+{
+       return false;
+}
+#endif
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -3997,6 +4360,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
                schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
 
+               env->flags |= LBF_SOME_PINNED;
+
                /*
                 * Remember if this task can be migrated to any other cpu in
                 * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4370,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
                 * Also avoid computing new_dst_cpu if we have already computed
                 * one in current iteration.
                 */
-               if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
+               if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
                        return 0;
 
                /* Prevent to re-select dst_cpu via env's cpus */
                for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
                        if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
-                               env->flags |= LBF_SOME_PINNED;
+                               env->flags |= LBF_DST_PINNED;
                                env->new_dst_cpu = cpu;
                                break;
                        }
@@ -4030,11 +4395,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
        /*
         * Aggressive migration if:
-        * 1) task is cache cold, or
-        * 2) too many balance attempts have failed.
+        * 1) the destination numa node is preferred
+        * 2) task is cache cold, or
+        * 3) too many balance attempts have failed.
         */
-
        tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
+       if (!tsk_cache_hot)
+               tsk_cache_hot = migrate_degrades_locality(p, env);
+
+       if (migrate_improves_locality(p, env)) {
+#ifdef CONFIG_SCHEDSTATS
+               if (tsk_cache_hot) {
+                       schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+                       schedstat_inc(p, se.statistics.nr_forced_migrations);
+               }
+#endif
+               return 1;
+       }
+
        if (!tsk_cache_hot ||
                env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
 
@@ -4242,7 +4620,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
        }
 
        if (!se) {
-               cfs_rq->h_load = rq->avg.load_avg_contrib;
+               cfs_rq->h_load = cfs_rq->runnable_load_avg;
                cfs_rq->last_h_load_update = now;
        }
 
@@ -4447,7 +4825,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
 {
        struct sched_domain *child = sd->child;
        struct sched_group *group, *sdg = sd->groups;
-       unsigned long power;
+       unsigned long power, power_orig;
        unsigned long interval;
 
        interval = msecs_to_jiffies(sd->balance_interval);
@@ -4459,7 +4837,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
                return;
        }
 
-       power = 0;
+       power_orig = power = 0;
 
        if (child->flags & SD_OVERLAP) {
                /*
@@ -4467,8 +4845,12 @@ void update_group_power(struct sched_domain *sd, int cpu)
                 * span the current group.
                 */
 
-               for_each_cpu(cpu, sched_group_cpus(sdg))
-                       power += power_of(cpu);
+               for_each_cpu(cpu, sched_group_cpus(sdg)) {
+                       struct sched_group *sg = cpu_rq(cpu)->sd->groups;
+
+                       power_orig += sg->sgp->power_orig;
+                       power += sg->sgp->power;
+               }
        } else  {
                /*
                 * !SD_OVERLAP domains can assume that child groups
@@ -4477,12 +4859,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
 
                group = child->groups;
                do {
+                       power_orig += group->sgp->power_orig;
                        power += group->sgp->power;
                        group = group->next;
                } while (group != child->groups);
        }
 
-       sdg->sgp->power_orig = sdg->sgp->power = power;
+       sdg->sgp->power_orig = power_orig;
+       sdg->sgp->power = power;
 }
 
 /*
@@ -4526,13 +4910,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * cpu 3 and leave one of the cpus in the second group unused.
  *
  * The current solution to this issue is detecting the skew in the first group
- * by noticing it has a cpu that is overloaded while the remaining cpus are
- * idle -- or rather, there's a distinct imbalance in the cpus; see
- * sg_imbalanced().
+ * by noticing the lower domain failed to reach balance and had difficulty
+ * moving tasks due to affinity constraints.
  *
  * When this is so detected; this group becomes a candidate for busiest; see
  * update_sd_pick_busiest(). And calculate_imbalance() and
- * find_busiest_group() avoid some of the usual balance conditional to allow it
+ * find_busiest_group() avoid some of the usual balance conditions to allow it
  * to create an effective group imbalance.
  *
  * This is a somewhat tricky proposition since the next run might not find the
@@ -4540,49 +4923,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
  * subtle and fragile situation.
  */
 
-struct sg_imb_stats {
-       unsigned long max_nr_running, min_nr_running;
-       unsigned long max_cpu_load, min_cpu_load;
-};
-
-static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
+static inline int sg_imbalanced(struct sched_group *group)
 {
-       sgi->max_cpu_load = sgi->max_nr_running = 0UL;
-       sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
+       return group->sgp->imbalance;
 }
 
-static inline void
-update_sg_imb_stats(struct sg_imb_stats *sgi,
-                   unsigned long load, unsigned long nr_running)
+/*
+ * Compute the group capacity.
+ *
+ * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
+ * first dividing out the smt factor to compute the actual number of cores,
+ * and limiting the power-based unit capacity with that.
+ */
+static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
 {
-       if (load > sgi->max_cpu_load)
-               sgi->max_cpu_load = load;
-       if (sgi->min_cpu_load > load)
-               sgi->min_cpu_load = load;
+       unsigned int capacity, smt, cpus;
+       unsigned int power, power_orig;
 
-       if (nr_running > sgi->max_nr_running)
-               sgi->max_nr_running = nr_running;
-       if (sgi->min_nr_running > nr_running)
-               sgi->min_nr_running = nr_running;
-}
+       power = group->sgp->power;
+       power_orig = group->sgp->power_orig;
+       cpus = group->group_weight;
 
-static inline int
-sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
-{
-       /*
-        * Consider the group unbalanced when the imbalance is larger
-        * than the average weight of a task.
-        *
-        * APZ: with cgroup the avg task weight can vary wildly and
-        *      might not be a suitable number - should we keep a
-        *      normalized nr_running number somewhere that negates
-        *      the hierarchy?
-        */
-       if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
-           (sgi->max_nr_running - sgi->min_nr_running) > 1)
-               return 1;
+       /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
+       smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
+       capacity = cpus / smt; /* cores */
 
-       return 0;
+       capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
+       if (!capacity)
+               capacity = fix_small_capacity(env->sd, group);
+
+       return capacity;
 }
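A worked example of sg_capacity() (figures assumed, not from the patch): with SMT2 threads reporting roughly 589/1024 of a full core each, a 4-CPU group has power_orig = 2356, so smt = ceil(4096/2356) = 2 and capacity = 4/2 = 2 cores, which is then clamped by the current power:

/* Worked numbers for the capacity calculation; 589 per SMT2 thread is an assumption. */
#include <stdio.h>

int main(void)
{
	unsigned int cpus = 4;
	unsigned int power_orig = 4 * 589, power = 4 * 589;
	unsigned int smt = (1024 * cpus + power_orig - 1) / power_orig;	/* DIV_ROUND_UP */
	unsigned int capacity = cpus / smt;				/* cores: 2 */
	unsigned int by_power = (power + 512) / 1024;			/* DIV_ROUND_CLOSEST */

	printf("capacity=%u\n", capacity < by_power ? capacity : by_power);
	return 0;
}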
 
 /**
@@ -4597,12 +4967,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                        struct sched_group *group, int load_idx,
                        int local_group, struct sg_lb_stats *sgs)
 {
-       struct sg_imb_stats sgi;
        unsigned long nr_running;
        unsigned long load;
        int i;
 
-       init_sg_imb_stats(&sgi);
+       memset(sgs, 0, sizeof(*sgs));
 
        for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
                struct rq *rq = cpu_rq(i);
@@ -4610,12 +4979,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                nr_running = rq->nr_running;
 
                /* Bias balancing toward cpus of our domain */
-               if (local_group) {
+               if (local_group)
                        load = target_load(i, load_idx);
-               } else {
+               else
                        load = source_load(i, load_idx);
-                       update_sg_imb_stats(&sgi, load, nr_running);
-               }
 
                sgs->group_load += load;
                sgs->sum_nr_running += nr_running;
@@ -4624,10 +4991,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                        sgs->idle_cpus++;
        }
 
-       if (local_group && (env->idle != CPU_NEWLY_IDLE ||
-                       time_after_eq(jiffies, group->sgp->next_update)))
-               update_group_power(env->sd, env->dst_cpu);
-
        /* Adjust by relative CPU power of the group */
        sgs->group_power = group->sgp->power;
        sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
@@ -4635,16 +4998,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
        if (sgs->sum_nr_running)
                sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
 
-       sgs->group_imb = sg_imbalanced(sgs, &sgi);
-
-       sgs->group_capacity =
-               DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
-
-       if (!sgs->group_capacity)
-               sgs->group_capacity = fix_small_capacity(env->sd, group);
-
        sgs->group_weight = group->group_weight;
 
+       sgs->group_imb = sg_imbalanced(group);
+       sgs->group_capacity = sg_capacity(env, group);
+
        if (sgs->group_capacity > sgs->sum_nr_running)
                sgs->group_has_capacity = 1;
 }
@@ -4720,11 +5078,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
                if (local_group) {
                        sds->local = sg;
                        sgs = &sds->local_stat;
+
+                       if (env->idle != CPU_NEWLY_IDLE ||
+                           time_after_eq(jiffies, sg->sgp->next_update))
+                               update_group_power(env->sd, env->dst_cpu);
                }
 
-               memset(sgs, 0, sizeof(*sgs));
                update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
 
+               if (local_group)
+                       goto next_group;
+
                /*
                 * In case the child domain prefers tasks go to siblings
                 * first, lower the sg capacity to one so that we'll try
@@ -4735,19 +5099,20 @@ static inline void update_sd_lb_stats(struct lb_env *env,
                 * heaviest group when it is already under-utilized (possible
                 * with a large weight task outweighs the tasks on the system).
                 */
-               if (prefer_sibling && !local_group &&
-                               sds->local && sds->local_stat.group_has_capacity)
+               if (prefer_sibling && sds->local &&
+                   sds->local_stat.group_has_capacity)
                        sgs->group_capacity = min(sgs->group_capacity, 1U);
 
-               /* Now, start updating sd_lb_stats */
-               sds->total_load += sgs->group_load;
-               sds->total_pwr += sgs->group_power;
-
-               if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
+               if (update_sd_pick_busiest(env, sds, sg, sgs)) {
                        sds->busiest = sg;
                        sds->busiest_stat = *sgs;
                }
 
+next_group:
+               /* Now, start updating sd_lb_stats */
+               sds->total_load += sgs->group_load;
+               sds->total_pwr += sgs->group_power;
+
                sg = sg->next;
        } while (sg != env->sd->groups);
 }
@@ -4823,8 +5188,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
                (busiest->load_per_task * SCHED_POWER_SCALE) /
                busiest->group_power;
 
-       if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >=
-           (scaled_busy_load_per_task * imbn)) {
+       if (busiest->avg_load + scaled_busy_load_per_task >=
+           local->avg_load + (scaled_busy_load_per_task * imbn)) {
                env->imbalance = busiest->load_per_task;
                return;
        }
@@ -4896,7 +5261,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
-       if (busiest->avg_load < sds->avg_load) {
+       if (busiest->avg_load <= sds->avg_load ||
+           local->avg_load >= sds->avg_load) {
                env->imbalance = 0;
                return fix_small_imbalance(env, sds);
        }
@@ -5163,6 +5529,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                        int *continue_balancing)
 {
        int ld_moved, cur_ld_moved, active_balance = 0;
+       struct sched_domain *sd_parent = sd->parent;
        struct sched_group *group;
        struct rq *busiest;
        unsigned long flags;
@@ -5267,17 +5634,17 @@ more_balance:
                 * moreover subsequent load balance cycles should correct the
                 * excess load moved.
                 */
-               if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
+               if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
+
+                       /* Prevent to re-select dst_cpu via env's cpus */
+                       cpumask_clear_cpu(env.dst_cpu, env.cpus);
 
                        env.dst_rq       = cpu_rq(env.new_dst_cpu);
                        env.dst_cpu      = env.new_dst_cpu;
-                       env.flags       &= ~LBF_SOME_PINNED;
+                       env.flags       &= ~LBF_DST_PINNED;
                        env.loop         = 0;
                        env.loop_break   = sched_nr_migrate_break;
 
-                       /* Prevent to re-select dst_cpu via env's cpus */
-                       cpumask_clear_cpu(env.dst_cpu, env.cpus);
-
                        /*
                         * Go back to "more_balance" rather than "redo" since we
                         * need to continue with same src_cpu.
@@ -5285,6 +5652,18 @@ more_balance:
                        goto more_balance;
                }
 
+               /*
+                * We failed to reach balance because of affinity.
+                */
+               if (sd_parent) {
+                       int *group_imbalance = &sd_parent->groups->sgp->imbalance;
+
+                       if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
+                               *group_imbalance = 1;
+                       else if (*group_imbalance)
+                               *group_imbalance = 0;
+               }
+
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(env.flags & LBF_ALL_PINNED)) {
                        cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5392,6 +5771,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
        struct sched_domain *sd;
        int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
+       u64 curr_cost = 0;
 
        this_rq->idle_stamp = rq_clock(this_rq);
 
@@ -5408,15 +5788,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
                int continue_balancing = 1;
+               u64 t0, domain_cost;
 
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
+               if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
+                       break;
+
                if (sd->flags & SD_BALANCE_NEWIDLE) {
+                       t0 = sched_clock_cpu(this_cpu);
+
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance(this_cpu, this_rq,
                                                   sd, CPU_NEWLY_IDLE,
                                                   &continue_balancing);
+
+                       domain_cost = sched_clock_cpu(this_cpu) - t0;
+                       if (domain_cost > sd->max_newidle_lb_cost)
+                               sd->max_newidle_lb_cost = domain_cost;
+
+                       curr_cost += domain_cost;
                }
 
                interval = msecs_to_jiffies(sd->balance_interval);
@@ -5438,6 +5830,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
                 */
                this_rq->next_balance = next_balance;
        }
+
+       if (curr_cost > this_rq->max_idle_balance_cost)
+               this_rq->max_idle_balance_cost = curr_cost;
 }
 
 /*
@@ -5661,15 +6056,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
        /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
-       int need_serialize;
+       int need_serialize, need_decay = 0;
+       u64 max_cost = 0;
 
        update_blocked_averages(cpu);
 
        rcu_read_lock();
        for_each_domain(cpu, sd) {
+               /*
+                * Decay the newidle max times here because this is a regular
+                * visit to all the domains. Decay ~1% per second.
+                */
+               if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
+                       sd->max_newidle_lb_cost =
+                               (sd->max_newidle_lb_cost * 253) / 256;
+                       sd->next_decay_max_lb_cost = jiffies + HZ;
+                       need_decay = 1;
+               }
+               max_cost += sd->max_newidle_lb_cost;
+
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
+               /*
+                * Stop the load balance at this level. There is another
+                * CPU in our sched group which is doing load balancing more
+                * actively.
+                */
+               if (!continue_balancing) {
+                       if (need_decay)
+                               continue;
+                       break;
+               }
+
                interval = sd->balance_interval;
                if (idle != CPU_IDLE)
                        interval *= sd->busy_factor;
@@ -5688,7 +6107,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
                        if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
                                /*
-                                * The LBF_SOME_PINNED logic could have changed
+                                * The LBF_DST_PINNED logic could have changed
                                 * env->dst_cpu, so we can't know our idle
                                 * state even if we migrated tasks. Update it.
                                 */
@@ -5703,14 +6122,14 @@ out:
                        next_balance = sd->last_balance + interval;
                        update_next_balance = 1;
                }
-
+       }
+       if (need_decay) {
                /*
-                * Stop the load balance at this level. There is another
-                * CPU in our sched group which is doing load balancing more
-                * actively.
+                * Ensure the rq-wide value also decays but keep it at a
+                * reasonable floor to avoid funnies with rq->avg_idle.
                 */
-               if (!continue_balancing)
-                       break;
+               rq->max_idle_balance_cost =
+                       max((u64)sysctl_sched_migration_cost, max_cost);
        }
        rcu_read_unlock();