sched/numa: Cure update_numa_stats() vs. hotplug
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df77c605c7a6e389a7a15605cc45067584132193..201be782b5b3cae8be30bd01783fc217dbee4944 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1000,7 +1000,7 @@ struct numa_stats {
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-       int cpu;
+       int cpu, cpus = 0;
 
        memset(ns, 0, sizeof(*ns));
        for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1009,8 +1009,21 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
                ns->nr_running += rq->nr_running;
                ns->load += weighted_cpuload(cpu);
                ns->power += power_of(cpu);
+
+               cpus++;
        }
 
+       /*
+        * If we raced with hotplug and there are no CPUs left in our mask
+        * the @ns structure is NULL'ed and task_numa_compare() will
+        * not find this node attractive.
+        *
+        * We'll either bail at !has_capacity, or we'll detect a huge imbalance
+        * and bail there.
+        */
+       if (!cpus)
+               return;
+
        ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
        ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
        ns->has_capacity = (ns->nr_running < ns->capacity);
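
Why the !cpus bail-out above matters: if CPU hotplug empties the node's cpumask between node selection and update_numa_stats(), the loop body never runs, ns->power stays 0 after the memset(), and the load scaling would divide by zero. The user-space sketch below illustrates that failure mode under simplified assumptions; numa_stats_sketch, accumulate_node_stats() and the constant per-CPU power are illustrative stand-ins, not kernel code.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

struct numa_stats_sketch {
	unsigned long load;
	unsigned long power;
};

/* Stand-in for the for_each_cpu() walk over cpumask_of_node(nid);
 * returns how many CPUs were seen, mirroring the new 'cpus' counter. */
static int accumulate_node_stats(struct numa_stats_sketch *ns,
				 const unsigned long *cpu_loads, int ncpus)
{
	int cpu, cpus = 0;

	for (cpu = 0; cpu < ncpus; cpu++) {
		ns->load += cpu_loads[cpu];
		ns->power += SCHED_POWER_SCALE;	/* power_of(cpu) stand-in */
		cpus++;
	}
	return cpus;
}

int main(void)
{
	struct numa_stats_sketch ns = { 0, 0 };

	/* Hotplug race: the node's cpumask came back empty. */
	if (!accumulate_node_stats(&ns, NULL, 0)) {
		printf("node empty, stats left zeroed\n");	/* the patch's bail-out */
		return 0;
	}

	/* Without the guard this divides by ns.power == 0. */
	ns.load = (ns.load * SCHED_POWER_SCALE) / ns.power;
	printf("scaled load: %lu\n", ns.load);
	return 0;
}
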
@@ -1201,9 +1214,21 @@ static int task_numa_migrate(struct task_struct *p)
         */
        rcu_read_lock();
        sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-       env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+       if (sd)
+               env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
        rcu_read_unlock();
 
+       /*
+        * Cpusets can break the scheduler domain tree into smaller
+        * balance domains, some of which do not cross NUMA boundaries.
+        * Tasks that are "trapped" in such domains cannot be migrated
+        * elsewhere, so there is no point in (re)trying.
+        */
+       if (unlikely(!sd)) {
+               p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+               return -EINVAL;
+       }
+
        taskweight = task_weight(p, env.src_nid);
        groupweight = group_weight(p, env.src_nid);
        update_numa_stats(&env.src_stats, env.src_nid);
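
The second hunk guards against per_cpu(sd_numa, env.src_cpu) being NULL: as the added comment notes, cpusets can carve the scheduler domain tree so that a CPU has no NUMA-spanning domain, and the old unconditional sd->imbalance_pct dereference would oops. Below is a hedged user-space sketch of the same test-before-dereference pattern; sched_domain_sketch, sd_numa_of() and migrate_sketch() are illustrative stand-ins for the kernel's RCU-protected per-CPU lookup, not the real API.

#include <stdio.h>
#include <errno.h>

struct sched_domain_sketch {
	int imbalance_pct;
};

/* Stand-in lookup: returns NULL when the CPU sits in a cpuset whose
 * balance domain does not cross NUMA boundaries. */
static struct sched_domain_sketch *sd_numa_of(int cpu,
					      struct sched_domain_sketch **table,
					      int ncpus)
{
	return (cpu >= 0 && cpu < ncpus) ? table[cpu] : NULL;
}

static int migrate_sketch(int cpu, struct sched_domain_sketch **table, int ncpus)
{
	struct sched_domain_sketch *sd = sd_numa_of(cpu, table, ncpus);
	int imbalance_pct = 100;	/* default when no NUMA domain exists */

	/* Test before dereferencing, as the patched kernel code does. */
	if (sd)
		imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;

	if (!sd) {
		/* Trapped in a non-NUMA balance domain: give up, as the
		 * patch does by pinning numa_preferred_nid. */
		return -EINVAL;
	}

	printf("cpu %d: halved imbalance_pct = %d\n", cpu, imbalance_pct);
	return 0;
}

int main(void)
{
	struct sched_domain_sketch numa = { .imbalance_pct = 125 };
	struct sched_domain_sketch *table[2] = { &numa, NULL };

	migrate_sketch(0, table, 2);	/* this CPU has a NUMA domain */
	if (migrate_sketch(1, table, 2) == -EINVAL)
		printf("cpu 1: no NUMA domain, migration skipped\n");
	return 0;
}
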