sched/numa: Cure update_numa_stats() vs. hotplug
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index df77c605c7a6e389a7a15605cc45067584132193..201be782b5b3cae8be30bd01783fc217dbee4944 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1000,7 +1000,7 @@ struct numa_stats {
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-       int cpu;
+       int cpu, cpus = 0;
 
        memset(ns, 0, sizeof(*ns));
        for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1009,8 +1009,21 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
                ns->nr_running += rq->nr_running;
                ns->load += weighted_cpuload(cpu);
                ns->power += power_of(cpu);
+
+               cpus++;
        }
 
+       /*
+        * If we raced with hotplug and there are no CPUs left in our mask
+        * the @ns structure is NULL'ed and task_numa_compare() will
+        * not find this node attractive.
+        *
+        * We'll either bail at !has_capacity, or we'll detect a huge imbalance
+        * and bail there.
+        */
+       if (!cpus)
+               return;
+
        ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
        ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
        ns->has_capacity = (ns->nr_running < ns->capacity);
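
Why the !cpus bail-out above matters: if CPU hotplug empties the node's cpumask between node selection and update_numa_stats(), the loop body never runs, ns->power stays 0 after the memset(), and the load scaling would divide by zero. The user-space sketch below illustrates that failure mode under simplified assumptions; numa_stats_sketch, accumulate_node_stats() and the constant per-CPU power are illustrative stand-ins, not kernel code.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

struct numa_stats_sketch {
	unsigned long load;
	unsigned long power;
};

/* Stand-in for the for_each_cpu() walk over cpumask_of_node(nid);
 * returns how many CPUs were seen, mirroring the new 'cpus' counter. */
static int accumulate_node_stats(struct numa_stats_sketch *ns,
				 const unsigned long *cpu_loads, int ncpus)
{
	int cpu, cpus = 0;

	for (cpu = 0; cpu < ncpus; cpu++) {
		ns->load += cpu_loads[cpu];
		ns->power += SCHED_POWER_SCALE;	/* power_of(cpu) stand-in */
		cpus++;
	}
	return cpus;
}

int main(void)
{
	struct numa_stats_sketch ns = { 0, 0 };

	/* Hotplug race: the node's cpumask came back empty. */
	if (!accumulate_node_stats(&ns, NULL, 0)) {
		printf("node empty, stats left zeroed\n");	/* the patch's bail-out */
		return 0;
	}

	/* Without the guard this divides by ns.power == 0. */
	ns.load = (ns.load * SCHED_POWER_SCALE) / ns.power;
	printf("scaled load: %lu\n", ns.load);
	return 0;
}
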
@@ -1201,9 +1214,21 @@ static int task_numa_migrate(struct task_struct *p)
         */
        rcu_read_lock();
        sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-       env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+       if (sd)
+               env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
        rcu_read_unlock();
 
+       /*
+        * Cpusets can break the scheduler domain tree into smaller
+        * balance domains, some of which do not cross NUMA boundaries.
+        * Tasks that are "trapped" in such domains cannot be migrated
+        * elsewhere, so there is no point in (re)trying.
+        */
+       if (unlikely(!sd)) {
+               p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+               return -EINVAL;
+       }
+
        taskweight = task_weight(p, env.src_nid);
        groupweight = group_weight(p, env.src_nid);
        update_numa_stats(&env.src_stats, env.src_nid);
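
The second hunk guards against per_cpu(sd_numa, env.src_cpu) being NULL: as the added comment notes, cpusets can carve the scheduler domain tree so that a CPU has no NUMA-spanning domain, and the old unconditional sd->imbalance_pct dereference would oops. Below is a hedged user-space sketch of the same test-before-dereference pattern; sched_domain_sketch, sd_numa_of() and migrate_sketch() are illustrative stand-ins for the kernel's RCU-protected per-CPU lookup, not the real API.

#include <stdio.h>
#include <errno.h>

struct sched_domain_sketch {
	int imbalance_pct;
};

/* Stand-in lookup: returns NULL when the CPU sits in a cpuset whose
 * balance domain does not cross NUMA boundaries. */
static struct sched_domain_sketch *sd_numa_of(int cpu,
					      struct sched_domain_sketch **table,
					      int ncpus)
{
	return (cpu >= 0 && cpu < ncpus) ? table[cpu] : NULL;
}

static int migrate_sketch(int cpu, struct sched_domain_sketch **table, int ncpus)
{
	struct sched_domain_sketch *sd = sd_numa_of(cpu, table, ncpus);
	int imbalance_pct = 100;	/* default when no NUMA domain exists */

	/* Test before dereferencing, as the patched kernel code does. */
	if (sd)
		imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;

	if (!sd) {
		/* Trapped in a non-NUMA balance domain: give up, as the
		 * patch does by pinning numa_preferred_nid. */
		return -EINVAL;
	}

	printf("cpu %d: halved imbalance_pct = %d\n", cpu, imbalance_pct);
	return 0;
}

int main(void)
{
	struct sched_domain_sketch numa = { .imbalance_pct = 125 };
	struct sched_domain_sketch *table[2] = { &numa, NULL };

	migrate_sketch(0, table, 2);	/* this CPU has a NUMA domain */
	if (migrate_sketch(1, table, 2) == -EINVAL)
		printf("cpu 1: no NUMA domain, migration skipped\n");
	return 0;
}
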