sched: Fix race in migrate_swap_stop()
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ac63c9a995a3570e0ad73a20b28c23cb972a963..a972acd468b0838e2d08a4103db2c7e5476a2d6c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
  * might also involve a cross-CPU call to trigger the scheduler on
  * the target CPU.
  */
-#ifdef CONFIG_SMP
 void resched_task(struct task_struct *p)
 {
        int cpu;
 
-       assert_raw_spin_locked(&task_rq(p)->lock);
+       lockdep_assert_held(&task_rq(p)->lock);
 
        if (test_tsk_need_resched(p))
                return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
        set_tsk_need_resched(p);
 
        cpu = task_cpu(p);
-       if (cpu == smp_processor_id())
+       if (cpu == smp_processor_id()) {
+               set_preempt_need_resched();
                return;
+       }
 
        /* NEED_RESCHED must be visible before we test polling */
        smp_mb();
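
The barrier here is half of a handshake with the idle loop: the waker must make NEED_RESCHED visible before it reads the target's polling state, and the idle CPU must advertise polling before it re-checks the flag; otherwise each side can miss the other and the IPI is never sent. A minimal user-space sketch of that store/fence/load pattern, using C11 atomics and illustrative names rather than the kernel's helpers:

/*
 * Hypothetical sketch of the resched_task()/idle-poll ordering.
 * Both sides publish their own state, issue a full fence (the
 * smp_mb() analogue), and only then read the other side's state.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int need_resched;	/* set by the waker */
static atomic_int polling;	/* set by the idle CPU */

static void waker(void)		/* resched_task() analogue */
{
	atomic_store_explicit(&need_resched, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() */
	if (!atomic_load_explicit(&polling, memory_order_relaxed))
		printf("target not polling: send reschedule IPI\n");
	else
		printf("target is polling: it will notice, skip the IPI\n");
}

static void idler(void)		/* idle loop analogue */
{
	atomic_store_explicit(&polling, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&need_resched, memory_order_relaxed))
		printf("resched pending: leave the idle loop\n");
}

int main(void)
{
	idler();
	waker();
	return 0;
}
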
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
+#ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
        }
 }
 
-#else /* !CONFIG_SMP */
-void resched_task(struct task_struct *p)
-{
-       assert_raw_spin_locked(&task_rq(p)->lock);
-       set_tsk_need_resched(p);
-}
 #endif /* CONFIG_SMP */
 
 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        update_rq_clock(rq);
-       sched_info_queued(p);
+       sched_info_queued(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        update_rq_clock(rq);
-       sched_info_dequeued(p);
+       sched_info_dequeued(rq, p);
        p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         * ttwu() will sort out the placement.
         */
        WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-                       !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+                       !(task_preempt_count(p) & PREEMPT_ACTIVE));
 
 #ifdef CONFIG_LOCKDEP
        /*
@@ -1017,6 +1013,106 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
        __set_task_cpu(p, new_cpu);
 }
 
+static void __migrate_swap_task(struct task_struct *p, int cpu)
+{
+       if (p->on_rq) {
+               struct rq *src_rq, *dst_rq;
+
+               src_rq = task_rq(p);
+               dst_rq = cpu_rq(cpu);
+
+               deactivate_task(src_rq, p, 0);
+               set_task_cpu(p, cpu);
+               activate_task(dst_rq, p, 0);
+               check_preempt_curr(dst_rq, p, 0);
+       } else {
+               /*
+                * Task isn't running anymore; make it appear like we migrated
+                * it before it went to sleep. This means on wakeup we make the
+                * previous cpu our target instead of where it really is.
+                */
+               p->wake_cpu = cpu;
+       }
+}
+
+struct migration_swap_arg {
+       struct task_struct *src_task, *dst_task;
+       int src_cpu, dst_cpu;
+};
+
+static int migrate_swap_stop(void *data)
+{
+       struct migration_swap_arg *arg = data;
+       struct rq *src_rq, *dst_rq;
+       int ret = -EAGAIN;
+
+       src_rq = cpu_rq(arg->src_cpu);
+       dst_rq = cpu_rq(arg->dst_cpu);
+
+       double_raw_lock(&arg->src_task->pi_lock,
+                       &arg->dst_task->pi_lock);
+       double_rq_lock(src_rq, dst_rq);
+       if (task_cpu(arg->dst_task) != arg->dst_cpu)
+               goto unlock;
+
+       if (task_cpu(arg->src_task) != arg->src_cpu)
+               goto unlock;
+
+       if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+               goto unlock;
+
+       if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+               goto unlock;
+
+       __migrate_swap_task(arg->src_task, arg->dst_cpu);
+       __migrate_swap_task(arg->dst_task, arg->src_cpu);
+
+       ret = 0;
+
+unlock:
+       double_rq_unlock(src_rq, dst_rq);
+       raw_spin_unlock(&arg->dst_task->pi_lock);
+       raw_spin_unlock(&arg->src_task->pi_lock);
+
+       return ret;
+}
+
+/*
+ * Cross migrate two tasks
+ */
+int migrate_swap(struct task_struct *cur, struct task_struct *p)
+{
+       struct migration_swap_arg arg;
+       int ret = -EINVAL;
+
+       get_online_cpus();
+
+       arg = (struct migration_swap_arg){
+               .src_task = cur,
+               .src_cpu = task_cpu(cur),
+               .dst_task = p,
+               .dst_cpu = task_cpu(p),
+       };
+
+       if (arg.src_cpu == arg.dst_cpu)
+               goto out;
+
+       if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
+               goto out;
+
+       if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+               goto out;
+
+       if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+               goto out;
+
+       ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
+
+out:
+       put_online_cpus();
+       return ret;
+}
+
 struct migration_arg {
        struct task_struct *task;
        int dest_cpu;
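
migrate_swap() makes its CPU and affinity checks without holding any locks; migrate_swap_stop() then repeats them with both runqueue locks (and both pi_locks) held and returns -EAGAIN if the world changed in between. A hypothetical user-space sketch of that discipline, with the locks taken in a fixed address order to avoid ABBA deadlock and the lockless snapshot re-validated under the locks (all names are illustrative, not the kernel's):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct item {
	pthread_mutex_t lock;
	int cpu;			/* state sampled locklessly before the swap */
};

static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a > b) {			/* fixed order: lower address first */
		pthread_mutex_t *t = a; a = b; b = t;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

static int swap_cpus(struct item *src, struct item *dst,
		     int expect_src_cpu, int expect_dst_cpu)
{
	int ret = -EAGAIN;

	double_lock(&src->lock, &dst->lock);

	/* Re-check under the locks: the lockless snapshot may be stale. */
	if (src->cpu != expect_src_cpu || dst->cpu != expect_dst_cpu)
		goto unlock;

	src->cpu = expect_dst_cpu;
	dst->cpu = expect_src_cpu;
	ret = 0;
unlock:
	pthread_mutex_unlock(&dst->lock);
	pthread_mutex_unlock(&src->lock);
	return ret;
}

int main(void)
{
	struct item a = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct item b = { PTHREAD_MUTEX_INITIALIZER, 2 };

	printf("swap: %d\n", swap_cpus(&a, &b, 0, 2));	/* 0: succeeded */
	printf("swap: %d\n", swap_cpus(&a, &b, 0, 2));	/* -EAGAIN: snapshot stale */
	return 0;
}
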
@@ -1236,9 +1332,9 @@ out:
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
-       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+       cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
        /*
         * In order not to call set_task_cpu() on a blocking task we need
@@ -1330,12 +1426,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 
        if (rq->idle_stamp) {
                u64 delta = rq_clock(rq) - rq->idle_stamp;
-               u64 max = 2*sysctl_sched_migration_cost;
+               u64 max = 2*rq->max_idle_balance_cost;
 
-               if (delta > max)
+               update_avg(&rq->avg_idle, delta);
+
+               if (rq->avg_idle > max)
                        rq->avg_idle = max;
-               else
-                       update_avg(&rq->avg_idle, delta);
+
                rq->idle_stamp = 0;
        }
 #endif
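
With this change avg_idle becomes a clamped running average: every exit from idle feeds the observed idle period into update_avg(), and the result is capped at twice the runqueue's max_idle_balance_cost so a single long idle spell cannot convince newidle balancing that it has unlimited time. A small sketch of that update, assuming update_avg() is the usual new = old + (sample - old)/8 exponential average (an assumption, not shown in this diff):

#include <stdint.h>
#include <stdio.h>

/* Assumed form of the kernel's update_avg(): new = old + (sample - old)/8. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)sample - (int64_t)*avg;

	*avg += diff / 8;
}

int main(void)
{
	uint64_t max_idle_balance_cost = 500000;		/* ns, illustrative */
	uint64_t avg_idle = max_idle_balance_cost;
	uint64_t samples[] = { 200000, 80000000, 120000 };	/* one huge outlier */

	for (unsigned int i = 0; i < 3; i++) {
		update_avg(&avg_idle, samples[i]);
		if (avg_idle > 2 * max_idle_balance_cost)	/* the new clamp */
			avg_idle = 2 * max_idle_balance_cost;
		printf("sample=%llu avg_idle=%llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)avg_idle);
	}
	return 0;
}
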
@@ -1396,6 +1493,14 @@ static void sched_ttwu_pending(void)
 
 void scheduler_ipi(void)
 {
+       /*
+        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+        * TIF_NEED_RESCHED remotely (for the first time) will also send
+        * this IPI.
+        */
+       if (tif_need_resched())
+               set_preempt_need_resched();
+
        if (llist_empty(&this_rq()->wake_list)
                        && !tick_nohz_full_cpu(smp_processor_id())
                        && !got_nohz_idle_kick())
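
The comment added above describes the new folding of TIF_NEED_RESCHED into the preempt count: the resched state shares a word with the nesting count and is stored inverted, so a single compare against zero answers both "may we preempt?" and "is a reschedule pending?", and a remote waker only needs to send this IPI the first time it sets the flag. A hedged sketch of that folding, with an illustrative bit layout rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define PREEMPT_NEED_RESCHED	0x80000000u	/* inverted: set = no resched needed */

static unsigned int preempt_count = PREEMPT_NEED_RESCHED;	/* preemptible, idle */

static void set_preempt_need_resched(void)   { preempt_count &= ~PREEMPT_NEED_RESCHED; }
static void clear_preempt_need_resched(void) { preempt_count |=  PREEMPT_NEED_RESCHED; }
static void preempt_disable(void)            { preempt_count += 1; }
static bool preempt_enable_and_test(void)    { return --preempt_count == 0; }

int main(void)
{
	preempt_disable();
	set_preempt_need_resched();	/* e.g. from resched_task() */

	/* One decrement-and-test-for-zero replaces two separate checks. */
	if (preempt_enable_and_test())
		printf("would call __schedule()\n");

	clear_preempt_need_resched();	/* done once the reschedule happens */
	return 0;
}
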
@@ -1513,7 +1618,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
        if (p->sched_class->task_waking)
                p->sched_class->task_waking(p);
 
-       cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+       cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
        if (task_cpu(p) != cpu) {
                wake_flags |= WF_MIGRATED;
                set_task_cpu(p, cpu);
@@ -1595,7 +1700,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
  *
  * __sched_fork() is basic setup used by init_idle() too:
  */
-static void __sched_fork(struct task_struct *p)
+static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
        p->on_rq                        = 0;
 
@@ -1619,16 +1724,24 @@ static void __sched_fork(struct task_struct *p)
 
 #ifdef CONFIG_NUMA_BALANCING
        if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
-               p->mm->numa_next_scan = jiffies;
-               p->mm->numa_next_reset = jiffies;
+               p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
                p->mm->numa_scan_seq = 0;
        }
 
+       if (clone_flags & CLONE_VM)
+               p->numa_preferred_nid = current->numa_preferred_nid;
+       else
+               p->numa_preferred_nid = -1;
+
        p->node_stamp = 0ULL;
        p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-       p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
        p->numa_scan_period = sysctl_numa_balancing_scan_delay;
        p->numa_work.next = &p->numa_work;
+       p->numa_faults = NULL;
+       p->numa_faults_buffer = NULL;
+
+       INIT_LIST_HEAD(&p->numa_entry);
+       p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
@@ -1654,12 +1767,12 @@ void set_numabalancing_state(bool enabled)
 /*
  * fork()/clone()-time setup:
  */
-void sched_fork(struct task_struct *p)
+void sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
        unsigned long flags;
        int cpu = get_cpu();
 
-       __sched_fork(p);
+       __sched_fork(clone_flags, p);
        /*
         * We mark the process as running here. This guarantees that
         * nobody will actually run it, and a signal or other external
@@ -1717,10 +1830,7 @@ void sched_fork(struct task_struct *p)
 #if defined(CONFIG_SMP)
        p->on_cpu = 0;
 #endif
-#ifdef CONFIG_PREEMPT_COUNT
-       /* Want to start with kernel preemption disabled. */
-       task_thread_info(p)->preempt_count = 1;
-#endif
+       init_task_preempt_count(p);
 #ifdef CONFIG_SMP
        plist_node_init(&p->pushable_tasks, MAX_PRIO);
 #endif
@@ -1747,7 +1857,7 @@ void wake_up_new_task(struct task_struct *p)
         *  - cpus_allowed can change in the fork path
         *  - any previously selected cpu might disappear through hotplug
         */
-       set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
+       set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 
        /* Initialize new task's runnable average */
@@ -1838,7 +1948,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
 {
        trace_sched_switch(prev, next);
-       sched_info_switch(prev, next);
+       sched_info_switch(rq, prev, next);
        perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
        prepare_lock_switch(rq, next);
@@ -1890,6 +2000,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        if (mm)
                mmdrop(mm);
        if (unlikely(prev_state == TASK_DEAD)) {
+               task_numa_free(prev);
+
                /*
                 * Remove function-return probe instances associated with this
                 * task and put them back on the free list.
@@ -2073,7 +2185,7 @@ void sched_exec(void)
        int dest_cpu;
 
        raw_spin_lock_irqsave(&p->pi_lock, flags);
-       dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
+       dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
        if (dest_cpu == smp_processor_id())
                goto unlock;
 
@@ -2215,7 +2327,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                defined(CONFIG_PREEMPT_TRACER))
 
-void __kprobes add_preempt_count(int val)
+void __kprobes preempt_count_add(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
        /*
@@ -2224,7 +2336,7 @@ void __kprobes add_preempt_count(int val)
        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
                return;
 #endif
-       preempt_count() += val;
+       __preempt_count_add(val);
 #ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Spinlock count overflowing soon?
@@ -2235,9 +2347,9 @@ void __kprobes add_preempt_count(int val)
        if (preempt_count() == val)
                trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
-EXPORT_SYMBOL(add_preempt_count);
+EXPORT_SYMBOL(preempt_count_add);
 
-void __kprobes sub_preempt_count(int val)
+void __kprobes preempt_count_sub(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
        /*
@@ -2255,9 +2367,9 @@ void __kprobes sub_preempt_count(int val)
 
        if (preempt_count() == val)
                trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
-       preempt_count() -= val;
+       __preempt_count_sub(val);
 }
-EXPORT_SYMBOL(sub_preempt_count);
+EXPORT_SYMBOL(preempt_count_sub);
 
 #endif
 
@@ -2430,6 +2542,7 @@ need_resched:
        put_prev_task(rq, prev);
        next = pick_next_task(rq);
        clear_tsk_need_resched(prev);
+       clear_preempt_need_resched();
        rq->skip_clock_update = 0;
 
        if (likely(prev != next)) {
@@ -2520,9 +2633,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
                return;
 
        do {
-               add_preempt_count_notrace(PREEMPT_ACTIVE);
+               __preempt_count_add(PREEMPT_ACTIVE);
                __schedule();
-               sub_preempt_count_notrace(PREEMPT_ACTIVE);
+               __preempt_count_sub(PREEMPT_ACTIVE);
 
                /*
                 * Check again in case we missed a preemption opportunity
@@ -2541,20 +2654,19 @@ EXPORT_SYMBOL(preempt_schedule);
  */
 asmlinkage void __sched preempt_schedule_irq(void)
 {
-       struct thread_info *ti = current_thread_info();
        enum ctx_state prev_state;
 
        /* Catch callers which need to be fixed */
-       BUG_ON(ti->preempt_count || !irqs_disabled());
+       BUG_ON(preempt_count() || !irqs_disabled());
 
        prev_state = exception_enter();
 
        do {
-               add_preempt_count(PREEMPT_ACTIVE);
+               __preempt_count_add(PREEMPT_ACTIVE);
                local_irq_enable();
                __schedule();
                local_irq_disable();
-               sub_preempt_count(PREEMPT_ACTIVE);
+               __preempt_count_sub(PREEMPT_ACTIVE);
 
                /*
                 * Check again in case we missed a preemption opportunity
@@ -3794,16 +3906,11 @@ SYSCALL_DEFINE0(sched_yield)
        return 0;
 }
 
-static inline int should_resched(void)
-{
-       return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
-}
-
 static void __cond_resched(void)
 {
-       add_preempt_count(PREEMPT_ACTIVE);
+       __preempt_count_add(PREEMPT_ACTIVE);
        __schedule();
-       sub_preempt_count(PREEMPT_ACTIVE);
+       __preempt_count_sub(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
@@ -4186,7 +4293,7 @@ void init_idle(struct task_struct *idle, int cpu)
 
        raw_spin_lock_irqsave(&rq->lock, flags);
 
-       __sched_fork(idle);
+       __sched_fork(0, idle);
        idle->state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
 
@@ -4212,7 +4319,7 @@ void init_idle(struct task_struct *idle, int cpu)
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 
        /* Set the preempt count _outside_ the spinlocks! */
-       task_thread_info(idle)->preempt_count = 0;
+       init_idle_preempt_count(idle, cpu);
 
        /*
         * The idle tasks have their own, simple scheduling class:
@@ -4346,6 +4453,53 @@ fail:
        return ret;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/* Migrate current task p to target_cpu */
+int migrate_task_to(struct task_struct *p, int target_cpu)
+{
+       struct migration_arg arg = { p, target_cpu };
+       int curr_cpu = task_cpu(p);
+
+       if (curr_cpu == target_cpu)
+               return 0;
+
+       if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+               return -EINVAL;
+
+       /* TODO: This is not properly updating schedstats */
+
+       return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
+}
+
+/*
+ * Requeue a task on a given node and accurately track the number of NUMA
+ * tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+       struct rq *rq;
+       unsigned long flags;
+       bool on_rq, running;
+
+       rq = task_rq_lock(p, &flags);
+       on_rq = p->on_rq;
+       running = task_current(rq, p);
+
+       if (on_rq)
+               dequeue_task(rq, p, 0);
+       if (running)
+               p->sched_class->put_prev_task(rq, p);
+
+       p->numa_preferred_nid = nid;
+
+       if (running)
+               p->sched_class->set_curr_task(rq);
+       if (on_rq)
+               enqueue_task(rq, p, 0);
+       task_rq_unlock(rq, p, &flags);
+}
+#endif
+
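
sched_setnuma() follows the scheduler's standard recipe for changing an attribute the runqueue may depend on: dequeue the task (and put_prev_task() if it is current), update the field, then set_curr_task()/enqueue it again so per-class accounting stays consistent. A toy user-space analogue of that dequeue/modify/requeue discipline, using a sorted list as a stand-in runqueue (purely illustrative):

#include <stdio.h>

struct task { int nid; struct task *next; };

/* Keep the "runqueue" sorted by preferred node id. */
static void enqueue(struct task **q, struct task *p)
{
	while (*q && (*q)->nid <= p->nid)
		q = &(*q)->next;
	p->next = *q;
	*q = p;
}

static void dequeue(struct task **q, struct task *p)
{
	while (*q != p)
		q = &(*q)->next;
	*q = p->next;
}

/* Analogue of sched_setnuma(): take off, change the key, put back. */
static void set_nid(struct task **q, struct task *p, int nid)
{
	dequeue(q, p);
	p->nid = nid;
	enqueue(q, p);
}

int main(void)
{
	struct task a = { 0, NULL }, b = { 1, NULL };
	struct task *q = NULL;

	enqueue(&q, &a);
	enqueue(&q, &b);
	set_nid(&q, &a, 3);		/* a must move behind b */
	for (struct task *t = q; t; t = t->next)
		printf("nid=%d\n", t->nid);
	return 0;
}
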
 /*
  * migration_cpu_stop - this will be executed by a highprio stopper thread
  * and performs thread migration by bumping thread off CPU then
@@ -5119,6 +5273,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -5135,6 +5290,9 @@ static void update_top_cache_domain(int cpu)
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_size, cpu) = size;
        per_cpu(sd_llc_id, cpu) = id;
+
+       sd = lowest_flag_domain(cpu, SD_NUMA);
+       rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 }
 
 /*
@@ -5654,6 +5812,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
                                        | 0*SD_SHARE_PKG_RESOURCES
                                        | 1*SD_SERIALIZE
                                        | 0*SD_PREFER_SIBLING
+                                       | 1*SD_NUMA
                                        | sd_local_flags(level)
                                        ,
                .last_balance           = jiffies,
@@ -6505,6 +6664,7 @@ void __init sched_init(void)
                rq->online = 0;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
+               rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
                INIT_LIST_HEAD(&rq->cfs_tasks);