sched: Fix race in migrate_swap_stop()
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ac63c9a995a3570e0ad73a20b28c23cb972a963..a972acd468b0838e2d08a4103db2c7e5476a2d6c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
  * might also involve a cross-CPU call to trigger the scheduler on
  * the target CPU.
  */
-#ifdef CONFIG_SMP
 void resched_task(struct task_struct *p)
 {
        int cpu;
 
-       assert_raw_spin_locked(&task_rq(p)->lock);
+       lockdep_assert_held(&task_rq(p)->lock);
 
        if (test_tsk_need_resched(p))
                return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
        set_tsk_need_resched(p);
 
        cpu = task_cpu(p);
-       if (cpu == smp_processor_id())
+       if (cpu == smp_processor_id()) {
+               set_preempt_need_resched();
                return;
+       }
 
        /* NEED_RESCHED must be visible before we test polling */
        smp_mb();
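
The barrier here is half of a handshake with the idle loop: the waker must make NEED_RESCHED visible before it reads the target's polling state, and the idle CPU must advertise polling before it re-checks the flag; otherwise each side can miss the other and the IPI is never sent. A minimal user-space sketch of that store/fence/load pattern, using C11 atomics and illustrative names rather than the kernel's helpers:

/*
 * Hypothetical sketch of the resched_task()/idle-poll ordering.
 * Both sides publish their own state, issue a full fence (the
 * smp_mb() analogue), and only then read the other side's state.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int need_resched;	/* set by the waker */
static atomic_int polling;	/* set by the idle CPU */

static void waker(void)		/* resched_task() analogue */
{
	atomic_store_explicit(&need_resched, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() */
	if (!atomic_load_explicit(&polling, memory_order_relaxed))
		printf("target not polling: send reschedule IPI\n");
	else
		printf("target is polling: it will notice, skip the IPI\n");
}

static void idler(void)		/* idle loop analogue */
{
	atomic_store_explicit(&polling, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&need_resched, memory_order_relaxed))
		printf("resched pending: leave the idle loop\n");
}

int main(void)
{
	idler();
	waker();
	return 0;
}
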
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
+#ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
        }
 }
 
-#else /* !CONFIG_SMP */
-void resched_task(struct task_struct *p)
-{
-       assert_raw_spin_locked(&task_rq(p)->lock);
-       set_tsk_need_resched(p);
-}
 #endif /* CONFIG_SMP */
 
 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        update_rq_clock(rq);
-       sched_info_queued(p);
+       sched_info_queued(rq, p);
        p->sched_class->enqueue_task(rq, p, flags);
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 {
        update_rq_clock(rq);
-       sched_info_dequeued(p);
+       sched_info_dequeued(rq, p);
        p->sched_class->dequeue_task(rq, p, flags);
 }
 
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         * ttwu() will sort out the placement.
         */
        WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-                       !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+                       !(task_preempt_count(p) & PREEMPT_ACTIVE));
 
 #ifdef CONFIG_LOCKDEP
        /*
@@ -1017,6 +1013,106 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
        __set_task_cpu(p, new_cpu);
 }
 
+static void __migrate_swap_task(struct task_struct *p, int cpu)
+{
+       if (p->on_rq) {
+               struct rq *src_rq, *dst_rq;
+
+               src_rq = task_rq(p);
+               dst_rq = cpu_rq(cpu);
+
+               deactivate_task(src_rq, p, 0);
+               set_task_cpu(p, cpu);
+               activate_task(dst_rq, p, 0);
+               check_preempt_curr(dst_rq, p, 0);
+       } else {
+               /*
+                * Task isn't running anymore; make it appear like we migrated
+                * it before it went to sleep. This means on wakeup we make the
+                * previous cpu our target instead of where it really is.
+                */
+               p->wake_cpu = cpu;
+       }
+}
+
+struct migration_swap_arg {
+       struct task_struct *src_task, *dst_task;
+       int src_cpu, dst_cpu;
+};
+
+static int migrate_swap_stop(void *data)
+{
+       struct migration_swap_arg *arg = data;
+       struct rq *src_rq, *dst_rq;
+       int ret = -EAGAIN;
+
+       src_rq = cpu_rq(arg->src_cpu);
+       dst_rq = cpu_rq(arg->dst_cpu);
+
+       double_raw_lock(&arg->src_task->pi_lock,
+                       &arg->dst_task->pi_lock);
+       double_rq_lock(src_rq, dst_rq);
+       if (task_cpu(arg->dst_task) != arg->dst_cpu)
+               goto unlock;
+
+       if (task_cpu(arg->src_task) != arg->src_cpu)
+               goto unlock;
+
+       if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+               goto unlock;
+
+       if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+               goto unlock;
+
+       __migrate_swap_task(arg->src_task, arg->dst_cpu);
+       __migrate_swap_task(arg->dst_task, arg->src_cpu);
+
+       ret = 0;
+
+unlock:
+       double_rq_unlock(src_rq, dst_rq);
+       raw_spin_unlock(&arg->dst_task->pi_lock);
+       raw_spin_unlock(&arg->src_task->pi_lock);
+
+       return ret;
+}
+
+/*
+ * Cross migrate two tasks
+ */
+int migrate_swap(struct task_struct *cur, struct task_struct *p)
+{
+       struct migration_swap_arg arg;
+       int ret = -EINVAL;
+
+       get_online_cpus();
+
+       arg = (struct migration_swap_arg){
+               .src_task = cur,
+               .src_cpu = task_cpu(cur),
+               .dst_task = p,
+               .dst_cpu = task_cpu(p),
+       };
+
+       if (arg.src_cpu == arg.dst_cpu)
+               goto out;
+
+       if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
+               goto out;
+
+       if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+               goto out;
+
+       if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+               goto out;
+
+       ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
+
+out:
+       put_online_cpus();
+       return ret;
+}
+
 struct migration_arg {
        struct task_struct *task;
        int dest_cpu;
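
migrate_swap() makes its CPU and affinity checks without holding any locks; migrate_swap_stop() then repeats them with both runqueue locks (and both pi_locks) held and returns -EAGAIN if the world changed in between. A hypothetical user-space sketch of that discipline, with the locks taken in a fixed address order to avoid ABBA deadlock and the lockless snapshot re-validated under the locks (all names are illustrative, not the kernel's):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct item {
	pthread_mutex_t lock;
	int cpu;			/* state sampled locklessly before the swap */
};

static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a > b) {			/* fixed order: lower address first */
		pthread_mutex_t *t = a; a = b; b = t;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

static int swap_cpus(struct item *src, struct item *dst,
		     int expect_src_cpu, int expect_dst_cpu)
{
	int ret = -EAGAIN;

	double_lock(&src->lock, &dst->lock);

	/* Re-check under the locks: the lockless snapshot may be stale. */
	if (src->cpu != expect_src_cpu || dst->cpu != expect_dst_cpu)
		goto unlock;

	src->cpu = expect_dst_cpu;
	dst->cpu = expect_src_cpu;
	ret = 0;
unlock:
	pthread_mutex_unlock(&dst->lock);
	pthread_mutex_unlock(&src->lock);
	return ret;
}

int main(void)
{
	struct item a = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct item b = { PTHREAD_MUTEX_INITIALIZER, 2 };

	printf("swap: %d\n", swap_cpus(&a, &b, 0, 2));	/* 0: succeeded */
	printf("swap: %d\n", swap_cpus(&a, &b, 0, 2));	/* -EAGAIN: snapshot stale */
	return 0;
}
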
@@ -1236,9 +1332,9 @@ out:
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
-       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+       cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
 
        /*
         * In order not to call set_task_cpu() on a blocking task we need
@@ -1330,12 +1426,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 
        if (rq->idle_stamp) {
                u64 delta = rq_clock(rq) - rq->idle_stamp;
-               u64 max = 2*sysctl_sched_migration_cost;
+               u64 max = 2*rq->max_idle_balance_cost;
 
-               if (delta > max)
+               update_avg(&rq->avg_idle, delta);
+
+               if (rq->avg_idle > max)
                        rq->avg_idle = max;
-               else
-                       update_avg(&rq->avg_idle, delta);
+
                rq->idle_stamp = 0;
        }
 #endif
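
With this change avg_idle becomes a clamped running average: every exit from idle feeds the observed idle period into update_avg(), and the result is capped at twice the runqueue's max_idle_balance_cost so a single long idle spell cannot convince newidle balancing that it has unlimited time. A small sketch of that update, assuming update_avg() is the usual new = old + (sample - old)/8 exponential average (an assumption, not shown in this diff):

#include <stdint.h>
#include <stdio.h>

/* Assumed form of the kernel's update_avg(): new = old + (sample - old)/8. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)sample - (int64_t)*avg;

	*avg += diff / 8;
}

int main(void)
{
	uint64_t max_idle_balance_cost = 500000;		/* ns, illustrative */
	uint64_t avg_idle = max_idle_balance_cost;
	uint64_t samples[] = { 200000, 80000000, 120000 };	/* one huge outlier */

	for (unsigned int i = 0; i < 3; i++) {
		update_avg(&avg_idle, samples[i]);
		if (avg_idle > 2 * max_idle_balance_cost)	/* the new clamp */
			avg_idle = 2 * max_idle_balance_cost;
		printf("sample=%llu avg_idle=%llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)avg_idle);
	}
	return 0;
}
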
@@ -1396,6 +1493,14 @@ static void sched_ttwu_pending(void)
 
 void scheduler_ipi(void)
 {
+       /*
+        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+        * TIF_NEED_RESCHED remotely (for the first time) will also send
+        * this IPI.
+        */
+       if (tif_need_resched())
+               set_preempt_need_resched();
+
        if (llist_empty(&this_rq()->wake_list)
                        && !tick_nohz_full_cpu(smp_processor_id())
                        && !got_nohz_idle_kick())
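
The comment added above describes the new folding of TIF_NEED_RESCHED into the preempt count: the resched state shares a word with the nesting count and is stored inverted, so a single compare against zero answers both "may we preempt?" and "is a reschedule pending?", and a remote waker only needs to send this IPI the first time it sets the flag. A hedged sketch of that folding, with an illustrative bit layout rather than the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define PREEMPT_NEED_RESCHED	0x80000000u	/* inverted: set = no resched needed */

static unsigned int preempt_count = PREEMPT_NEED_RESCHED;	/* preemptible, idle */

static void set_preempt_need_resched(void)   { preempt_count &= ~PREEMPT_NEED_RESCHED; }
static void clear_preempt_need_resched(void) { preempt_count |=  PREEMPT_NEED_RESCHED; }
static void preempt_disable(void)            { preempt_count += 1; }
static bool preempt_enable_and_test(void)    { return --preempt_count == 0; }

int main(void)
{
	preempt_disable();
	set_preempt_need_resched();	/* e.g. from resched_task() */

	/* One decrement-and-test-for-zero replaces two separate checks. */
	if (preempt_enable_and_test())
		printf("would call __schedule()\n");

	clear_preempt_need_resched();	/* done once the reschedule happens */
	return 0;
}
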
@@ -1513,7 +1618,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
        if (p->sched_class->task_waking)
                p->sched_class->task_waking(p);
 
-       cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+       cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
        if (task_cpu(p) != cpu) {
                wake_flags |= WF_MIGRATED;
                set_task_cpu(p, cpu);
@@ -1595,7 +1700,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
  *
  * __sched_fork() is basic setup used by init_idle() too:
  */
-static void __sched_fork(struct task_struct *p)
+static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
        p->on_rq                        = 0;
 
@@ -1619,16 +1724,24 @@ static void __sched_fork(struct task_struct *p)
 
 #ifdef CONFIG_NUMA_BALANCING
        if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
-               p->mm->numa_next_scan = jiffies;
-               p->mm->numa_next_reset = jiffies;
+               p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
                p->mm->numa_scan_seq = 0;
        }
 
+       if (clone_flags & CLONE_VM)
+               p->numa_preferred_nid = current->numa_preferred_nid;
+       else
+               p->numa_preferred_nid = -1;
+
        p->node_stamp = 0ULL;
        p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-       p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
        p->numa_scan_period = sysctl_numa_balancing_scan_delay;
        p->numa_work.next = &p->numa_work;
+       p->numa_faults = NULL;
+       p->numa_faults_buffer = NULL;
+
+       INIT_LIST_HEAD(&p->numa_entry);
+       p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
 }
 
@@ -1654,12 +1767,12 @@ void set_numabalancing_state(bool enabled)
 /*
  * fork()/clone()-time setup:
  */
-void sched_fork(struct task_struct *p)
+void sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
        unsigned long flags;
        int cpu = get_cpu();
 
-       __sched_fork(p);
+       __sched_fork(clone_flags, p);
        /*
         * We mark the process as running here. This guarantees that
         * nobody will actually run it, and a signal or other external
@@ -1717,10 +1830,7 @@ void sched_fork(struct task_struct *p)
 #if defined(CONFIG_SMP)
        p->on_cpu = 0;
 #endif
-#ifdef CONFIG_PREEMPT_COUNT
-       /* Want to start with kernel preemption disabled. */
-       task_thread_info(p)->preempt_count = 1;
-#endif
+       init_task_preempt_count(p);
 #ifdef CONFIG_SMP
        plist_node_init(&p->pushable_tasks, MAX_PRIO);
 #endif
@@ -1747,7 +1857,7 @@ void wake_up_new_task(struct task_struct *p)
         *  - cpus_allowed can change in the fork path
         *  - any previously selected cpu might disappear through hotplug
         */
-       set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
+       set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
 
        /* Initialize new task's runnable average */
@@ -1838,7 +1948,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
                    struct task_struct *next)
 {
        trace_sched_switch(prev, next);
-       sched_info_switch(prev, next);
+       sched_info_switch(rq, prev, next);
        perf_event_task_sched_out(prev, next);
        fire_sched_out_preempt_notifiers(prev, next);
        prepare_lock_switch(rq, next);
@@ -1890,6 +2000,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        if (mm)
                mmdrop(mm);
        if (unlikely(prev_state == TASK_DEAD)) {
+               task_numa_free(prev);
+
                /*
                 * Remove function-return probe instances associated with this
                 * task and put them back on the free list.
@@ -2073,7 +2185,7 @@ void sched_exec(void)
        int dest_cpu;
 
        raw_spin_lock_irqsave(&p->pi_lock, flags);
-       dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
+       dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
        if (dest_cpu == smp_processor_id())
                goto unlock;
 
@@ -2215,7 +2327,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
                                defined(CONFIG_PREEMPT_TRACER))
 
-void __kprobes add_preempt_count(int val)
+void __kprobes preempt_count_add(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
        /*
@@ -2224,7 +2336,7 @@ void __kprobes add_preempt_count(int val)
        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
                return;
 #endif
-       preempt_count() += val;
+       __preempt_count_add(val);
 #ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Spinlock count overflowing soon?
@@ -2235,9 +2347,9 @@ void __kprobes add_preempt_count(int val)
        if (preempt_count() == val)
                trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
-EXPORT_SYMBOL(add_preempt_count);
+EXPORT_SYMBOL(preempt_count_add);
 
-void __kprobes sub_preempt_count(int val)
+void __kprobes preempt_count_sub(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
        /*
@@ -2255,9 +2367,9 @@ void __kprobes sub_preempt_count(int val)
 
        if (preempt_count() == val)
                trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
-       preempt_count() -= val;
+       __preempt_count_sub(val);
 }
-EXPORT_SYMBOL(sub_preempt_count);
+EXPORT_SYMBOL(preempt_count_sub);
 
 #endif
 
@@ -2430,6 +2542,7 @@ need_resched:
        put_prev_task(rq, prev);
        next = pick_next_task(rq);
        clear_tsk_need_resched(prev);
+       clear_preempt_need_resched();
        rq->skip_clock_update = 0;
 
        if (likely(prev != next)) {
@@ -2520,9 +2633,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
                return;
 
        do {
-               add_preempt_count_notrace(PREEMPT_ACTIVE);
+               __preempt_count_add(PREEMPT_ACTIVE);
                __schedule();
-               sub_preempt_count_notrace(PREEMPT_ACTIVE);
+               __preempt_count_sub(PREEMPT_ACTIVE);
 
                /*
                 * Check again in case we missed a preemption opportunity
@@ -2541,20 +2654,19 @@ EXPORT_SYMBOL(preempt_schedule);
  */
 asmlinkage void __sched preempt_schedule_irq(void)
 {
-       struct thread_info *ti = current_thread_info();
        enum ctx_state prev_state;
 
        /* Catch callers which need to be fixed */
-       BUG_ON(ti->preempt_count || !irqs_disabled());
+       BUG_ON(preempt_count() || !irqs_disabled());
 
        prev_state = exception_enter();
 
        do {
-               add_preempt_count(PREEMPT_ACTIVE);
+               __preempt_count_add(PREEMPT_ACTIVE);
                local_irq_enable();
                __schedule();
                local_irq_disable();
-               sub_preempt_count(PREEMPT_ACTIVE);
+               __preempt_count_sub(PREEMPT_ACTIVE);
 
                /*
                 * Check again in case we missed a preemption opportunity
@@ -3794,16 +3906,11 @@ SYSCALL_DEFINE0(sched_yield)
        return 0;
 }
 
-static inline int should_resched(void)
-{
-       return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
-}
-
 static void __cond_resched(void)
 {
-       add_preempt_count(PREEMPT_ACTIVE);
+       __preempt_count_add(PREEMPT_ACTIVE);
        __schedule();
-       sub_preempt_count(PREEMPT_ACTIVE);
+       __preempt_count_sub(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
@@ -4186,7 +4293,7 @@ void init_idle(struct task_struct *idle, int cpu)
 
        raw_spin_lock_irqsave(&rq->lock, flags);
 
-       __sched_fork(idle);
+       __sched_fork(0, idle);
        idle->state = TASK_RUNNING;
        idle->se.exec_start = sched_clock();
 
@@ -4212,7 +4319,7 @@ void init_idle(struct task_struct *idle, int cpu)
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 
        /* Set the preempt count _outside_ the spinlocks! */
-       task_thread_info(idle)->preempt_count = 0;
+       init_idle_preempt_count(idle, cpu);
 
        /*
         * The idle tasks have their own, simple scheduling class:
@@ -4346,6 +4453,53 @@ fail:
        return ret;
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+/* Migrate current task p to target_cpu */
+int migrate_task_to(struct task_struct *p, int target_cpu)
+{
+       struct migration_arg arg = { p, target_cpu };
+       int curr_cpu = task_cpu(p);
+
+       if (curr_cpu == target_cpu)
+               return 0;
+
+       if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+               return -EINVAL;
+
+       /* TODO: This is not properly updating schedstats */
+
+       return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
+}
+
+/*
+ * Requeue a task on a given node and accurately track the number of NUMA
+ * tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+       struct rq *rq;
+       unsigned long flags;
+       bool on_rq, running;
+
+       rq = task_rq_lock(p, &flags);
+       on_rq = p->on_rq;
+       running = task_current(rq, p);
+
+       if (on_rq)
+               dequeue_task(rq, p, 0);
+       if (running)
+               p->sched_class->put_prev_task(rq, p);
+
+       p->numa_preferred_nid = nid;
+
+       if (running)
+               p->sched_class->set_curr_task(rq);
+       if (on_rq)
+               enqueue_task(rq, p, 0);
+       task_rq_unlock(rq, p, &flags);
+}
+#endif
+
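
sched_setnuma() follows the scheduler's standard recipe for changing an attribute the runqueue may depend on: dequeue the task (and put_prev_task() if it is current), update the field, then set_curr_task()/enqueue it again so per-class accounting stays consistent. A toy user-space analogue of that dequeue/modify/requeue discipline, using a sorted list as a stand-in runqueue (purely illustrative):

#include <stdio.h>

struct task { int nid; struct task *next; };

/* Keep the "runqueue" sorted by preferred node id. */
static void enqueue(struct task **q, struct task *p)
{
	while (*q && (*q)->nid <= p->nid)
		q = &(*q)->next;
	p->next = *q;
	*q = p;
}

static void dequeue(struct task **q, struct task *p)
{
	while (*q != p)
		q = &(*q)->next;
	*q = p->next;
}

/* Analogue of sched_setnuma(): take off, change the key, put back. */
static void set_nid(struct task **q, struct task *p, int nid)
{
	dequeue(q, p);
	p->nid = nid;
	enqueue(q, p);
}

int main(void)
{
	struct task a = { 0, NULL }, b = { 1, NULL };
	struct task *q = NULL;

	enqueue(&q, &a);
	enqueue(&q, &b);
	set_nid(&q, &a, 3);		/* a must move behind b */
	for (struct task *t = q; t; t = t->next)
		printf("nid=%d\n", t->nid);
	return 0;
}
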
 /*
  * migration_cpu_stop - this will be executed by a highprio stopper thread
  * and performs thread migration by bumping thread off CPU then
@@ -5119,6 +5273,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -5135,6 +5290,9 @@ static void update_top_cache_domain(int cpu)
        rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
        per_cpu(sd_llc_size, cpu) = size;
        per_cpu(sd_llc_id, cpu) = id;
+
+       sd = lowest_flag_domain(cpu, SD_NUMA);
+       rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 }
 
 /*
@@ -5654,6 +5812,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
                                        | 0*SD_SHARE_PKG_RESOURCES
                                        | 1*SD_SERIALIZE
                                        | 0*SD_PREFER_SIBLING
+                                       | 1*SD_NUMA
                                        | sd_local_flags(level)
                                        ,
                .last_balance           = jiffies,
@@ -6505,6 +6664,7 @@ void __init sched_init(void)
                rq->online = 0;
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
+               rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
                INIT_LIST_HEAD(&rq->cfs_tasks);