Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

[~andy/linux] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 79e611cd83dd4e44be1e6bcbf9a850dd787ea99e..27125e41357620a5efe2cebf286bbc756fdfcdb2 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -324,7 +324,7 @@ struct cfs_rq {
          * 'curr' points to currently running entity on this cfs_rq.
          * It is set to NULL otherwise (i.e when none are currently running).
          */
-       struct sched_entity *curr, *next, *last;
+       struct sched_entity *curr, *next, *last, *skip;
  
         unsigned int nr_spread_over;
  
@@ -1683,6 +1683,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
                 __release(rq2->lock);
  }
  
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues (!CONFIG_SMP variant)
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+       __acquires(rq1->lock)
+       __acquires(rq2->lock)
+{
+       BUG_ON(!irqs_disabled());       /* caller must already have irqs off */
+       BUG_ON(rq1 != rq2);     /* both arguments must be the same runqueue here */
+       raw_spin_lock(&rq1->lock);
+       __acquire(rq2->lock);   /* Fake it out ;) -- rq2 == rq1, locked just above */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues (!CONFIG_SMP variant)
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+       __releases(rq1->lock)
+       __releases(rq2->lock)
+{
+       BUG_ON(rq1 != rq2);     /* both arguments must be the same runqueue here */
+       raw_spin_unlock(&rq1->lock);
+       __release(rq2->lock);   /* rq2 == rq1, already unlocked just above */
+}
+
  #endif
  
  static void calc_load_account_idle(struct rq *this_rq);
@@ -1877,7 +1910,7 @@ void account_system_vtime(struct task_struct *curr)
          */
         if (hardirq_count())
                 __this_cpu_add(cpu_hardirq_time, delta);
-       else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+       else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
                 __this_cpu_add(cpu_softirq_time, delta);
  
         irq_time_write_end();
@@ -1917,8 +1950,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
                 sched_rt_avg_update(rq, irq_delta);
  }
  
+static int irqtime_account_hi_update(void)
+{
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       unsigned long flags;
+       u64 latest_ns;
+       int ret = 0;    /* set to 1 when cpu_hardirq_time is ahead of cpustat->irq */
+
+       local_irq_save(flags);  /* read and compare with local irqs disabled */
+       latest_ns = this_cpu_read(cpu_hardirq_time);
+       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+               ret = 1;
+       local_irq_restore(flags);
+       return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       unsigned long flags;
+       u64 latest_ns;
+       int ret = 0;    /* set to 1 when cpu_softirq_time is ahead of cpustat->softirq */
+
+       local_irq_save(flags);  /* read and compare with local irqs disabled */
+       latest_ns = this_cpu_read(cpu_softirq_time);
+       if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+               ret = 1;
+       local_irq_restore(flags);
+       return ret;
+}
+
  #else /* CONFIG_IRQ_TIME_ACCOUNTING */
  
+#define sched_clock_irqtime    (0)
+
  static void update_rq_clock_task(struct rq *rq, s64 delta)
  {
         rq->clock_task += delta;
@@ -2022,14 +2087,14 @@ inline int task_curr(const struct task_struct *p)
  
  static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                                        const struct sched_class *prev_class,
-                                      int oldprio, int running)
+                                      int oldprio)
  {
         if (prev_class != p->sched_class) {
                 if (prev_class->switched_from)
-                       prev_class->switched_from(rq, p, running);
-               p->sched_class->switched_to(rq, p, running);
-       } else
-               p->sched_class->prio_changed(rq, p, oldprio, running);
+                       prev_class->switched_from(rq, p);
+               p->sched_class->switched_to(rq, p);
+       } else if (oldprio != p->prio)  /* same class: notify only if prio really changed */
+               p->sched_class->prio_changed(rq, p, oldprio);
  }
  
  static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2542,6 +2607,7 @@ static void __sched_fork(struct task_struct *p)
         p->se.sum_exec_runtime          = 0;
         p->se.prev_sum_exec_runtime     = 0;
         p->se.nr_migrations             = 0;
+       p->se.vruntime                  = 0;
  
  #ifdef CONFIG_SCHEDSTATS
         memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -3546,6 +3612,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
         }
  }
  
+/*
+ * Account system cpu time to a process and to the desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to the cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+                       cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+       cputime64_t tmp = cputime_to_cputime64(cputime);
+
+       /* Add system time to process. */
+       p->stime = cputime_add(p->stime, cputime);
+       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+       account_group_system_time(p, cputime);
+
+       /* Add system time to the cpustat field chosen by the caller. */
+       *target_cputime64 = cputime64_add(*target_cputime64, tmp);
+       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+       /* Account for system time used */
+       acct_update_integrals(p);
+}
+
  /*
   * Account system cpu time to a process.
   * @p: the process that the cpu time gets accounted to
@@ -3557,36 +3649,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
                          cputime_t cputime, cputime_t cputime_scaled)
  {
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t tmp;
+       cputime64_t *target_cputime64;
  
         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
                 account_guest_time(p, cputime, cputime_scaled);
                 return;
         }
  
-       /* Add system time to process. */
-       p->stime = cputime_add(p->stime, cputime);
-       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-       account_group_system_time(p, cputime);
-
-       /* Add system time to cpustat. */
-       tmp = cputime_to_cputime64(cputime);
         if (hardirq_count() - hardirq_offset)
-               cpustat->irq = cputime64_add(cpustat->irq, tmp);
+               target_cputime64 = &cpustat->irq;
         else if (in_serving_softirq())
-               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+               target_cputime64 = &cpustat->softirq;
         else
-               cpustat->system = cputime64_add(cpustat->system, tmp);
-
-       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+               target_cputime64 = &cpustat->system;
  
-       /* Account for system time used */
-       acct_update_integrals(p);
+       __account_system_time(p, cputime, cputime_scaled, target_cputime64);
  }
  
  /*
   * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
   */
  void account_steal_time(cputime_t cputime)
  {
@@ -3614,6 +3696,73 @@ void account_idle_time(cputime_t cputime)
  
  #ifndef CONFIG_VIRT_CPU_ACCOUNTING
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update, then ksoftirqd (charged to softirq)
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq or
+ * softirq time, as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+                                               struct rq *rq)
+{
+       cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+       cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+       if (irqtime_account_hi_update()) {
+               cpustat->irq = cputime64_add(cpustat->irq, tmp);
+       } else if (irqtime_account_si_update()) {
+               cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+       } else if (this_cpu_ksoftirqd() == p) {
+               /*
+                * ksoftirqd time does not get accounted in cpu_softirq_time.
+                * So, we have to handle it separately here.
+                * Also, p->stime needs to be updated for ksoftirqd.
+                */
+               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                                       &cpustat->softirq);
+       } else if (user_tick) {
+               account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       } else if (p == rq->idle) {
+               account_idle_time(cputime_one_jiffy);
+       } else if (p->flags & PF_VCPU) { /* System time or guest time */
+               account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+       } else {
+               __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+                                       &cpustat->system);
+       }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+       int i;
+       struct rq *rq = this_rq();
+
+       for (i = 0; i < ticks; i++)     /* one non-user tick per pass, charged via current */
+               irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}   /* stub: irq time accounting not configured */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+                                               struct rq *rq) {}       /* stub, as above */
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
  /*
   * Account a single tick of cpu time.
   * @p: the process that the cpu time gets accounted to
@@ -3624,6 +3773,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
         cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
         struct rq *rq = this_rq();
  
+       if (sched_clock_irqtime) {
+               irqtime_account_process_tick(p, user_tick, rq);
+               return;
+       }
+
         if (user_tick)
                 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
         else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3649,6 +3803,12 @@ void account_steal_ticks(unsigned long ticks)
   */
  void account_idle_ticks(unsigned long ticks)
  {
+
+       if (sched_clock_irqtime) {      /* irq time accounting: demultiplex tick by tick */
+               irqtime_account_idle_ticks(ticks);
+               return;
+       }
+
         account_idle_time(jiffies_to_cputime(ticks));
  }
  
@@ -4189,6 +4349,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
  {
         __wake_up_common(q, mode, 1, 0, key);
  }
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
  
  /**
   * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4546,11 +4707,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
  
         if (running)
                 p->sched_class->set_curr_task(rq);
-       if (on_rq) {
+       if (on_rq)
                 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
  
-               check_class_changed(rq, p, prev_class, oldprio, running);
-       }
+       check_class_changed(rq, p, prev_class, oldprio);
         task_rq_unlock(rq, &flags);
  }
  
@@ -4798,12 +4958,15 @@ recheck:
                             param->sched_priority > rlim_rtprio)
                                 return -EPERM;
                 }
+
                 /*
-                * Like positive nice levels, dont allow tasks to
-                * move out of SCHED_IDLE either:
+                * Treat SCHED_IDLE as nice 20. Only allow a switch to
+                * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
                  */
-               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
-                       return -EPERM;
+               if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+                       if (!can_nice(p, TASK_NICE(p)))
+                               return -EPERM;
+               }
  
                 /* can't change other user's priorities */
                 if (!check_same_owner(p))
@@ -4878,11 +5041,10 @@ recheck:
  
         if (running)
                 p->sched_class->set_curr_task(rq);
-       if (on_rq) {
+       if (on_rq)
                 activate_task(rq, p, 0);
  
-               check_class_changed(rq, p, prev_class, oldprio, running);
-       }
+       check_class_changed(rq, p, prev_class, oldprio);
         __task_rq_unlock(rq);
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  
@@ -5299,6 +5461,65 @@ void __sched yield(void)
  }
  EXPORT_SYMBOL(yield);
  
+/**
+ * yield_to - yield the current processor to another thread
+ * @p: target thread in your thread group; accelerated toward its processor
+ * @preempt: forwarded to ->yield_to_task(); also gates resched of p's runqueue
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+       struct task_struct *curr = current;
+       struct rq *rq, *p_rq;
+       unsigned long flags;
+       bool yielded = 0;
+
+       local_irq_save(flags);
+       rq = this_rq();
+
+again:
+       p_rq = task_rq(p);
+       double_rq_lock(rq, p_rq);
+       while (task_rq(p) != p_rq) {    /* p changed runqueue before we locked: retry */
+               double_rq_unlock(rq, p_rq);
+               goto again;
+       }
+
+       if (!curr->sched_class->yield_to_task)  /* our class has no yield_to_task hook */
+               goto out;
+
+       if (curr->sched_class != p->sched_class)        /* both must share a sched class */
+               goto out;
+
+       if (task_running(p_rq, p) || p->state)  /* p is running, or p->state is non-zero */
+               goto out;
+
+       yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+       if (yielded) {
+               schedstat_inc(rq, yld_count);
+               /*
+                * Make p's CPU reschedule; pick_next_entity takes care of
+                * fairness.
+                */
+               if (preempt && rq != p_rq)
+                       resched_task(p_rq->curr);
+       }
+
+out:
+       double_rq_unlock(rq, p_rq);
+       local_irq_restore(flags);
+
+       if (yielded)    /* schedule only after the locks are dropped and irqs restored */
+               schedule();
+
+       return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
  /*
   * This task is about to go to sleep on IO. Increment rq->nr_iowait so
   * that process accounting knows that this is a task in IO wait state.
@@ -7772,6 +7993,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
         INIT_LIST_HEAD(&cfs_rq->tasks);
  #ifdef CONFIG_FAIR_GROUP_SCHED
         cfs_rq->rq = rq;
+       /* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+       cfs_rq->load_stamp = 1;
+#endif
  #endif
         cfs_rq->min_vruntime = (u64)(-(1LL << 20));
  }
@@ -8085,6 +8310,8 @@ EXPORT_SYMBOL(__might_sleep);
  #ifdef CONFIG_MAGIC_SYSRQ
  static void normalize_task(struct rq *rq, struct task_struct *p)
  {
+       const struct sched_class *prev_class = p->sched_class;
+       int old_prio = p->prio;
         int on_rq;
  
         on_rq = p->se.on_rq;
@@ -8095,6 +8322,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
                 activate_task(rq, p, 0);
                 resched_task(rq->curr);
         }
+
+       check_class_changed(rq, p, prev_class, old_prio);
  }
  
  void normalize_rt_tasks(void)
@@ -8486,7 +8715,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
                 /* Propagate contribution to hierarchy */
                 raw_spin_lock_irqsave(&rq->lock, flags);
                 for_each_sched_entity(se)
-                       update_cfs_shares(group_cfs_rq(se), 0);
+                       update_cfs_shares(group_cfs_rq(se));
                 raw_spin_unlock_irqrestore(&rq->lock, flags);
         }