Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

[~andy/linux] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index a88f4a485c5e5f92190dd5bf784600d79d5f8f18..656cd70eb577a4bf6755ed1e7d653864d0d04f96 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,8 +296,6 @@ __read_mostly int scheduler_running;
   */
  int sysctl_sched_rt_runtime = 950000;
  
-
-
  /*
   * __task_rq_lock - lock the rq @p resides on.
   */
@@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p)
  {
         int prio;
  
-       if (task_has_rt_policy(p))
+       if (task_has_dl_policy(p))
+               prio = MAX_DL_PRIO-1;
+       else if (task_has_rt_policy(p))
                 prio = MAX_RT_PRIO-1 - p->rt_priority;
         else
                 prio = __normal_prio(p);
@@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                 if (prev_class->switched_from)
                         prev_class->switched_from(rq, p);
                 p->sched_class->switched_to(rq, p);
-       } else if (oldprio != p->prio)
+       } else if (oldprio != p->prio || dl_task(p))
                 p->sched_class->prio_changed(rq, p, oldprio);
  }
  
@@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
         if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
                 goto out;
  
+       trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
         ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
  
  out:
@@ -1499,8 +1500,7 @@ void scheduler_ipi(void)
          * TIF_NEED_RESCHED remotely (for the first time) will also send
          * this IPI.
          */
-       if (tif_need_resched())
-               set_preempt_need_resched();
+       preempt_fold_need_resched();
  
         if (llist_empty(&this_rq()->wake_list)
                         && !tick_nohz_full_cpu(smp_processor_id())
@@ -1717,6 +1717,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         memset(&p->se.statistics, 0, sizeof(p->se.statistics));
  #endif
  
+       RB_CLEAR_NODE(&p->dl.rb_node);
+       hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       p->dl.dl_runtime = p->dl.runtime = 0;
+       p->dl.dl_deadline = p->dl.deadline = 0;
+       p->dl.dl_period = 0;
+       p->dl.flags = 0;
+
         INIT_LIST_HEAD(&p->rt.run_list);
  
  #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1763,12 +1770,34 @@ void set_numabalancing_state(bool enabled)
         numabalancing_enabled = enabled;
  }
  #endif /* CONFIG_SCHED_DEBUG */
-#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_PROC_SYSCTL
+/*
+ * sysctl handler for the NUMA balancing on/off state.
+ *
+ * The integer is parsed into / reported from a local copy of
+ * numabalancing_enabled via proc_dointvec_minmax(); after a successful
+ * write the new value is applied through set_numabalancing_state().
+ *
+ * Returns -EPERM for a write without CAP_SYS_ADMIN, otherwise the result
+ * of proc_dointvec_minmax().
+ */
+int sysctl_numa_balancing(struct ctl_table *table, int write,
+                        void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+       struct ctl_table t;
+       int err;
+       int state = numabalancing_enabled;
+
+       if (write && !capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /* work on a copy of the table entry whose data points at 'state' */
+       t = *table;
+       t.data = &state;
+       err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+       if (err < 0)
+               return err;
+       if (write)
+               set_numabalancing_state(state);
+       return err;
+}
+#endif
+#endif
  
  /*
   * fork()/clone()-time setup:
   */
-void sched_fork(unsigned long clone_flags, struct task_struct *p)
+int sched_fork(unsigned long clone_flags, struct task_struct *p)
  {
         unsigned long flags;
         int cpu = get_cpu();
@@ -1790,7 +1819,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
          * Revert to default priority/policy on fork if requested.
          */
         if (unlikely(p->sched_reset_on_fork)) {
-               if (task_has_rt_policy(p)) {
+               if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
                         p->policy = SCHED_NORMAL;
                         p->static_prio = NICE_TO_PRIO(0);
                         p->rt_priority = 0;
@@ -1807,8 +1836,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
                 p->sched_reset_on_fork = 0;
         }
  
-       if (!rt_prio(p->prio))
+       if (dl_prio(p->prio)) {
+               put_cpu();
+               return -EAGAIN;
+       } else if (rt_prio(p->prio)) {
+               p->sched_class = &rt_sched_class;
+       } else {
                 p->sched_class = &fair_sched_class;
+       }
  
         if (p->sched_class->task_fork)
                 p->sched_class->task_fork(p);
@@ -1834,11 +1869,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
         init_task_preempt_count(p);
  #ifdef CONFIG_SMP
         plist_node_init(&p->pushable_tasks, MAX_PRIO);
+       RB_CLEAR_NODE(&p->pushable_dl_tasks);
  #endif
  
         put_cpu();
+       return 0;
+}
+
+/*
+ * to_ratio - express runtime/period as a fixed-point fraction.
+ *
+ * The result is (runtime << 20) / period, i.e. the ratio scaled by 2^20.
+ * RUNTIME_INF maps to exactly 1 << 20 and a zero period yields 0.
+ */
+unsigned long to_ratio(u64 period, u64 runtime)
+{
+       if (runtime == RUNTIME_INF)
+               return 1ULL << 20;
+
+       /*
+        * Doing this here saves a lot of checks in all
+        * the calling paths, and returning zero seems
+        * safe for them anyway.
+        */
+       if (period == 0)
+               return 0;
+
+       return div64_u64(runtime << 20, period);
+}
+
+/*
+ * dl_bw_of()   - return the struct dl_bw that accounts for CPU @i.
+ * dl_bw_cpus() - return how many CPUs share that struct dl_bw.
+ *
+ * With CONFIG_SMP the dl_bw lives in the CPU's root_domain and the count
+ * is the number of CPUs present in both rd->span and cpu_active_mask.
+ * Without CONFIG_SMP the dl_bw lives in the runqueue's dl member and the
+ * count is always 1.
+ */
+#ifdef CONFIG_SMP
+inline struct dl_bw *dl_bw_of(int i)
+{
+       return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+       struct root_domain *rd = cpu_rq(i)->rd;
+       int cpus = 0;
+
+       /* @i is reused as the iteration cursor once rd has been looked up */
+       for_each_cpu_and(i, rd->span, cpu_active_mask)
+               cpus++;
+
+       return cpus;
+}
+#else
+inline struct dl_bw *dl_bw_of(int i)
+{
+       return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+       return 1;
+}
+#endif
+
+/*
+ * Subtract @tsk_bw from the bandwidth currently allocated in @dl_b.
+ * No locking is done here; NOTE(review): callers visible in this file
+ * take dl_b->lock around the call - confirm for any new caller.
+ */
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+       dl_b->total_bw -= tsk_bw;
+}
+
+/*
+ * Add @tsk_bw to the bandwidth currently allocated in @dl_b.
+ * No locking is done here; NOTE(review): callers visible in this file
+ * take dl_b->lock around the call - confirm for any new caller.
+ */
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+       dl_b->total_bw += tsk_bw;
+}
+
+/*
+ * Return true when replacing @old_bw with @new_bw in @dl_b would push the
+ * allocated total above dl_b->bw * @cpus. A dl_b->bw of -1 disables the
+ * check, in which case this always returns false.
+ */
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+       return dl_b->bw != -1 &&
+              dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
  }
  
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ *
+ * Returns 0 when the request is admitted (or the task's bandwidth does not
+ * change) and -1 when it is rejected.
+ */
+static int dl_overflow(struct task_struct *p, int policy,
+                      const struct sched_attr *attr)
+{
+
+       struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+       /*
+        * A zero period means "use the deadline", exactly as __setparam_dl()
+        * does when it stores dl_bw. Using the raw period here would make
+        * to_ratio() return 0, so the task would be admitted with no
+        * bandwidth accounted, and its non-zero dl_bw would later be
+        * subtracted from total_bw without ever having been added.
+        */
+       u64 period = attr->sched_period ?: attr->sched_deadline;
+       u64 runtime = attr->sched_runtime;
+       u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+       int cpus, err = -1;
+
+       if (new_bw == p->dl.dl_bw)
+               return 0;
+
+       /*
+        * Whether a task enters, leaves, or stays -deadline but changes
+        * its parameters, we may need to update the total allocated
+        * bandwidth of the container accordingly.
+        */
+       raw_spin_lock(&dl_b->lock);
+       cpus = dl_bw_cpus(task_cpu(p));
+       if (dl_policy(policy) && !task_has_dl_policy(p) &&
+           !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+               /* entering -deadline: account the new bandwidth */
+               __dl_add(dl_b, new_bw);
+               err = 0;
+       } else if (dl_policy(policy) && task_has_dl_policy(p) &&
+                  !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+               /* staying -deadline: swap the old bandwidth for the new */
+               __dl_clear(dl_b, p->dl.dl_bw);
+               __dl_add(dl_b, new_bw);
+               err = 0;
+       } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+               /* leaving -deadline: release what the task had */
+               __dl_clear(dl_b, p->dl.dl_bw);
+               err = 0;
+       }
+       raw_spin_unlock(&dl_b->lock);
+
+       return err;
+}
+
+extern void init_dl_bw(struct dl_bw *dl_b);
+
  /*
   * wake_up_new_task - wake up a newly created task for the first time.
   *
@@ -2003,6 +2151,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         if (unlikely(prev_state == TASK_DEAD)) {
                 task_numa_free(prev);
  
+               if (prev->sched_class->task_dead)
+                       prev->sched_class->task_dead(prev);
+
                 /*
                  * Remove function-return probe instances associated with this
                  * task and put them back on the free list.
@@ -2296,7 +2447,7 @@ void scheduler_tick(void)
  
  #ifdef CONFIG_SMP
         rq->idle_balance = idle_cpu(cpu);
-       trigger_load_balance(rq, cpu);
+       trigger_load_balance(rq);
  #endif
         rq_last_tick_reset(rq);
  }
@@ -2414,10 +2565,10 @@ static inline void schedule_debug(struct task_struct *prev)
  {
         /*
          * Test if we are atomic. Since do_exit() needs to call into
-        * schedule() atomically, we ignore that path for now.
-        * Otherwise, whine if we are scheduling when we should not be.
+        * schedule() atomically, we ignore that path. Otherwise whine
+        * if we are scheduling when we should not.
          */
-       if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+       if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
                 __schedule_bug(prev);
         rcu_sleep_check();
  
@@ -2761,11 +2912,11 @@ EXPORT_SYMBOL(sleep_on_timeout);
   */
  void rt_mutex_setprio(struct task_struct *p, int prio)
  {
-       int oldprio, on_rq, running;
+       int oldprio, on_rq, running, enqueue_flag = 0;
         struct rq *rq;
         const struct sched_class *prev_class;
  
-       BUG_ON(prio < 0 || prio > MAX_PRIO);
+       BUG_ON(prio > MAX_PRIO);
  
         rq = __task_rq_lock(p);
  
@@ -2788,6 +2939,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         }
  
         trace_sched_pi_setprio(p, prio);
+       p->pi_top_task = rt_mutex_get_top_task(p);
         oldprio = p->prio;
         prev_class = p->sched_class;
         on_rq = p->on_rq;
@@ -2797,23 +2949,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         if (running)
                 p->sched_class->put_prev_task(rq, p);
  
-       if (rt_prio(prio))
+       /*
+        * Boosting conditions are:
+        * 1. -rt task is running and holds mutex A
+        *      --> -dl task blocks on mutex A
+        *
+        * 2. -dl task is running and holds mutex A
+        *      --> -dl task blocks on mutex A and could preempt the
+        *          running task
+        */
+       if (dl_prio(prio)) {
+               if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
+                       dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+                       p->dl.dl_boosted = 1;
+                       p->dl.dl_throttled = 0;
+                       enqueue_flag = ENQUEUE_REPLENISH;
+               } else
+                       p->dl.dl_boosted = 0;
+               p->sched_class = &dl_sched_class;
+       } else if (rt_prio(prio)) {
+               if (dl_prio(oldprio))
+                       p->dl.dl_boosted = 0;
+               if (oldprio < prio)
+                       enqueue_flag = ENQUEUE_HEAD;
                 p->sched_class = &rt_sched_class;
-       else
+       } else {
+               if (dl_prio(oldprio))
+                       p->dl.dl_boosted = 0;
                 p->sched_class = &fair_sched_class;
+       }
  
         p->prio = prio;
  
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (on_rq)
-               enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
+               enqueue_task(rq, p, enqueue_flag);
  
         check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
         __task_rq_unlock(rq);
  }
  #endif
+
  void set_user_nice(struct task_struct *p, long nice)
  {
         int old_prio, delta, on_rq;
@@ -2831,9 +3009,9 @@ void set_user_nice(struct task_struct *p, long nice)
          * The RT priorities are set via sched_setscheduler(), but we still
          * allow the 'normal' nice value to be set - but as expected
          * it wont have any effect on scheduling until the task is
-        * SCHED_FIFO/SCHED_RR:
+        * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
          */
-       if (task_has_rt_policy(p)) {
+       if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
                 p->static_prio = NICE_TO_PRIO(nice);
                 goto out_unlock;
         }
@@ -2988,22 +3166,95 @@ static struct task_struct *find_process_by_pid(pid_t pid)
         return pid ? find_task_by_vpid(pid) : current;
  }
  
-/* Actually do priority change: must hold rq lock. */
+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ *
+ * @p:    task whose p->dl entity is (re)initialized.
+ * @attr: source of the runtime, deadline, period and flags values.
+ */
  static void
-__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
+{
+       struct sched_dl_entity *dl_se = &p->dl;
+
+       init_dl_task_timer(dl_se);
+       dl_se->dl_runtime = attr->sched_runtime;
+       dl_se->dl_deadline = attr->sched_deadline;
+       /* a zero sched_period falls back to the deadline */
+       dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+       dl_se->flags = attr->sched_flags;
+       /* bandwidth is computed from the effective period chosen above */
+       dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+       dl_se->dl_throttled = 0;
+       dl_se->dl_new = 1;
+}
+
+/*
+ * Actually do priority change: must hold pi & rq lock.
+ *
+ * Applies @attr to @p: policy, the per-policy parameters (deadline
+ * parameters or static_prio), rt_priority, normal_prio, prio, the
+ * sched_class matching the resulting prio, and finally the load weight.
+ * An attr->sched_policy of -1 keeps p's current policy.
+ */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+                          const struct sched_attr *attr)
  {
+       int policy = attr->sched_policy;
+
+       if (policy == -1) /* setparam */
+               policy = p->policy;
+
         p->policy = policy;
-       p->rt_priority = prio;
+
+       if (dl_policy(policy))
+               __setparam_dl(p, attr);
+       else if (fair_policy(policy))
+               p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
+       /*
+        * __sched_setscheduler() ensures attr->sched_priority == 0 when
+        * !rt_policy. Always setting this ensures that things like
+        * getparam()/getattr() don't report silly values for !rt tasks.
+        */
+       p->rt_priority = attr->sched_priority;
+
         p->normal_prio = normal_prio(p);
-       /* we are holding p->pi_lock already */
         p->prio = rt_mutex_getprio(p);
-       if (rt_prio(p->prio))
+
+       /* the class follows the effective prio, not the requested policy */
+       if (dl_prio(p->prio))
+               p->sched_class = &dl_sched_class;
+       else if (rt_prio(p->prio))
                 p->sched_class = &rt_sched_class;
         else
                 p->sched_class = &fair_sched_class;
+
         set_load_weight(p);
  }
  
+/*
+ * Fill @attr with the parameters of @p read back from its
+ * sched_dl_entity (runtime, deadline, period, flags) plus p->rt_priority.
+ * Other fields of @attr are left untouched.
+ */
+static void
+__getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+       struct sched_dl_entity *dl_se = &p->dl;
+
+       attr->sched_priority = p->rt_priority;
+       attr->sched_runtime = dl_se->dl_runtime;
+       attr->sched_deadline = dl_se->dl_deadline;
+       attr->sched_period = dl_se->dl_period;
+       attr->sched_flags = dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We require the deadline to be non-zero and greater than or equal to
+ * the runtime, and the period to be either zero or greater than or
+ * equal to the deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution (1us); we
+ * check sched_runtime only since it is always the smaller one.
+ *
+ * Returns true when @attr is non-NULL and all of the above hold.
+ */
+static bool
+__checkparam_dl(const struct sched_attr *attr)
+{
+       return attr && attr->sched_deadline != 0 &&
+               (attr->sched_period == 0 ||
+               (s64)(attr->sched_period   - attr->sched_deadline) >= 0) &&
+               (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0  &&
+               attr->sched_runtime >= (2 << (DL_SCALE - 1));
+}
+
  /*
   * check the target process has a UID that matches the current process's
   */
@@ -3020,10 +3271,12 @@ static bool check_same_owner(struct task_struct *p)
         return match;
  }
  
-static int __sched_setscheduler(struct task_struct *p, int policy,
-                               const struct sched_param *param, bool user)
+static int __sched_setscheduler(struct task_struct *p,
+                               const struct sched_attr *attr,
+                               bool user)
  {
         int retval, oldprio, oldpolicy = -1, on_rq, running;
+       int policy = attr->sched_policy;
         unsigned long flags;
         const struct sched_class *prev_class;
         struct rq *rq;
@@ -3037,31 +3290,40 @@ recheck:
                 reset_on_fork = p->sched_reset_on_fork;
                 policy = oldpolicy = p->policy;
         } else {
-               reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
-               policy &= ~SCHED_RESET_ON_FORK;
+               reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
  
-               if (policy != SCHED_FIFO && policy != SCHED_RR &&
+               if (policy != SCHED_DEADLINE &&
+                               policy != SCHED_FIFO && policy != SCHED_RR &&
                                 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
                                 policy != SCHED_IDLE)
                         return -EINVAL;
         }
  
+       if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+               return -EINVAL;
+
         /*
          * Valid priorities for SCHED_FIFO and SCHED_RR are
          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
          * SCHED_BATCH and SCHED_IDLE is 0.
          */
-       if (param->sched_priority < 0 ||
-           (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
-           (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+       if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+           (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
                 return -EINVAL;
-       if (rt_policy(policy) != (param->sched_priority != 0))
+       if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+           (rt_policy(policy) != (attr->sched_priority != 0)))
                 return -EINVAL;
  
         /*
          * Allow unprivileged RT tasks to decrease priority:
          */
         if (user && !capable(CAP_SYS_NICE)) {
+               if (fair_policy(policy)) {
+                       if (attr->sched_nice < TASK_NICE(p) &&
+                           !can_nice(p, attr->sched_nice))
+                               return -EPERM;
+               }
+
                 if (rt_policy(policy)) {
                         unsigned long rlim_rtprio =
                                         task_rlimit(p, RLIMIT_RTPRIO);
@@ -3071,8 +3333,8 @@ recheck:
                                 return -EPERM;
  
                         /* can't increase priority */
-                       if (param->sched_priority > p->rt_priority &&
-                           param->sched_priority > rlim_rtprio)
+                       if (attr->sched_priority > p->rt_priority &&
+                           attr->sched_priority > rlim_rtprio)
                                 return -EPERM;
                 }
  
@@ -3120,14 +3382,21 @@ recheck:
         /*
          * If not changing anything there's no need to proceed further:
          */
-       if (unlikely(policy == p->policy && (!rt_policy(policy) ||
-                       param->sched_priority == p->rt_priority))) {
+       if (unlikely(policy == p->policy)) {
+               if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+                       goto change;
+               if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+                       goto change;
+               if (dl_policy(policy))
+                       goto change;
+
                 task_rq_unlock(rq, p, &flags);
                 return 0;
         }
+change:
  
-#ifdef CONFIG_RT_GROUP_SCHED
         if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
                 /*
                  * Do not allow realtime tasks into groups that have no runtime
                  * assigned.
@@ -3138,8 +3407,24 @@ recheck:
                         task_rq_unlock(rq, p, &flags);
                         return -EPERM;
                 }
-       }
  #endif
+#ifdef CONFIG_SMP
+               if (dl_bandwidth_enabled() && dl_policy(policy)) {
+                       cpumask_t *span = rq->rd->span;
+
+                       /*
+                        * Don't allow tasks with an affinity mask smaller than
+                        * the entire root_domain to become SCHED_DEADLINE. We
+                        * will also fail if there's no bandwidth available.
+                        */
+                       if (!cpumask_subset(span, &p->cpus_allowed) ||
+                           rq->rd->dl_bw.bw == 0) {
+                               task_rq_unlock(rq, p, &flags);
+                               return -EPERM;
+                       }
+               }
+#endif
+       }
  
         /* recheck policy now with rq lock held */
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3147,6 +3432,17 @@ recheck:
                 task_rq_unlock(rq, p, &flags);
                 goto recheck;
         }
+
+       /*
+        * If setscheduling to SCHED_DEADLINE (or changing the parameters
+        * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+        * is available.
+        */
+       if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+               task_rq_unlock(rq, p, &flags);
+               return -EBUSY;
+       }
+
         on_rq = p->on_rq;
         running = task_current(rq, p);
         if (on_rq)
@@ -3158,7 +3454,7 @@ recheck:
  
         oldprio = p->prio;
         prev_class = p->sched_class;
-       __setscheduler(rq, p, policy, param->sched_priority);
+       __setscheduler(rq, p, attr);
  
         if (running)
                 p->sched_class->set_curr_task(rq);
@@ -3173,6 +3469,26 @@ recheck:
         return 0;
  }
  
+/*
+ * _sched_setscheduler - adapter for the legacy (policy, sched_param) API.
+ * @p:      the task in question.
+ * @policy: requested policy, optionally OR-ed with SCHED_RESET_ON_FORK,
+ *          or -1 to keep the task's current policy.
+ * @param:  legacy parameters; only sched_priority is used.
+ * @check:  forwarded unchanged as the last argument of
+ *          __sched_setscheduler().
+ *
+ * Builds a struct sched_attr (nice is taken from p->static_prio) and
+ * returns the result of __sched_setscheduler().
+ */
+static int _sched_setscheduler(struct task_struct *p, int policy,
+                              const struct sched_param *param, bool check)
+{
+       struct sched_attr attr = {
+               .sched_policy   = policy,
+               .sched_priority = param->sched_priority,
+               .sched_nice     = PRIO_TO_NICE(p->static_prio),
+       };
+
+       /*
+        * Fixup the legacy SCHED_RESET_ON_FORK hack.
+        *
+        * policy == -1 is the setparam sentinel that __setscheduler()
+        * recognises. It has every bit set, so it must be excluded here:
+        * otherwise the flag would be spuriously requested and the
+        * sentinel rewritten into a value that is no longer -1.
+        */
+       if (policy != -1 && (policy & SCHED_RESET_ON_FORK)) {
+               attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+               policy &= ~SCHED_RESET_ON_FORK;
+               attr.sched_policy = policy;
+       }
+
+       return __sched_setscheduler(p, &attr, check);
+}
  /**
   * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
   * @p: the task in question.
@@ -3186,10 +3502,16 @@ recheck:
  int sched_setscheduler(struct task_struct *p, int policy,
                        const struct sched_param *param)
  {
-       return __sched_setscheduler(p, policy, param, true);
+       return _sched_setscheduler(p, policy, param, true);
  }
  EXPORT_SYMBOL_GPL(sched_setscheduler);
  
+/*
+ * sched_setattr - change the scheduling attributes of @p according to
+ * @attr. Thin wrapper that calls __sched_setscheduler() with its last
+ * argument set to true and returns its result.
+ */
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+       return __sched_setscheduler(p, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
  /**
   * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
   * @p: the task in question.
@@ -3206,7 +3528,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
  int sched_setscheduler_nocheck(struct task_struct *p, int policy,
                                const struct sched_param *param)
  {
-       return __sched_setscheduler(p, policy, param, false);
+       return _sched_setscheduler(p, policy, param, false);
  }
  
  static int
@@ -3231,6 +3553,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
         return retval;
  }
  
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ *
+ * Copy a struct sched_attr from user space into @attr, honouring the
+ * size user space stored in uattr->size:
+ *  - 0 is treated as SCHED_ATTR_SIZE_VER0;
+ *  - sizes below SCHED_ATTR_SIZE_VER0 or above PAGE_SIZE are rejected;
+ *  - a size larger than sizeof(*attr) is accepted only if every extra
+ *    byte is zero, and only sizeof(*attr) bytes are copied.
+ *
+ * Returns 0 on success, -EFAULT (or the get_user() error) when user memory
+ * cannot be accessed, and -E2BIG on a size problem, in which case
+ * sizeof(*attr) is written back to uattr->size.
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+                          struct sched_attr *attr)
+{
+       u32 size;
+       int ret;
+
+       if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+               return -EFAULT;
+
+       /*
+        * zero the full structure, so that a short copy will be nice.
+        */
+       memset(attr, 0, sizeof(*attr));
+
+       ret = get_user(size, &uattr->size);
+       if (ret)
+               return ret;
+
+       if (size > PAGE_SIZE)   /* silly large */
+               goto err_size;
+
+       if (!size)              /* abi compat */
+               size = SCHED_ATTR_SIZE_VER0;
+
+       if (size < SCHED_ATTR_SIZE_VER0)
+               goto err_size;
+
+       /*
+        * If we're handed a bigger struct than we know of,
+        * ensure all the unknown bits are 0 - i.e. new
+        * user-space does not rely on any kernel feature
+        * extensions we don't know about yet.
+        */
+       if (size > sizeof(*attr)) {
+               unsigned char __user *addr;
+               unsigned char __user *end;
+               unsigned char val;
+
+               addr = (void __user *)uattr + sizeof(*attr);
+               end  = (void __user *)uattr + size;
+
+               for (; addr < end; addr++) {
+                       ret = get_user(val, addr);
+                       if (ret)
+                               return ret;
+                       if (val)
+                               goto err_size;
+               }
+               /* only the part we understand is copied below */
+               size = sizeof(*attr);
+       }
+
+       ret = copy_from_user(attr, uattr, size);
+       if (ret)
+               return -EFAULT;
+
+       /*
+        * XXX: do we want to be lenient like existing syscalls; or do we want
+        * to be strict and return an error on out-of-bounds values?
+        */
+       attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+out:
+       return ret;
+
+err_size:
+       /* tell user space which size we expect; the put_user() result is ignored */
+       put_user(sizeof(*attr), &uattr->size);
+       ret = -E2BIG;
+       goto out;
+}
+
  /**
   * sys_sched_setscheduler - set/change the scheduler policy and RT priority
   * @pid: the pid in question.
@@ -3261,6 +3656,33 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
         return do_sched_setscheduler(pid, -1, param);
  }
  
+/**
+ * sys_sched_setattr - set scheduling policy and attributes from a sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ *
+ * Return: -EINVAL for a NULL @uattr or a negative @pid; the error reported
+ * by sched_copy_attr() when @uattr cannot be copied in; -ESRCH when no task
+ * matches @pid; otherwise the result of sched_setattr().
+ */
+SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
+{
+       struct sched_attr attr;
+       struct task_struct *p;
+       int retval;
+
+       if (!uattr || pid < 0)
+               return -EINVAL;
+
+       /*
+        * Propagate the real error: sched_copy_attr() distinguishes an
+        * inaccessible pointer (-EFAULT) from a size mismatch (-E2BIG),
+        * and user space needs the latter to retry with the written-back
+        * size.
+        */
+       retval = sched_copy_attr(uattr, &attr);
+       if (retval)
+               return retval;
+
+       rcu_read_lock();
+       retval = -ESRCH;
+       p = find_process_by_pid(pid);
+       if (p != NULL)
+               retval = sched_setattr(p, &attr);
+       rcu_read_unlock();
+
+       return retval;
+}
+
  /**
   * sys_sched_getscheduler - get the policy (scheduling class) of a thread
   * @pid: the pid in question.
@@ -3316,6 +3738,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
         if (retval)
                 goto out_unlock;
  
+       if (task_has_dl_policy(p)) {
+               retval = -EINVAL;
+               goto out_unlock;
+       }
         lp.sched_priority = p->rt_priority;
         rcu_read_unlock();
  
@@ -3331,6 +3757,96 @@ out_unlock:
         return retval;
  }
  
+/*
+ * sched_read_attr - copy a kernel struct sched_attr out to user space.
+ * @uattr: destination buffer in user space.
+ * @attr:  kernel copy to export; attr->size is updated when truncating.
+ * @usize: size of the user buffer as announced by user space.
+ *
+ * At most sizeof(*attr) bytes are ever copied. When the user buffer is
+ * smaller than the kernel structure, the fields that do not fit must all
+ * be zero, otherwise -E2BIG is returned.
+ *
+ * Returns 0 on success, -EFAULT when user memory cannot be written and
+ * -E2BIG when non-zero data would have to be truncated.
+ */
+static int sched_read_attr(struct sched_attr __user *uattr,
+                          struct sched_attr *attr,
+                          unsigned int usize)
+{
+       unsigned int ksize = sizeof(*attr);
+       int ret;
+
+       if (!access_ok(VERIFY_WRITE, uattr, usize))
+               return -EFAULT;
+
+       /*
+        * If we're handed a smaller struct than we know of,
+        * ensure all the unknown bits are 0 - i.e. old
+        * user-space does not get incomplete information.
+        */
+       if (usize < sizeof(*attr)) {
+               unsigned char *addr;
+               unsigned char *end;
+
+               addr = (void *)attr + usize;
+               end  = (void *)attr + sizeof(*attr);
+
+               for (; addr < end; addr++) {
+                       if (*addr)
+                               return -E2BIG;
+               }
+
+               attr->size = usize;
+               ksize = usize;
+       }
+
+       /*
+        * Copy ksize bytes, never usize: usize is chosen by user space and
+        * may be larger than the structure, in which case copying usize
+        * bytes would read past *attr and leak kernel memory.
+        */
+       ret = copy_to_user(uattr, attr, ksize);
+       if (ret)
+               return -EFAULT;
+
+       return 0;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ *
+ * Return: -EINVAL for a NULL @uattr, a negative @pid, or a @size outside
+ * [SCHED_ATTR_SIZE_VER0, PAGE_SIZE]; -ESRCH when no task matches @pid; the
+ * error from security_task_getscheduler() if it refuses; otherwise the
+ * result of sched_read_attr().
+ */
+SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+               unsigned int, size)
+{
+       /* every field other than size starts out zeroed */
+       struct sched_attr attr = {
+               .size = sizeof(struct sched_attr),
+       };
+       struct task_struct *p;
+       int retval;
+
+       if (!uattr || pid < 0 || size > PAGE_SIZE ||
+           size < SCHED_ATTR_SIZE_VER0)
+               return -EINVAL;
+
+       rcu_read_lock();
+       p = find_process_by_pid(pid);
+       retval = -ESRCH;
+       if (!p)
+               goto out_unlock;
+
+       retval = security_task_getscheduler(p);
+       if (retval)
+               goto out_unlock;
+
+       attr.sched_policy = p->policy;
+       if (p->sched_reset_on_fork)
+               attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+       /* only the parameters relevant to the task's policy are reported */
+       if (task_has_dl_policy(p))
+               __getparam_dl(p, &attr);
+       else if (task_has_rt_policy(p))
+               attr.sched_priority = p->rt_priority;
+       else
+               attr.sched_nice = TASK_NICE(p);
+
+       rcu_read_unlock();
+
+       /* the copy to user space happens after the RCU read section ends */
+       retval = sched_read_attr(uattr, &attr, size);
+       return retval;
+
+out_unlock:
+       rcu_read_unlock();
+       return retval;
+}
+
  long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
  {
         cpumask_var_t cpus_allowed, new_mask;
@@ -3375,8 +3891,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
         if (retval)
                 goto out_unlock;
  
+
         cpuset_cpus_allowed(p, cpus_allowed);
         cpumask_and(new_mask, in_mask, cpus_allowed);
+
+       /*
+        * Since bandwidth control happens on root_domain basis,
+        * if admission test is enabled, we only admit -deadline
+        * tasks allowed to run on all the CPUs in the task's
+        * root_domain.
+        */
+#ifdef CONFIG_SMP
+       if (task_has_dl_policy(p)) {
+               const struct cpumask *span = task_rq(p)->rd->span;
+
+               if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
+                       retval = -EBUSY;
+                       goto out_unlock;
+               }
+       }
+#endif
  again:
         retval = set_cpus_allowed_ptr(p, new_mask);
  
@@ -3653,7 +4187,7 @@ again:
         }
  
         double_rq_lock(rq, p_rq);
-       while (task_rq(p) != p_rq) {
+       if (task_rq(p) != p_rq) {
                 double_rq_unlock(rq, p_rq);
                 goto again;
         }
@@ -3742,6 +4276,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
         case SCHED_RR:
                 ret = MAX_USER_RT_PRIO-1;
                 break;
+       case SCHED_DEADLINE:
         case SCHED_NORMAL:
         case SCHED_BATCH:
         case SCHED_IDLE:
@@ -3768,6 +4303,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
         case SCHED_RR:
                 ret = 1;
                 break;
+       case SCHED_DEADLINE:
         case SCHED_NORMAL:
         case SCHED_BATCH:
         case SCHED_IDLE:
@@ -3811,7 +4347,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
                 goto out_unlock;
  
         rq = task_rq_lock(p, &flags);
-       time_slice = p->sched_class->get_rr_interval(rq, p);
+       time_slice = 0;
+       if (p->sched_class->get_rr_interval)
+               time_slice = p->sched_class->get_rr_interval(rq, p);
         task_rq_unlock(rq, p, &flags);
  
         rcu_read_unlock();
@@ -4090,6 +4628,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
  
         /* TODO: This is not properly updating schedstats */
  
+       trace_sched_move_numa(p, curr_cpu, target_cpu);
         return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
  }
  
@@ -4514,13 +5053,31 @@ static int sched_cpu_active(struct notifier_block *nfb,
  static int sched_cpu_inactive(struct notifier_block *nfb,
                                         unsigned long action, void *hcpu)
  {
+       unsigned long flags;
+       long cpu = (long)hcpu;
+
         switch (action & ~CPU_TASKS_FROZEN) {
         case CPU_DOWN_PREPARE:
-               set_cpu_active((long)hcpu, false);
+               set_cpu_active(cpu, false);
+
+               /* explicitly allow suspend: the -deadline bandwidth check is skipped when CPU_TASKS_FROZEN is set */
+               if (!(action & CPU_TASKS_FROZEN)) {
+                       struct dl_bw *dl_b = dl_bw_of(cpu);
+                       bool overflow;
+                       int cpus;
+
+                       raw_spin_lock_irqsave(&dl_b->lock, flags);
+                       cpus = dl_bw_cpus(cpu);
+                       overflow = __dl_overflow(dl_b, cpus, 0, 0); /* NOTE(review): presumably "does the admitted bandwidth still fit?" - confirm in __dl_overflow() */
+                       raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+                       if (overflow)
+                               return notifier_from_errno(-EBUSY); /* report -EBUSY to the notifier chain */
+               }
                 return NOTIFY_OK;
-       default:
-               return NOTIFY_DONE;
         }
+
+       return NOTIFY_DONE; /* every action other than CPU_DOWN_PREPARE ends up here */
  }
  
  static int __init migration_init(void)
@@ -4739,6 +5296,8 @@ static void free_rootdomain(struct rcu_head *rcu)
         struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
  
         cpupri_cleanup(&rd->cpupri);
+       cpudl_cleanup(&rd->cpudl);
+       free_cpumask_var(rd->dlo_mask);
         free_cpumask_var(rd->rto_mask);
         free_cpumask_var(rd->online);
         free_cpumask_var(rd->span);
@@ -4790,8 +5349,14 @@ static int init_rootdomain(struct root_domain *rd)
                 goto out;
         if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
                 goto free_span;
-       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+       if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
                 goto free_online;
+       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+               goto free_dlo_mask;
+
+       init_dl_bw(&rd->dl_bw);
+       if (cpudl_init(&rd->cpudl) != 0)
+               goto free_dlo_mask;
  
         if (cpupri_init(&rd->cpupri) != 0)
                 goto free_rto_mask;
@@ -4799,6 +5364,8 @@ static int init_rootdomain(struct root_domain *rd)
  
  free_rto_mask:
         free_cpumask_var(rd->rto_mask);
+free_dlo_mask:
+       free_cpumask_var(rd->dlo_mask);
  free_online:
         free_cpumask_var(rd->online);
  free_span:
@@ -6150,6 +6717,7 @@ void __init sched_init_smp(void)
         free_cpumask_var(non_isolated_cpus);
  
         init_sched_rt_class();
+       init_sched_dl_class();
  }
  #else
  void __init sched_init_smp(void)
@@ -6219,13 +6787,15 @@ void __init sched_init(void)
  #endif /* CONFIG_CPUMASK_OFFSTACK */
         }
  
+       init_rt_bandwidth(&def_rt_bandwidth,
+                       global_rt_period(), global_rt_runtime());
+       init_dl_bandwidth(&def_dl_bandwidth,
+                       global_rt_period(), global_rt_runtime());
+
  #ifdef CONFIG_SMP
         init_defrootdomain();
  #endif
  
-       init_rt_bandwidth(&def_rt_bandwidth,
-                       global_rt_period(), global_rt_runtime());
-
  #ifdef CONFIG_RT_GROUP_SCHED
         init_rt_bandwidth(&root_task_group.rt_bandwidth,
                         global_rt_period(), global_rt_runtime());
@@ -6249,6 +6819,7 @@ void __init sched_init(void)
                 rq->calc_load_update = jiffies + LOAD_FREQ;
                 init_cfs_rq(&rq->cfs);
                 init_rt_rq(&rq->rt, rq);
+               init_dl_rq(&rq->dl, rq);
  #ifdef CONFIG_FAIR_GROUP_SCHED
                 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6320,10 +6891,6 @@ void __init sched_init(void)
         INIT_HLIST_HEAD(&init_task.preempt_notifiers);
  #endif
  
-#ifdef CONFIG_RT_MUTEXES
-       plist_head_init(&init_task.pi_waiters);
-#endif
-
         /*
          * The boot idle thread does lazy MMU switching as well:
          */
@@ -6397,13 +6964,16 @@ EXPORT_SYMBOL(__might_sleep);
  static void normalize_task(struct rq *rq, struct task_struct *p)
  {
         const struct sched_class *prev_class = p->sched_class;
+       struct sched_attr attr = {
+               .sched_policy = SCHED_NORMAL,
+       };
         int old_prio = p->prio;
         int on_rq;
  
         on_rq = p->on_rq;
         if (on_rq)
                 dequeue_task(rq, p, 0);
-       __setscheduler(rq, p, SCHED_NORMAL, 0);
+       __setscheduler(rq, p, &attr);
         if (on_rq) {
                 enqueue_task(rq, p, 0);
                 resched_task(rq->curr);
@@ -6433,7 +7003,7 @@ void normalize_rt_tasks(void)
                 p->se.statistics.block_start    = 0;
  #endif
  
-               if (!rt_task(p)) {
+               if (!dl_task(p) && !rt_task(p)) {
                         /*
                          * Renice negative nice level userspace
                          * tasks back to 0:
@@ -6628,16 +7198,6 @@ void sched_move_task(struct task_struct *tsk)
  }
  #endif /* CONFIG_CGROUP_SCHED */
  
-#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
-static unsigned long to_ratio(u64 period, u64 runtime)
-{
-       if (runtime == RUNTIME_INF)
-               return 1ULL << 20;
-
-       return div64_u64(runtime << 20, period);
-}
-#endif
-
  #ifdef CONFIG_RT_GROUP_SCHED
  /*
   * Ensure that the real time constraints are schedulable.
@@ -6811,24 +7371,13 @@ static long sched_group_rt_period(struct task_group *tg)
         do_div(rt_period_us, NSEC_PER_USEC);
         return rt_period_us;
  }
+#endif /* CONFIG_RT_GROUP_SCHED */
  
+#ifdef CONFIG_RT_GROUP_SCHED
  static int sched_rt_global_constraints(void)
  {
-       u64 runtime, period;
         int ret = 0;
  
-       if (sysctl_sched_rt_period <= 0)
-               return -EINVAL;
-
-       runtime = global_rt_runtime();
-       period = global_rt_period();
-
-       /*
-        * Sanity check on the sysctl variables.
-        */
-       if (runtime > period && runtime != RUNTIME_INF)
-               return -EINVAL;
-
         mutex_lock(&rt_constraints_mutex);
         read_lock(&tasklist_lock);
         ret = __rt_schedulable(NULL, 0, 0);
@@ -6851,17 +7400,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
  static int sched_rt_global_constraints(void)
  {
         unsigned long flags;
-       int i;
-
-       if (sysctl_sched_rt_period <= 0)
-               return -EINVAL;
-
-       /*
-        * There's always some RT tasks in the root group
-        * -- migration, kstopmachine etc..
-        */
-       if (sysctl_sched_rt_runtime == 0)
-               return -EBUSY;
+       int i, ret = 0;
  
         raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
         for_each_possible_cpu(i) {
@@ -6873,36 +7412,88 @@ static int sched_rt_global_constraints(void)
         }
         raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
  
-       return 0;
+       return ret;
  }
  #endif /* CONFIG_RT_GROUP_SCHED */
  
-int sched_rr_handler(struct ctl_table *table, int write,
-               void __user *buffer, size_t *lenp,
-               loff_t *ppos)
+/*
+ * Check that the bandwidth implied by the current global rt
+ * runtime/period is not smaller than the -deadline bandwidth already
+ * allocated (dl_b->total_bw) in the dl_bw reached through any possible CPU.
+ *
+ * Returns 0 when the new value is acceptable, -EBUSY otherwise.
+ */
+static int sched_dl_global_constraints(void)
  {
-       int ret;
-       static DEFINE_MUTEX(mutex);
+       u64 runtime = global_rt_runtime();
+       u64 period = global_rt_period();
+       u64 new_bw = to_ratio(period, runtime);
+       unsigned long flags;
+       int cpu, ret = 0;
  
-       mutex_lock(&mutex);
-       ret = proc_dointvec(table, write, buffer, lenp, ppos);
-       /* make sure that internally we keep jiffies */
-       /* also, writing zero resets timeslice to default */
-       if (!ret && write) {
-               sched_rr_timeslice = sched_rr_timeslice <= 0 ?
-                       RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+       /*
+        * Here we want to check the bandwidth not being set to some
+        * value smaller than the currently allocated bandwidth in
+        * any of the root_domains.
+        *
+        * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+        * cycling on root_domains... Discussion on different/better
+        * solutions is welcome!
+        */
+       for_each_possible_cpu(cpu) {
+               struct dl_bw *dl_b = dl_bw_of(cpu);
+
+               /*
+                * dl_b->lock is taken with interrupts disabled in
+                * sched_cpu_inactive(); use the irqsave variant here
+                * too so the lock is always acquired the same way.
+                */
+               raw_spin_lock_irqsave(&dl_b->lock, flags);
+               if (new_bw < dl_b->total_bw)
+                       ret = -EBUSY;
+               raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+               if (ret)
+                       break;
         }
-       mutex_unlock(&mutex);
+
         return ret;
  }
  
+/*
+ * Propagate the global rt period/runtime to the default -deadline
+ * bandwidth and to the dl_bw reached through every possible CPU.
+ *
+ * When the global runtime is RUNTIME_INF the per-dl_bw limit is stored
+ * as -1; otherwise it is the runtime/period ratio from to_ratio().
+ */
+static void sched_dl_do_global(void)
+{
+       u64 new_bw = -1;
+       unsigned long flags;
+       int cpu;
+
+       def_dl_bandwidth.dl_period = global_rt_period();
+       def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+       if (global_rt_runtime() != RUNTIME_INF)
+               new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+       /*
+        * FIXME: As above...
+        */
+       for_each_possible_cpu(cpu) {
+               struct dl_bw *dl_b = dl_bw_of(cpu);
+
+               /*
+                * dl_b->lock is taken with interrupts disabled in
+                * sched_cpu_inactive(); use the irqsave variant here
+                * too so the lock is always acquired the same way.
+                */
+               raw_spin_lock_irqsave(&dl_b->lock, flags);
+               dl_b->bw = new_bw;
+               raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+       }
+}
+
+/*
+ * Sanity check the rt sysctl pair: the period has to be strictly
+ * positive and the runtime must not exceed it.
+ *
+ * Returns 0 when both hold, -EINVAL otherwise.
+ */
+static int sched_rt_global_validate(void)
+{
+       bool period_ok = sysctl_sched_rt_period > 0;
+       bool runtime_ok = sysctl_sched_rt_runtime <= sysctl_sched_rt_period;
+
+       return (period_ok && runtime_ok) ? 0 : -EINVAL;
+}
+
+/* Copy the current global rt runtime and period into def_rt_bandwidth. */
+static void sched_rt_do_global(void)
+{
+       u64 runtime = global_rt_runtime();
+       u64 period_ns = global_rt_period();
+
+       def_rt_bandwidth.rt_runtime = runtime;
+       def_rt_bandwidth.rt_period = ns_to_ktime(period_ns);
+}
+
  int sched_rt_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *lenp,
                 loff_t *ppos)
  {
-       int ret;
         int old_period, old_runtime;
         static DEFINE_MUTEX(mutex);
+       int ret;
  
         mutex_lock(&mutex);
         old_period = sysctl_sched_rt_period;
@@ -6911,21 +7502,50 @@ int sched_rt_handler(struct ctl_table *table, int write,
         ret = proc_dointvec(table, write, buffer, lenp, ppos);
  
         if (!ret && write) {
+               ret = sched_rt_global_validate();
+               if (ret)
+                       goto undo;
+
                 ret = sched_rt_global_constraints();
-               if (ret) {
-                       sysctl_sched_rt_period = old_period;
-                       sysctl_sched_rt_runtime = old_runtime;
-               } else {
-                       def_rt_bandwidth.rt_runtime = global_rt_runtime();
-                       def_rt_bandwidth.rt_period =
-                               ns_to_ktime(global_rt_period());
-               }
+               if (ret)
+                       goto undo;
+
+               ret = sched_dl_global_constraints();
+               if (ret)
+                       goto undo;
+
+               sched_rt_do_global();
+               sched_dl_do_global();
+       }
+       if (0) {
+undo:
+               sysctl_sched_rt_period = old_period;
+               sysctl_sched_rt_runtime = old_runtime;
         }
         mutex_unlock(&mutex);
  
         return ret;
  }
  
+/*
+ * sysctl handler for the round-robin timeslice.
+ *
+ * Runs proc_dointvec() under a handler-private mutex.  After a
+ * successful write the stored value is converted with
+ * msecs_to_jiffies(); a value <= 0 selects RR_TIMESLICE instead.
+ *
+ * Returns whatever proc_dointvec() returned.
+ */
+int sched_rr_handler(struct ctl_table *table, int write,
+               void __user *buffer, size_t *lenp,
+               loff_t *ppos)
+{
+       static DEFINE_MUTEX(mutex);
+       int ret;
+
+       mutex_lock(&mutex);
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
+       if (ret || !write)
+               goto out;
+
+       /* make sure that internally we keep jiffies */
+       /* also, writing zero resets timeslice to default */
+       if (sched_rr_timeslice <= 0)
+               sched_rr_timeslice = RR_TIMESLICE;
+       else
+               sched_rr_timeslice = msecs_to_jiffies(sched_rr_timeslice);
+out:
+       mutex_unlock(&mutex);
+       return ret;
+}
+
  #ifdef CONFIG_CGROUP_SCHED
  
  static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -7258,15 +7878,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
         return ret;
  }
  
-static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft,
-               struct cgroup_map_cb *cb)
+static int cpu_stats_show(struct seq_file *sf, void *v) /* prints the group's CFS bandwidth counters into sf; v is not used */
  {
-       struct task_group *tg = css_tg(css);
+       struct task_group *tg = css_tg(seq_css(sf)); /* the css now comes from the seq_file instead of a parameter */
         struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
  
-       cb->fill(cb, "nr_periods", cfs_b->nr_periods);
-       cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
-       cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+       /* one "name value" line per counter, same names as the old cb->fill() keys */
+       seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
+       seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
+       seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
  
         return 0;
  }
@@ -7320,7 +7939,7 @@ static struct cftype cpu_files[] = {
         },
         {
                 .name = "stat",
-               .read_map = cpu_stats_show,
+               .seq_show = cpu_stats_show,
         },
  #endif
  #ifdef CONFIG_RT_GROUP_SCHED