sched/deadline: Remove the sysctl_sched_dl knobs

[~andy/linux] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index aebcc70b5c93f79174a5885f26e0208f40240197..1d33eb8143cc0ff3098ac80c6860524e06518c6d 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,6 +296,15 @@ __read_mostly int scheduler_running;
   */
  int sysctl_sched_rt_runtime = 950000;
  
+/*
+ * Maximum bandwidth available for all -deadline tasks and groups
+ * (if group scheduling is configured) on each CPU, given as
+ * sysctl_sched_dl_runtime out of every sysctl_sched_dl_period.
+ *
+ * default: 5% (50000 / 1000000)
+ *
+ * NOTE(review): none of the hunks of this patch reads these two variables;
+ * the -deadline limits set up further down are computed from
+ * global_rt_runtime() and global_rt_period(). Confirm that the variables
+ * are still needed.
+ */
+unsigned int sysctl_sched_dl_period = 1000000;
+int sysctl_sched_dl_runtime = 50000;
+
  
  
  /*
@@ -947,7 +956,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                 if (prev_class->switched_from)
                         prev_class->switched_from(rq, p);
                 p->sched_class->switched_to(rq, p);
-       } else if (oldprio != p->prio)
+       } else if (oldprio != p->prio || dl_task(p))
                 p->sched_class->prio_changed(rq, p, oldprio);
  }
  
@@ -1856,6 +1865,111 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
         return 0;
  }
  
+/*
+ * Express @runtime / @period as a fixed-point fraction with 20 fractional
+ * bits, so that a result of 1 << 20 stands for a ratio of one.
+ */
+unsigned long to_ratio(u64 period, u64 runtime)
+{
+       u64 scaled;
+
+       /* An infinite runtime is reported as a ratio of exactly one. */
+       if (runtime == RUNTIME_INF)
+               return 1ULL << 20;
+
+       /*
+        * A zero period gives zero instead of a division by zero, which
+        * spares every caller a check of its own.
+        */
+       if (!period)
+               return 0;
+
+       scaled = runtime << 20;
+
+       return div64_u64(scaled, period);
+}
+
+/*
+ * Return the -deadline bandwidth bookkeeping that covers CPU @i: the one
+ * kept in the CPU's root_domain on SMP, the runqueue's own otherwise.
+ */
+inline struct dl_bw *dl_bw_of(int i)
+{
+#ifdef CONFIG_SMP
+       return &cpu_rq(i)->rd->dl_bw;
+#else
+       return &cpu_rq(i)->dl.dl_bw;
+#endif
+}
+
+/*
+ * Number of CPUs in the span of @rq's root_domain; always 1 when the
+ * kernel is built without SMP.
+ */
+static inline int __dl_span_weight(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+       return cpumask_weight(rq->rd->span);
+#else
+       return 1;
+#endif
+}
+
+/* Give @tsk_bw back: subtract it from the total accounted in @dl_b. */
+static inline void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+       dl_b->total_bw = dl_b->total_bw - tsk_bw;
+}
+
+/* Charge @tsk_bw: add it to the total accounted in @dl_b. */
+static inline void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+       dl_b->total_bw = dl_b->total_bw + tsk_bw;
+}
+
+/*
+ * Report whether replacing @old_bw with @new_bw would push the total
+ * accounted in @dl_b past its limit, the limit being @dl_b->bw for each
+ * of the @cpus CPUs. A limit of -1 can never be exceeded.
+ */
+static inline bool
+__dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+       if (dl_b->bw == -1)
+               return false;
+
+       return dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
+/*
+ * Admission control for -deadline bandwidth: decide whether @p may take
+ * (or keep, with different parameters) the bandwidth described by @policy
+ * and @attr and, if so, bring the currently allocated total up to date.
+ *
+ * Returns 0 if the change is accepted or changes nothing, -1 if it does
+ * not fit or if neither the old nor the new policy is a -deadline one.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+static int dl_overflow(struct task_struct *p, int policy,
+                      const struct sched_attr *attr)
+{
+       struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+       u64 period = attr->sched_period;
+       u64 runtime = attr->sched_runtime;
+       u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+       int cpus = __dl_span_weight(task_rq(p));
+       int err = -1;
+
+       /* Same bandwidth as before: there is nothing to account. */
+       if (p->dl.dl_bw == new_bw)
+               return 0;
+
+       /*
+        * Whether the task enters -deadline, leaves it, or stays but with
+        * new parameters, the total allocated in the container may have
+        * to follow.
+        */
+       raw_spin_lock(&dl_b->lock);
+       if (!dl_policy(policy)) {
+               /* Leaving: hand back what the task was using. */
+               if (task_has_dl_policy(p)) {
+                       __dl_clear(dl_b, p->dl.dl_bw);
+                       err = 0;
+               }
+       } else if (!task_has_dl_policy(p)) {
+               /* Entering: the whole of new_bw has to fit. */
+               if (!__dl_overflow(dl_b, cpus, 0, new_bw)) {
+                       __dl_add(dl_b, new_bw);
+                       err = 0;
+               }
+       } else if (!__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+               /* Staying: swap the old share for the new one. */
+               __dl_clear(dl_b, p->dl.dl_bw);
+               __dl_add(dl_b, new_bw);
+               err = 0;
+       }
+       raw_spin_unlock(&dl_b->lock);
+
+       return err;
+}
+
+extern void init_dl_bw(struct dl_bw *dl_b);
+
  /*
   * wake_up_new_task - wake up a newly created task for the first time.
   *
@@ -2781,7 +2895,7 @@ EXPORT_SYMBOL(sleep_on_timeout);
   */
  void rt_mutex_setprio(struct task_struct *p, int prio)
  {
-       int oldprio, on_rq, running;
+       int oldprio, on_rq, running, enqueue_flag = 0;
         struct rq *rq;
         const struct sched_class *prev_class;
  
@@ -2808,6 +2922,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         }
  
         trace_sched_pi_setprio(p, prio);
+       p->pi_top_task = rt_mutex_get_top_task(p);
         oldprio = p->prio;
         prev_class = p->sched_class;
         on_rq = p->on_rq;
@@ -2817,19 +2932,42 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         if (running)
                 p->sched_class->put_prev_task(rq, p);
  
-       if (dl_prio(prio))
+       /*
+        * Boosting conditions are:
+        * 1. -rt task is running and holds mutex A
+        *      --> -dl task blocks on mutex A
+        *
+        * 2. -dl task is running and holds mutex A
+        *      --> -dl task blocks on mutex A and could preempt the
+        *          running task
+        */
+       if (dl_prio(prio)) {
+               if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
+                       dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+                       p->dl.dl_boosted = 1;
+                       p->dl.dl_throttled = 0;
+                       enqueue_flag = ENQUEUE_REPLENISH;
+               } else
+                       p->dl.dl_boosted = 0;
                 p->sched_class = &dl_sched_class;
-       else if (rt_prio(prio))
+       } else if (rt_prio(prio)) {
+               if (dl_prio(oldprio))
+                       p->dl.dl_boosted = 0;
+               if (oldprio < prio)
+                       enqueue_flag = ENQUEUE_HEAD;
                 p->sched_class = &rt_sched_class;
-       else
+       } else {
+               if (dl_prio(oldprio))
+                       p->dl.dl_boosted = 0;
                 p->sched_class = &fair_sched_class;
+       }
  
         p->prio = prio;
  
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (on_rq)
-               enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
+               enqueue_task(rq, p, enqueue_flag);
  
         check_class_changed(rq, p, prev_class, oldprio);
  out_unlock:
@@ -3029,6 +3167,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
         dl_se->dl_deadline = attr->sched_deadline;
         dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
         dl_se->flags = attr->sched_flags;
+       dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
         dl_se->dl_throttled = 0;
         dl_se->dl_new = 1;
  }
@@ -3077,7 +3216,9 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
   * This function validates the new parameters of a -deadline task.
   * We ask for the deadline not being zero, and greater or equal
   * than the runtime, as well as the period of being zero or
- * greater than deadline.
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution (1us); we
+ * check sched_runtime only since it is always the smaller one.
   */
  static bool
  __checkparam_dl(const struct sched_attr *attr)
@@ -3085,7 +3226,8 @@ __checkparam_dl(const struct sched_attr *attr)
         return attr && attr->sched_deadline != 0 &&
                 (attr->sched_period == 0 ||
                 (s64)(attr->sched_period   - attr->sched_deadline) >= 0) &&
-               (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0;
+               (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0  &&
+               attr->sched_runtime >= (2 << (DL_SCALE - 1));
  }
  
  /*
@@ -3226,8 +3368,8 @@ recheck:
         }
  change:
  
-#ifdef CONFIG_RT_GROUP_SCHED
         if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
                 /*
                  * Do not allow realtime tasks into groups that have no runtime
                  * assigned.
@@ -3238,8 +3380,24 @@ change:
                         task_rq_unlock(rq, p, &flags);
                         return -EPERM;
                 }
-       }
  #endif
+#ifdef CONFIG_SMP
+               if (dl_bandwidth_enabled() && dl_policy(policy)) {
+                       cpumask_t *span = rq->rd->span;
+
+                       /*
+                        * Don't allow tasks with an affinity mask smaller than
+                        * the entire root_domain to become SCHED_DEADLINE. We
+                        * will also fail if there's no bandwidth available.
+                        */
+                       if (!cpumask_subset(span, &p->cpus_allowed) ||
+                           rq->rd->dl_bw.bw == 0) {
+                               task_rq_unlock(rq, p, &flags);
+                               return -EPERM;
+                       }
+               }
+#endif
+       }
  
         /* recheck policy now with rq lock held */
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3247,6 +3405,17 @@ change:
                 task_rq_unlock(rq, p, &flags);
                 goto recheck;
         }
+
+       /*
+        * If setscheduling to SCHED_DEADLINE (or changing the parameters
+        * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+        * is available.
+        */
+       if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+               task_rq_unlock(rq, p, &flags);
+               return -EBUSY;
+       }
+
         on_rq = p->on_rq;
         running = task_current(rq, p);
         if (on_rq)
@@ -3681,8 +3850,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
         if (retval)
                 goto out_unlock;
  
+
         cpuset_cpus_allowed(p, cpus_allowed);
         cpumask_and(new_mask, in_mask, cpus_allowed);
+
+       /*
+        * Since bandwidth control happens on root_domain basis,
+        * if admission test is enabled, we only admit -deadline
+        * tasks allowed to run on all the CPUs in the task's
+        * root_domain.
+        */
+#ifdef CONFIG_SMP
+       if (task_has_dl_policy(p)) {
+               const struct cpumask *span = task_rq(p)->rd->span;
+
+               if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
+                       retval = -EBUSY;
+                       goto out_unlock;
+               }
+       }
+#endif
  again:
         retval = set_cpus_allowed_ptr(p, new_mask);
  
@@ -4334,6 +4521,42 @@ out:
  }
  EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
  
+/*
+ * Move the bandwidth of -deadline task @p from the dl_bw covering its
+ * current CPU to the dl_bw covering @cpu.
+ *
+ * Returns true if both CPUs share one dl_bw or if the transfer was done,
+ * false if bandwidth control is enabled and the destination cannot take
+ * p's bandwidth; in that case nothing is changed and the caller is
+ * expected to give up on the migration.
+ *
+ * Both dl_bw locks are needed. They are always taken in address order, so
+ * that two transfers going in opposite directions cannot deadlock on each
+ * other.
+ */
+static inline
+bool set_task_cpu_dl(struct task_struct *p, unsigned int cpu)
+{
+       struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+       struct dl_bw *cpu_b = dl_bw_of(cpu);
+       struct dl_bw *first = dl_b, *second = cpu_b;
+       bool ret = true;
+       u64 bw;
+
+       if (dl_b == cpu_b)
+               return true;
+
+       if (cpu_b < dl_b) {
+               first = cpu_b;
+               second = dl_b;
+       }
+
+       raw_spin_lock(&first->lock);
+       /* Second lock of the same kind: annotate the nesting for lockdep. */
+       raw_spin_lock_nested(&second->lock, SINGLE_DEPTH_NESTING);
+
+       /* The destination may hand out its per-CPU limit once per CPU. */
+       bw = cpu_b->bw * cpumask_weight(cpu_rq(cpu)->rd->span);
+       if (dl_bandwidth_enabled() &&
+           bw < cpu_b->total_bw + p->dl.dl_bw) {
+               ret = false;
+               goto unlock;
+       }
+       dl_b->total_bw -= p->dl.dl_bw;
+       cpu_b->total_bw += p->dl.dl_bw;
+
+unlock:
+       raw_spin_unlock(&second->lock);
+       raw_spin_unlock(&first->lock);
+
+       return ret;
+}
+
  /*
   * Move (not current) task off this cpu, onto dest cpu. We're doing
   * this because either it can't run here any more (set_cpus_allowed()
@@ -4365,6 +4588,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
         if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
                 goto fail;
  
+       /*
+        * If p is -deadline, proceed only if there is enough
+        * bandwidth available on dest_cpu
+        */
+       if (unlikely(dl_task(p)) && !set_task_cpu_dl(p, dest_cpu))
+               goto fail;
+
         /*
          * If we're not on a rq, the next wake-up will ensure we're
          * placed properly.
@@ -5047,6 +5277,7 @@ static void free_rootdomain(struct rcu_head *rcu)
         struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
  
         cpupri_cleanup(&rd->cpupri);
+       cpudl_cleanup(&rd->cpudl);
         free_cpumask_var(rd->dlo_mask);
         free_cpumask_var(rd->rto_mask);
         free_cpumask_var(rd->online);
@@ -5104,6 +5335,10 @@ static int init_rootdomain(struct root_domain *rd)
         if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
                 goto free_dlo_mask;
  
+       init_dl_bw(&rd->dl_bw);
+       if (cpudl_init(&rd->cpudl) != 0)
+               goto free_dlo_mask;
+
         if (cpupri_init(&rd->cpupri) != 0)
                 goto free_rto_mask;
         return 0;
@@ -6533,13 +6768,15 @@ void __init sched_init(void)
  #endif /* CONFIG_CPUMASK_OFFSTACK */
         }
  
+       init_rt_bandwidth(&def_rt_bandwidth,
+                       global_rt_period(), global_rt_runtime());
+       init_dl_bandwidth(&def_dl_bandwidth,
+                       global_rt_period(), global_rt_runtime());
+
  #ifdef CONFIG_SMP
         init_defrootdomain();
  #endif
  
-       init_rt_bandwidth(&def_rt_bandwidth,
-                       global_rt_period(), global_rt_runtime());
-
  #ifdef CONFIG_RT_GROUP_SCHED
         init_rt_bandwidth(&root_task_group.rt_bandwidth,
                         global_rt_period(), global_rt_runtime());
@@ -6942,16 +7179,6 @@ void sched_move_task(struct task_struct *tsk)
  }
  #endif /* CONFIG_CGROUP_SCHED */
  
-#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
-static unsigned long to_ratio(u64 period, u64 runtime)
-{
-       if (runtime == RUNTIME_INF)
-               return 1ULL << 20;
-
-       return div64_u64(runtime << 20, period);
-}
-#endif
-
  #ifdef CONFIG_RT_GROUP_SCHED
  /*
   * Ensure that the real time constraints are schedulable.
@@ -7125,24 +7352,13 @@ static long sched_group_rt_period(struct task_group *tg)
         do_div(rt_period_us, NSEC_PER_USEC);
         return rt_period_us;
  }
+#endif /* CONFIG_RT_GROUP_SCHED */
  
+#ifdef CONFIG_RT_GROUP_SCHED
  static int sched_rt_global_constraints(void)
  {
-       u64 runtime, period;
         int ret = 0;
  
-       if (sysctl_sched_rt_period <= 0)
-               return -EINVAL;
-
-       runtime = global_rt_runtime();
-       period = global_rt_period();
-
-       /*
-        * Sanity check on the sysctl variables.
-        */
-       if (runtime > period && runtime != RUNTIME_INF)
-               return -EINVAL;
-
         mutex_lock(&rt_constraints_mutex);
         read_lock(&tasklist_lock);
         ret = __rt_schedulable(NULL, 0, 0);
@@ -7165,17 +7381,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
  static int sched_rt_global_constraints(void)
  {
         unsigned long flags;
-       int i;
-
-       if (sysctl_sched_rt_period <= 0)
-               return -EINVAL;
-
-       /*
-        * There's always some RT tasks in the root group
-        * -- migration, kstopmachine etc..
-        */
-       if (sysctl_sched_rt_runtime == 0)
-               return -EBUSY;
+       int i, ret = 0;
  
         raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
         for_each_possible_cpu(i) {
@@ -7187,36 +7393,88 @@ static int sched_rt_global_constraints(void)
         }
         raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
  
-       return 0;
+       return ret;
  }
  #endif /* CONFIG_RT_GROUP_SCHED */
  
-int sched_rr_handler(struct ctl_table *table, int write,
-               void __user *buffer, size_t *lenp,
-               loff_t *ppos)
+/*
+ * Check that the -deadline limit following from the current global rt
+ * runtime and period still covers the bandwidth that is allocated already.
+ *
+ * Returns 0 if it does, -EBUSY if some dl_bw has more bandwidth allocated
+ * than the new limit would allow.
+ */
+static int sched_dl_global_constraints(void)
  {
-       int ret;
-       static DEFINE_MUTEX(mutex);
+       u64 runtime = global_rt_runtime();
+       u64 period = global_rt_period();
+       u64 new_bw = to_ratio(period, runtime);
+       int cpu, ret = 0;
  
-       mutex_lock(&mutex);
-       ret = proc_dointvec(table, write, buffer, lenp, ppos);
-       /* make sure that internally we keep jiffies */
-       /* also, writing zero resets timeslice to default */
-       if (!ret && write) {
-               sched_rr_timeslice = sched_rr_timeslice <= 0 ?
-                       RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+       /*
+        * An infinite runtime ends up as "no limit" in sched_dl_do_global(),
+        * and no allocation can exceed that.
+        */
+       if (runtime == RUNTIME_INF)
+               return 0;
+
+       /*
+        * Here we want to check the bandwidth not being set to some
+        * value smaller than the currently allocated bandwidth in
+        * any of the root_domains.
+        *
+        * total_bw is the sum over all the CPUs sharing a dl_bw, so the
+        * per-CPU ratio has to be scaled by their number first, exactly
+        * as __dl_overflow() does when admitting a task.
+        *
+        * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+        * cycling on root_domains... Discussion on different/better
+        * solutions is welcome!
+        */
+       for_each_possible_cpu(cpu) {
+               struct dl_bw *dl_b = dl_bw_of(cpu);
+               int cpus = __dl_span_weight(cpu_rq(cpu));
+
+               raw_spin_lock(&dl_b->lock);
+               if (new_bw * cpus < dl_b->total_bw)
+                       ret = -EBUSY;
+               raw_spin_unlock(&dl_b->lock);
+
+               if (ret)
+                       break;
         }
-       mutex_unlock(&mutex);
+
         return ret;
  }
  
+/*
+ * Propagate the current global rt runtime and period to the default
+ * -deadline bandwidth and to the limit stored in every dl_bw.
+ */
+static void sched_dl_do_global(void)
+{
+       u64 new_bw;
+       int cpu;
+
+       def_dl_bandwidth.dl_period = global_rt_period();
+       def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+       /* An infinite runtime is stored as a limit of -1. */
+       if (global_rt_runtime() == RUNTIME_INF)
+               new_bw = -1;
+       else
+               new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+       /*
+        * FIXME: every possible CPU is visited although CPUs may share a
+        * dl_bw; that is more work than needed, but simple.
+        */
+       for_each_possible_cpu(cpu) {
+               struct dl_bw *cur = dl_bw_of(cpu);
+
+               raw_spin_lock(&cur->lock);
+               cur->bw = new_bw;
+               raw_spin_unlock(&cur->lock);
+       }
+}
+
+/*
+ * Sanity check the rt sysctl values before anything acts on them.
+ *
+ * Returns 0 if they are usable, -EINVAL if the period is not positive or
+ * if a finite runtime is larger than the period.
+ */
+static int sched_rt_global_validate(void)
+{
+       if (sysctl_sched_rt_period <= 0)
+               return -EINVAL;
+
+       /*
+        * RUNTIME_INF asks for no limit at all and must not be measured
+        * against the period: depending on the types involved, it may be
+        * converted to a huge unsigned value and be rejected by mistake.
+        */
+       if (sysctl_sched_rt_runtime != RUNTIME_INF &&
+           sysctl_sched_rt_runtime > sysctl_sched_rt_period)
+               return -EINVAL;
+
+       return 0;
+}
+
+/* Store the current global rt period and runtime in def_rt_bandwidth. */
+static void sched_rt_do_global(void)
+{
+       def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+       def_rt_bandwidth.rt_runtime = global_rt_runtime();
+}
+
  int sched_rt_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *lenp,
                 loff_t *ppos)
  {
-       int ret;
         int old_period, old_runtime;
         static DEFINE_MUTEX(mutex);
+       int ret;
  
         mutex_lock(&mutex);
         old_period = sysctl_sched_rt_period;
@@ -7225,21 +7483,50 @@ int sched_rt_handler(struct ctl_table *table, int write,
         ret = proc_dointvec(table, write, buffer, lenp, ppos);
  
         if (!ret && write) {
+               ret = sched_rt_global_validate();
+               if (ret)
+                       goto undo;
+
                 ret = sched_rt_global_constraints();
-               if (ret) {
-                       sysctl_sched_rt_period = old_period;
-                       sysctl_sched_rt_runtime = old_runtime;
-               } else {
-                       def_rt_bandwidth.rt_runtime = global_rt_runtime();
-                       def_rt_bandwidth.rt_period =
-                               ns_to_ktime(global_rt_period());
-               }
+               if (ret)
+                       goto undo;
+
+               ret = sched_dl_global_constraints();
+               if (ret)
+                       goto undo;
+
+               sched_rt_do_global();
+               sched_dl_do_global();
+       }
+       if (0) {
+undo:
+               sysctl_sched_rt_period = old_period;
+               sysctl_sched_rt_runtime = old_runtime;
         }
         mutex_unlock(&mutex);
  
         return ret;
  }
  
+/*
+ * sysctl handler for sched_rr_timeslice: lets proc_dointvec() do the
+ * transfer and, after a successful write, converts the value just written.
+ * Returns whatever proc_dointvec() returned.
+ */
+int sched_rr_handler(struct ctl_table *table, int write,
+               void __user *buffer, size_t *lenp,
+               loff_t *ppos)
+{
+       static DEFINE_MUTEX(mutex);
+       int ret;
+
+       mutex_lock(&mutex);
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
+       if (ret || !write)
+               goto out;
+
+       /*
+        * The value was written in milliseconds, but jiffies are kept
+        * internally; zero or less selects the default timeslice.
+        */
+       if (sched_rr_timeslice <= 0)
+               sched_rr_timeslice = RR_TIMESLICE;
+       else
+               sched_rr_timeslice = msecs_to_jiffies(sched_rr_timeslice);
+out:
+       mutex_unlock(&mutex);
+       return ret;
+}
+
  #ifdef CONFIG_CGROUP_SCHED
  
  static inline struct task_group *css_tg(struct cgroup_subsys_state *css)