#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
+#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
unsigned int clock_warps, clock_overflows;
unsigned int clock_unstable_events;
- struct sched_class *load_balance_class;
-
atomic_t nr_iowait;
#ifdef CONFIG_SMP
struct lock_class_key rq_lock_key;
};
-static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static DEFINE_MUTEX(sched_hotcpu_mutex);
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+/*
+ * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
+ * clock constructed from sched_clock():
+ */
+unsigned long long cpu_clock(int cpu)
+{
+ unsigned long long now;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ now = rq_clock(cpu_rq(cpu));
+ local_irq_restore(flags);
+
+ return now;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Change a task's ->cfs_rq if it moves across CPUs */
static inline void set_task_cfs_rq(struct task_struct *p)
INIT_LIST_HEAD(&p->run_list);
p->se.on_rq = 0;
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+
/*
* We mark the process as running here, but have not actually
* inserted it onto the runqueue yet. This guarantees that
task_rq_unlock(rq, &flags);
}
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+/**
+ * preempt_notifier_register - tell me when current is being being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+ hlist_del(¬ifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ struct preempt_notifier *notifier;
+ struct hlist_node *node;
+
+ hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ struct preempt_notifier *notifier;
+ struct hlist_node *node;
+
+ hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+ notifier->ops->sched_out(notifier, next);
+}
+
+#else
+
+static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+}
+
+#endif
+
/**
* prepare_task_switch - prepare to switch tasks
* @rq: the runqueue preparing to switch
+ * @prev: the current task that is being switched out
* @next: the task we are going to switch to.
*
* This is called with the rq lock held and interrupts off. It must
* prepare_task_switch sets up locking and calls architecture specific
* hooks.
*/
-static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+ struct task_struct *next)
{
+ fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
}
prev_state = prev->state;
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
+ fire_sched_in_preempt_notifiers(current);
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
{
struct mm_struct *mm, *oldmm;
- prepare_task_switch(rq, next);
+ prepare_task_switch(rq, prev, next);
mm = next->mm;
oldmm = prev->active_mm;
/*
rq = cpu_rq(i);
- if (*sd_idle && !idle_cpu(i))
+ if (*sd_idle && rq->nr_running)
*sd_idle = 0;
/* Bias balancing toward cpus of our domain */
/*
* First idle cpu or the first cpu(busiest) in this sched group
* is eligible for doing load balancing at this and above
- * domains.
+ * domains. In the newly idle case, we will allow all the cpu's
+ * to do the newly idle load balance.
*/
- if (local_group && balance_cpu != this_cpu && balance) {
+ if (idle != CPU_NEWLY_IDLE && local_group &&
+ balance_cpu != this_cpu && balance) {
*balance = 0;
goto ret;
}
unsigned long imbalance;
int nr_moved = 0;
int sd_idle = 0;
+ int all_pinned = 0;
cpumask_t cpus = CPU_MASK_ALL;
/*
double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
minus_1_or_zero(busiest->nr_running),
- imbalance, sd, CPU_NEWLY_IDLE, NULL);
+ imbalance, sd, CPU_NEWLY_IDLE,
+ &all_pinned);
spin_unlock(&busiest->lock);
- if (!nr_moved) {
+ if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), cpus);
if (!cpus_empty(cpus))
goto redo;
if (!next)
break;
migrate_dead(dead_cpu, next);
+
}
}
#endif /* CONFIG_HOTPLUG_CPU */
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+ {CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, },
+ {0,},
+};
+
+static struct ctl_table sd_ctl_root[] = {
+ {CTL_UNNUMBERED, "kernel", NULL, 0, 0755, sd_ctl_dir, },
+ {0,},
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+ struct ctl_table *entry =
+ kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL);
+
+ BUG_ON(!entry);
+ memset(entry, 0, n * sizeof(struct ctl_table));
+
+ return entry;
+}
+
+static void
+set_table_entry(struct ctl_table *entry, int ctl_name,
+ const char *procname, void *data, int maxlen,
+ mode_t mode, proc_handler *proc_handler)
+{
+ entry->ctl_name = ctl_name;
+ entry->procname = procname;
+ entry->data = data;
+ entry->maxlen = maxlen;
+ entry->mode = mode;
+ entry->proc_handler = proc_handler;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+ set_table_entry(&table[0], 1, "min_interval", &sd->min_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[1], 2, "max_interval", &sd->max_interval,
+ sizeof(long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[9], 10, "cache_hot_time", &sd->cache_hot_time,
+ sizeof(long long), 0644, proc_doulongvec_minmax);
+ set_table_entry(&table[10], 11, "cache_nice_tries",
+ &sd->cache_nice_tries,
+ sizeof(int), 0644, proc_dointvec_minmax);
+ set_table_entry(&table[12], 13, "flags", &sd->flags,
+ sizeof(int), 0644, proc_dointvec_minmax);
+
+ return table;
+}
+
+static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+ struct ctl_table *entry, *table;
+ struct sched_domain *sd;
+ int domain_num = 0, i;
+ char buf[32];
+
+ for_each_domain(cpu, sd)
+ domain_num++;
+ entry = table = sd_alloc_ctl_entry(domain_num + 1);
+
+ i = 0;
+ for_each_domain(cpu, sd) {
+ snprintf(buf, 32, "domain%d", i);
+ entry->ctl_name = i + 1;
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0755;
+ entry->child = sd_alloc_ctl_domain_table(sd);
+ entry++;
+ i++;
+ }
+ return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+static void init_sched_domain_sysctl(void)
+{
+ int i, cpu_num = num_online_cpus();
+ struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ char buf[32];
+
+ sd_ctl_dir[0].child = entry;
+
+ for (i = 0; i < cpu_num; i++, entry++) {
+ snprintf(buf, 32, "cpu%d", i);
+ entry->ctl_name = i + 1;
+ entry->procname = kstrdup(buf, GFP_KERNEL);
+ entry->mode = 0755;
+ entry->child = sd_alloc_ctl_cpu_table(i);
+ }
+ sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+#else
+static void init_sched_domain_sysctl(void)
+{
+}
+#endif
+
/*
* migration_call - callback that gets triggered when a CPU is added.
* Here we can start up the necessary migration thread for the new CPU.
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);
+ init_sched_domain_sysctl();
+
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed(current, non_isolated_cpus) < 0)
BUG();
set_load_weight(&init_task);
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
#ifdef CONFIG_SMP
nr_cpu_ids = highest_cpu + 1;
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);