sched: Fix /proc/sched_debug failure on very very large systems
author Nathan Zimmer <nzimmer@sgi.com>
Thu, 21 Feb 2013 23:15:09 +0000 (15:15 -0800)
committer Ingo Molnar <mingo@kernel.org>
Fri, 22 Feb 2013 09:27:25 +0000 (10:27 +0100)
On systems with 4096 cores, attempting to read /proc/sched_debug
fails because we are trying to push all the data into a single
kmalloc buffer.

The issue is that on these very large machines all of the data
will not fit in 4 MB.

A better solution is to not use the single_open() mechanism but
to provide our own seq_operations and treat each cpu as an
individual record.
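
For illustration only (not part of this patch), below is a minimal
sketch of the seq_operations pattern being adopted; all demo_* names
are invented, and it targets the same 2013-era proc interface
(proc_create() with a struct file_operations) used by this file. Each
record is emitted by its own ->show() call, so the seq_file core never
needs one buffer sized for the whole output:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static const char *demo_items[] = { "alpha", "beta", "gamma" };

/* ->start(): map the current position to a record, or NULL at the end. */
static void *demo_start(struct seq_file *m, loff_t *pos)
{
	if (*pos >= ARRAY_SIZE(demo_items))
		return NULL;
	return (void *)&demo_items[*pos];
}

/* ->next(): advance the position and hand back the following record. */
static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	(*pos)++;
	return demo_start(m, pos);
}

static void demo_stop(struct seq_file *m, void *v)
{
}

/* ->show(): print exactly one record per call. */
static int demo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", *(const char **)v);
	return 0;
}

static const struct seq_operations demo_sops = {
	.start = demo_start,
	.next  = demo_next,
	.stop  = demo_stop,
	.show  = demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &demo_sops);
}

static const struct file_operations demo_fops = {
	.open    = demo_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init demo_init(void)
{
	proc_create("demo_seq", 0444, NULL, &demo_fops);
	return 0;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("demo_seq", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Because ->show() runs once per record, the seq_file buffer only ever
has to grow to the size of the largest single record, which is what
lets the patch drop the single oversized allocation.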

The output should be identical to the previous version.

Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
[ Whitespace fixlet ]
[ Fix spello in comment ]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
kernel/sched/debug.c

index 7ae4c4c5420e65b4aea1625d37a8a28b663072fb..c496eb3c6459874411caea0a8f9481c66763a8f6 100644
@@ -269,11 +269,11 @@ static void print_cpu(struct seq_file *m, int cpu)
        {
                unsigned int freq = cpu_khz ? : 1;
 
-               SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+               SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
                           cpu, freq / 1000, (freq % 1000));
        }
 #else
-       SEQ_printf(m, "\ncpu#%d\n", cpu);
+       SEQ_printf(m, "cpu#%d\n", cpu);
 #endif
 
 #define P(x)                                                           \
@@ -330,6 +330,7 @@ do {                                                                        \
        print_rq(m, rq, cpu);
        rcu_read_unlock();
        spin_unlock_irqrestore(&sched_debug_lock, flags);
+       SEQ_printf(m, "\n");
 }
 
 static const char *sched_tunable_scaling_names[] = {
@@ -338,11 +339,10 @@ static const char *sched_tunable_scaling_names[] = {
        "linear"
 };
 
-static int sched_debug_show(struct seq_file *m, void *v)
+static void sched_debug_header(struct seq_file *m)
 {
        u64 ktime, sched_clk, cpu_clk;
        unsigned long flags;
-       int cpu;
 
        local_irq_save(flags);
        ktime = ktime_to_ns(ktime_get());
@@ -384,33 +384,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #undef PN
 #undef P
 
-       SEQ_printf(m, "  .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+       SEQ_printf(m, "  .%-40s: %d (%s)\n",
+               "sysctl_sched_tunable_scaling",
                sysctl_sched_tunable_scaling,
                sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+       SEQ_printf(m, "\n");
+}
 
-       for_each_online_cpu(cpu)
-               print_cpu(m, cpu);
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+       int cpu = (unsigned long)(v - 2);
 
-       SEQ_printf(m, "\n");
+       if (cpu != -1)
+               print_cpu(m, cpu);
+       else
+               sched_debug_header(m);
 
        return 0;
 }
 
 void sysrq_sched_debug_show(void)
 {
-       sched_debug_show(NULL, NULL);
+       int cpu;
+
+       sched_debug_header(NULL);
+       for_each_online_cpu(cpu)
+               print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This iterator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+       unsigned long n = *offset;
+
+       if (n == 0)
+               return (void *) 1;
+
+       n--;
+
+       if (n > 0)
+               n = cpumask_next(n - 1, cpu_online_mask);
+       else
+               n = cpumask_first(cpu_online_mask);
+
+       *offset = n + 1;
+
+       if (n < nr_cpu_ids)
+               return (void *)(unsigned long)(n + 2);
+       return NULL;
+}
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+       (*offset)++;
+       return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+       .start = sched_debug_start,
+       .next = sched_debug_next,
+       .stop = sched_debug_stop,
+       .show = sched_debug_show,
+};
+
+static int sched_debug_release(struct inode *inode, struct file *file)
+{
+       seq_release(inode, file);
+
+       return 0;
 }
 
 static int sched_debug_open(struct inode *inode, struct file *filp)
 {
-       return single_open(filp, sched_debug_show, NULL);
+       int ret = 0;
+
+       ret = seq_open(filp, &sched_debug_sops);
+
+       return ret;
 }
 
 static const struct file_operations sched_debug_fops = {
        .open           = sched_debug_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
-       .release        = single_release,
+       .release        = sched_debug_release,
 };
 
 static int __init init_sched_debug_procfs(void)