]> Pileus Git - ~andy/linux/commitdiff
Merge branch 'linus' into cpus4096
authorIngo Molnar <mingo@elte.hu>
Thu, 18 Dec 2008 10:48:30 +0000 (11:48 +0100)
committerIngo Molnar <mingo@elte.hu>
Thu, 18 Dec 2008 10:48:30 +0000 (11:48 +0100)
376 files changed:
Documentation/controllers/cpuacct.txt [new file with mode: 0644]
Documentation/ftrace.txt
Documentation/kernel-parameters.txt
Documentation/lockstat.txt
Documentation/markers.txt
Documentation/scheduler/sched-arch.txt
Documentation/tracepoints.txt
arch/alpha/include/asm/smp.h
arch/alpha/kernel/irq.c
arch/alpha/kernel/process.c
arch/alpha/kernel/smp.c
arch/alpha/kernel/sys_dp264.c
arch/alpha/kernel/sys_titan.c
arch/arm/common/gic.c
arch/arm/kernel/irq.c
arch/arm/kernel/smp.c
arch/arm/mach-at91/at91rm9200_time.c
arch/arm/mach-at91/at91sam926x_time.c
arch/arm/mach-davinci/time.c
arch/arm/mach-imx/time.c
arch/arm/mach-ixp4xx/common.c
arch/arm/mach-msm/timer.c
arch/arm/mach-ns9xxx/time-ns9360.c
arch/arm/mach-omap1/time.c
arch/arm/mach-omap1/timer32k.c
arch/arm/mach-omap2/timer-gp.c
arch/arm/mach-pxa/time.c
arch/arm/mach-realview/core.c
arch/arm/mach-realview/localtimer.c
arch/arm/mach-sa1100/time.c
arch/arm/mach-versatile/core.c
arch/arm/oprofile/op_model_mpcore.c
arch/arm/plat-mxc/time.c
arch/arm/plat-orion/time.c
arch/avr32/kernel/time.c
arch/blackfin/kernel/time-ts.c
arch/cris/arch-v32/kernel/irq.c
arch/cris/arch-v32/kernel/smp.c
arch/cris/include/asm/smp.h
arch/ia64/Kconfig
arch/ia64/hp/sim/hpsim_irq.c
arch/ia64/include/asm/smp.h
arch/ia64/include/asm/topology.h
arch/ia64/kernel/iosapic.c
arch/ia64/kernel/irq.c
arch/ia64/kernel/msi_ia64.c
arch/ia64/kernel/smpboot.c
arch/ia64/kernel/topology.c
arch/ia64/sn/kernel/irq.c
arch/ia64/sn/kernel/msi_sn.c
arch/m32r/Kconfig
arch/m32r/kernel/smpboot.c
arch/m68knommu/platform/coldfire/pit.c
arch/mips/Kconfig
arch/mips/include/asm/irq.h
arch/mips/include/asm/mach-ip27/topology.h
arch/mips/include/asm/smp.h
arch/mips/jazz/irq.c
arch/mips/kernel/cevt-bcm1480.c
arch/mips/kernel/cevt-ds1287.c
arch/mips/kernel/cevt-gt641xx.c
arch/mips/kernel/cevt-r4k.c
arch/mips/kernel/cevt-sb1250.c
arch/mips/kernel/cevt-smtc.c
arch/mips/kernel/cevt-txx9.c
arch/mips/kernel/i8253.c
arch/mips/kernel/irq-gic.c
arch/mips/kernel/smp-cmp.c
arch/mips/kernel/smp-mt.c
arch/mips/kernel/smp.c
arch/mips/kernel/smtc.c
arch/mips/mti-malta/malta-smtc.c
arch/mips/nxp/pnx8550/common/time.c
arch/mips/pmc-sierra/yosemite/smp.c
arch/mips/sgi-ip27/ip27-smp.c
arch/mips/sgi-ip27/ip27-timer.c
arch/mips/sibyte/bcm1480/irq.c
arch/mips/sibyte/bcm1480/smp.c
arch/mips/sibyte/sb1250/irq.c
arch/mips/sibyte/sb1250/smp.c
arch/mips/sni/time.c
arch/parisc/Kconfig
arch/parisc/kernel/irq.c
arch/parisc/kernel/smp.c
arch/powerpc/Kconfig
arch/powerpc/include/asm/ftrace.h
arch/powerpc/include/asm/module.h
arch/powerpc/include/asm/topology.h
arch/powerpc/kernel/Makefile
arch/powerpc/kernel/entry_32.S
arch/powerpc/kernel/entry_64.S
arch/powerpc/kernel/ftrace.c
arch/powerpc/kernel/idle.c
arch/powerpc/kernel/irq.c
arch/powerpc/kernel/module_32.c
arch/powerpc/kernel/module_64.c
arch/powerpc/kernel/smp.c
arch/powerpc/kernel/time.c
arch/powerpc/lib/Makefile
arch/powerpc/platforms/pseries/xics.c
arch/powerpc/sysdev/mpic.c
arch/powerpc/sysdev/mpic.h
arch/s390/Kconfig
arch/s390/kernel/smp.c
arch/s390/kernel/time.c
arch/s390/kernel/topology.c
arch/sh/include/asm/smp.h
arch/sh/include/asm/topology.h
arch/sh/kernel/smp.c
arch/sh/kernel/timers/timer-broadcast.c
arch/sh/kernel/timers/timer-tmu.c
arch/sparc/include/asm/smp_32.h
arch/sparc/kernel/smp.c
arch/sparc/kernel/sparc_ksyms.c
arch/sparc64/kernel/irq.c
arch/sparc64/kernel/of_device.c
arch/sparc64/kernel/pci_msi.c
arch/sparc64/kernel/smp.c
arch/sparc64/kernel/time.c
arch/um/include/asm/system.h
arch/um/kernel/smp.c
arch/um/kernel/time.c
arch/x86/Kconfig
arch/x86/Kconfig.cpu
arch/x86/Kconfig.debug
arch/x86/include/asm/apic.h
arch/x86/include/asm/bigsmp/apic.h
arch/x86/include/asm/bigsmp/ipi.h
arch/x86/include/asm/ds.h
arch/x86/include/asm/emergency-restart.h
arch/x86/include/asm/es7000/apic.h
arch/x86/include/asm/es7000/ipi.h
arch/x86/include/asm/es7000/wakecpu.h
arch/x86/include/asm/ftrace.h
arch/x86/include/asm/genapic_32.h
arch/x86/include/asm/genapic_64.h
arch/x86/include/asm/io_apic.h
arch/x86/include/asm/ipi.h
arch/x86/include/asm/irq.h
arch/x86/include/asm/irq_vectors.h
arch/x86/include/asm/kexec.h
arch/x86/include/asm/mach-default/mach_apic.h
arch/x86/include/asm/mach-default/mach_ipi.h
arch/x86/include/asm/mach-default/mach_wakecpu.h
arch/x86/include/asm/mach-default/smpboot_hooks.h
arch/x86/include/asm/mach-generic/mach_apic.h
arch/x86/include/asm/mach-generic/mach_wakecpu.h [new file with mode: 0644]
arch/x86/include/asm/numaq/apic.h
arch/x86/include/asm/numaq/ipi.h
arch/x86/include/asm/numaq/wakecpu.h
arch/x86/include/asm/reboot.h
arch/x86/include/asm/setup.h
arch/x86/include/asm/smp.h
arch/x86/include/asm/summit/apic.h
arch/x86/include/asm/summit/ipi.h
arch/x86/include/asm/system.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/topology.h
arch/x86/include/asm/uaccess.h
arch/x86/include/asm/uaccess_32.h
arch/x86/include/asm/uaccess_64.h
arch/x86/kernel/Makefile
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/apic.c
arch/x86/kernel/apm_32.c
arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/cpu/mcheck/mce_amd_64.c
arch/x86/kernel/crash.c
arch/x86/kernel/ds.c
arch/x86/kernel/dumpstack.c [new file with mode: 0644]
arch/x86/kernel/dumpstack.h [new file with mode: 0644]
arch/x86/kernel/dumpstack_32.c
arch/x86/kernel/dumpstack_64.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/es7000_32.c
arch/x86/kernel/ftrace.c
arch/x86/kernel/genapic_64.c
arch/x86/kernel/genapic_flat_64.c
arch/x86/kernel/genx2apic_cluster.c
arch/x86/kernel/genx2apic_phys.c
arch/x86/kernel/genx2apic_uv_x.c
arch/x86/kernel/hpet.c
arch/x86/kernel/i8253.c
arch/x86/kernel/io_apic.c
arch/x86/kernel/ipi.c
arch/x86/kernel/irq.c
arch/x86/kernel/irq_32.c
arch/x86/kernel/irq_64.c
arch/x86/kernel/irqinit_32.c
arch/x86/kernel/irqinit_64.c
arch/x86/kernel/machine_kexec_32.c
arch/x86/kernel/mfgpt_32.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/numaq_32.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/reboot.c
arch/x86/kernel/relocate_kernel_32.S
arch/x86/kernel/setup.c
arch/x86/kernel/setup_percpu.c
arch/x86/kernel/smp.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/stacktrace.c
arch/x86/kernel/tlb_32.c
arch/x86/kernel/tlb_64.c
arch/x86/kernel/vmiclock_32.c
arch/x86/kernel/vsyscall_64.c
arch/x86/lguest/boot.c
arch/x86/lib/usercopy_32.c
arch/x86/lib/usercopy_64.c
arch/x86/mach-generic/bigsmp.c
arch/x86/mach-generic/default.c
arch/x86/mach-generic/es7000.c
arch/x86/mach-generic/numaq.c
arch/x86/mach-generic/probe.c
arch/x86/mach-generic/summit.c
arch/x86/mach-voyager/voyager_smp.c
arch/x86/mm/Makefile
arch/x86/mm/fault.c
arch/x86/mm/numa_64.c
arch/x86/mm/srat_64.c
arch/x86/pci/direct.c
arch/x86/pci/pci.h
arch/x86/vdso/vclock_gettime.c
arch/x86/xen/mmu.c
arch/x86/xen/smp.c
arch/x86/xen/suspend.c
arch/x86/xen/time.c
arch/x86/xen/xen-ops.h
block/Kconfig
block/blk-core.c
block/blktrace.c
block/elevator.c
drivers/base/cpu.c
drivers/base/node.c
drivers/base/topology.c
drivers/char/random.c
drivers/char/sysrq.c
drivers/clocksource/tcb_clksrc.c
drivers/md/dm.c
drivers/parisc/iosapic.c
drivers/pci/intr_remapping.c
drivers/pci/msi.c
drivers/pci/pci-sysfs.c
drivers/pci/probe.c
drivers/xen/events.c
fs/bio.c
fs/proc/stat.c
fs/seq_file.c
include/asm-generic/topology.h
include/asm-generic/vmlinux.lds.h
include/asm-m32r/smp.h
include/asm-m32r/system.h
include/linux/Kbuild
include/linux/blktrace_api.h
include/linux/clockchips.h
include/linux/compiler.h
include/linux/cpumask.h
include/linux/debug_locks.h
include/linux/ftrace.h
include/linux/ftrace_irq.h [new file with mode: 0644]
include/linux/futex.h
include/linux/hardirq.h
include/linux/interrupt.h
include/linux/irq.h
include/linux/irqnr.h
include/linux/kernel.h
include/linux/kernel_stat.h
include/linux/kexec.h
include/linux/lockdep.h
include/linux/marker.h
include/linux/msi.h
include/linux/mutex.h
include/linux/pid.h
include/linux/random.h
include/linux/rcuclassic.h
include/linux/rcupdate.h
include/linux/ring_buffer.h
include/linux/sched.h
include/linux/seq_file.h
include/linux/stacktrace.h
include/linux/topology.h
include/linux/tracepoint.h
include/linux/tty.h
include/linux/uaccess.h
include/trace/block.h [new file with mode: 0644]
include/trace/boot.h [new file with mode: 0644]
include/trace/sched.h
init/Kconfig
init/main.c
kernel/Makefile
kernel/cpu.c
kernel/cpuset.c
kernel/exit.c
kernel/extable.c
kernel/fork.c
kernel/futex.c
kernel/irq/Makefile
kernel/irq/autoprobe.c
kernel/irq/chip.c
kernel/irq/handle.c
kernel/irq/internals.h
kernel/irq/manage.c
kernel/irq/migration.c
kernel/irq/numa_migrate.c [new file with mode: 0644]
kernel/irq/proc.c
kernel/irq/spurious.c
kernel/kthread.c
kernel/lockdep.c
kernel/lockdep_proc.c
kernel/marker.c
kernel/module.c
kernel/mutex.c
kernel/notifier.c
kernel/posix-cpu-timers.c
kernel/power/disk.c
kernel/power/main.c
kernel/profile.c
kernel/rcuclassic.c
kernel/sched.c
kernel/sched_cpupri.c
kernel/sched_cpupri.h
kernel/sched_debug.c
kernel/sched_fair.c
kernel/sched_rt.c
kernel/sched_stats.h
kernel/signal.c
kernel/softlockup.c
kernel/sys.c
kernel/sysctl.c
kernel/taskstats.c
kernel/time/clockevents.c
kernel/time/tick-broadcast.c
kernel/time/tick-common.c
kernel/time/tick-sched.c
kernel/trace/Kconfig
kernel/trace/Makefile
kernel/trace/ftrace.c
kernel/trace/ring_buffer.c
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_boot.c
kernel/trace/trace_branch.c [new file with mode: 0644]
kernel/trace/trace_bts.c [new file with mode: 0644]
kernel/trace/trace_functions.c
kernel/trace/trace_functions_graph.c [new file with mode: 0644]
kernel/trace/trace_irqsoff.c
kernel/trace/trace_mmiotrace.c
kernel/trace/trace_nop.c
kernel/trace/trace_power.c [new file with mode: 0644]
kernel/trace/trace_sched_switch.c
kernel/trace/trace_sched_wakeup.c
kernel/trace/trace_selftest.c
kernel/trace/trace_stack.c
kernel/trace/trace_sysprof.c
kernel/tracepoint.c
kernel/user.c
lib/Kconfig
lib/Kconfig.debug
mm/bounce.c
mm/memory.c
mm/slub.c
samples/tracepoints/tp-samples-trace.h
samples/tracepoints/tracepoint-probe-sample.c
samples/tracepoints/tracepoint-probe-sample2.c
samples/tracepoints/tracepoint-sample.c
scripts/Makefile.build
scripts/bootgraph.pl
scripts/recordmcount.pl
scripts/trace/power.pl [new file with mode: 0644]
scripts/tracing/draw_functrace.py [new file with mode: 0644]

diff --git a/Documentation/controllers/cpuacct.txt b/Documentation/controllers/cpuacct.txt
new file mode 100644 (file)
index 0000000..bb775fb
--- /dev/null
@@ -0,0 +1,32 @@
+CPU Accounting Controller
+-------------------------
+
+The CPU accounting controller is used to group tasks using cgroups and
+account the CPU usage of these groups of tasks.
+
+The CPU accounting controller supports multi-hierarchy groups. An accounting
+group accumulates the CPU usage of all of its child groups and the tasks
+directly present in its group.
+
+Accounting groups can be created by first mounting the cgroup filesystem.
+
+# mkdir /cgroups
+# mount -t cgroup -ocpuacct none /cgroups
+
+With the above step, the initial or the parent accounting group
+becomes visible at /cgroups. At bootup, this group includes all the
+tasks in the system. /cgroups/tasks lists the tasks in this cgroup.
+/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by
+this group which is essentially the CPU time obtained by all the tasks
+in the system.
+
+New accounting groups can be created under the parent group /cgroups.
+
+# cd /cgroups
+# mkdir g1
+# echo $$ > g1
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it. CPU time consumed by this bash and its children
+can be obtained from g1/cpuacct.usage and the same is accumulated in
+/cgroups/cpuacct.usage also.
index 9cc4d685dde583464cbfbd9c7448358c2de90ef3..803b1318b13da11242a3558123d28293d3ee4378 100644 (file)
@@ -82,7 +82,7 @@ of ftrace. Here is a list of some of the key files:
                tracer is not adding more data, they will display
                the same information every time they are read.
 
-  iter_ctrl: This file lets the user control the amount of data
+  trace_options: This file lets the user control the amount of data
                that is displayed in one of the above output
                files.
 
@@ -94,10 +94,10 @@ of ftrace. Here is a list of some of the key files:
                only be recorded if the latency is greater than
                the value in this file. (in microseconds)
 
-  trace_entries: This sets or displays the number of bytes each CPU
+  buffer_size_kb: This sets or displays the number of kilobytes each CPU
                buffer can hold. The tracer buffers are the same size
                for each CPU. The displayed number is the size of the
-                CPU buffer and not total size of all buffers. The
+               CPU buffer and not total size of all buffers. The
                trace buffers are allocated in pages (blocks of memory
                that the kernel uses for allocation, usually 4 KB in size).
                If the last page allocated has room for more bytes
@@ -127,6 +127,8 @@ of ftrace. Here is a list of some of the key files:
                be traced. If a function exists in both set_ftrace_filter
                and set_ftrace_notrace, the function will _not_ be traced.
 
+  set_ftrace_pid: Have the function tracer only trace a single thread.
+
   available_filter_functions: This lists the functions that ftrace
                has processed and can trace. These are the function
                names that you can pass to "set_ftrace_filter" or
@@ -316,23 +318,23 @@ The above is mostly meaningful for kernel developers.
   The rest is the same as the 'trace' file.
 
 
-iter_ctrl
----------
+trace_options
+-------------
 
-The iter_ctrl file is used to control what gets printed in the trace
+The trace_options file is used to control what gets printed in the trace
 output. To see what is available, simply cat the file:
 
-  cat /debug/tracing/iter_ctrl
+  cat /debug/tracing/trace_options
   print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
- noblock nostacktrace nosched-tree
+ noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj
 
 To disable one of the options, echo in the option prepended with "no".
 
-  echo noprint-parent > /debug/tracing/iter_ctrl
+  echo noprint-parent > /debug/tracing/trace_options
 
 To enable an option, leave off the "no".
 
-  echo sym-offset > /debug/tracing/iter_ctrl
+  echo sym-offset > /debug/tracing/trace_options
 
 Here are the available options:
 
@@ -378,6 +380,20 @@ Here are the available options:
                When a trace is recorded, so is the stack of functions.
                This allows for back traces of trace sites.
 
+  userstacktrace - This option changes the trace.
+                  It records a stacktrace of the current userspace thread.
+
+  sym-userobj - when user stacktrace are enabled, look up which object the
+               address belongs to, and print a relative address
+               This is especially useful when ASLR is on, otherwise you don't
+               get a chance to resolve the address to object/file/line after the app is no
+               longer running
+
+               The lookup is performed when you read trace,trace_pipe,latency_trace. Example:
+
+               a.out-1623  [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
+x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
+
   sched-tree - TBD (any users??)
 
 
@@ -1059,6 +1075,83 @@ For simple one time traces, the above is sufficent. For anything else,
 a search through /proc/mounts may be needed to find where the debugfs
 file-system is mounted.
 
+
+Single thread tracing
+---------------------
+
+By writing into /debug/tracing/set_ftrace_pid you can trace a
+single thread. For example:
+
+# cat /debug/tracing/set_ftrace_pid
+no pid
+# echo 3111 > /debug/tracing/set_ftrace_pid
+# cat /debug/tracing/set_ftrace_pid
+3111
+# echo function > /debug/tracing/current_tracer
+# cat /debug/tracing/trace | head
+ # tracer: function
+ #
+ #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
+ #              | |       |          |         |
+     yum-updatesd-3111  [003]  1637.254676: finish_task_switch <-thread_return
+     yum-updatesd-3111  [003]  1637.254681: hrtimer_cancel <-schedule_hrtimeout_range
+     yum-updatesd-3111  [003]  1637.254682: hrtimer_try_to_cancel <-hrtimer_cancel
+     yum-updatesd-3111  [003]  1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel
+     yum-updatesd-3111  [003]  1637.254685: fget_light <-do_sys_poll
+     yum-updatesd-3111  [003]  1637.254686: pipe_poll <-do_sys_poll
+# echo -1 > /debug/tracing/set_ftrace_pid
+# cat /debug/tracing/trace |head
+ # tracer: function
+ #
+ #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
+ #              | |       |          |         |
+ ##### CPU 3 buffer started ####
+     yum-updatesd-3111  [003]  1701.957688: free_poll_entry <-poll_freewait
+     yum-updatesd-3111  [003]  1701.957689: remove_wait_queue <-free_poll_entry
+     yum-updatesd-3111  [003]  1701.957691: fput <-free_poll_entry
+     yum-updatesd-3111  [003]  1701.957692: audit_syscall_exit <-sysret_audit
+     yum-updatesd-3111  [003]  1701.957693: path_put <-audit_syscall_exit
+
+If you want to trace a function when executing, you could use
+something like this simple program:
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+int main (int argc, char **argv)
+{
+        if (argc < 1)
+                exit(-1);
+
+        if (fork() > 0) {
+                int fd, ffd;
+                char line[64];
+                int s;
+
+                ffd = open("/debug/tracing/current_tracer", O_WRONLY);
+                if (ffd < 0)
+                        exit(-1);
+                write(ffd, "nop", 3);
+
+                fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY);
+                s = sprintf(line, "%d\n", getpid());
+                write(fd, line, s);
+
+                write(ffd, "function", 8);
+
+                close(fd);
+                close(ffd);
+
+                execvp(argv[1], argv+1);
+        }
+
+        return 0;
+}
+
 dynamic ftrace
 --------------
 
@@ -1158,7 +1251,11 @@ These are the only wild cards which are supported.
 
   <match>*<match> will not work.
 
- # echo hrtimer_* > /debug/tracing/set_ftrace_filter
+Note: It is better to use quotes to enclose the wild cards, otherwise
+  the shell may expand the parameters into names of files in the local
+  directory.
+
+ # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter
 
 Produces:
 
@@ -1213,7 +1310,7 @@ Again, now we want to append.
  # echo sys_nanosleep > /debug/tracing/set_ftrace_filter
  # cat /debug/tracing/set_ftrace_filter
 sys_nanosleep
- # echo hrtimer_* >> /debug/tracing/set_ftrace_filter
+ # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter
  # cat /debug/tracing/set_ftrace_filter
 hrtimer_run_queues
 hrtimer_run_pending
@@ -1299,41 +1396,29 @@ trace entries
 -------------
 
 Having too much or not enough data can be troublesome in diagnosing
-an issue in the kernel. The file trace_entries is used to modify
+an issue in the kernel. The file buffer_size_kb is used to modify
 the size of the internal trace buffers. The number listed
 is the number of entries that can be recorded per CPU. To know
 the full size, multiply the number of possible CPUS with the
 number of entries.
 
- # cat /debug/tracing/trace_entries
-65620
+ # cat /debug/tracing/buffer_size_kb
+1408 (units kilobytes)
 
 Note, to modify this, you must have tracing completely disabled. To do that,
 echo "nop" into the current_tracer. If the current_tracer is not set
 to "nop", an EINVAL error will be returned.
 
  # echo nop > /debug/tracing/current_tracer
- # echo 100000 > /debug/tracing/trace_entries
- # cat /debug/tracing/trace_entries
-100045
-
-
-Notice that we echoed in 100,000 but the size is 100,045. The entries
-are held in individual pages. It allocates the number of pages it takes
-to fulfill the request. If more entries may fit on the last page
-then they will be added.
-
- # echo 1 > /debug/tracing/trace_entries
- # cat /debug/tracing/trace_entries
-85
-
-This shows us that 85 entries can fit in a single page.
+ # echo 10000 > /debug/tracing/buffer_size_kb
+ # cat /debug/tracing/buffer_size_kb
+10000 (units kilobytes)
 
 The number of pages which will be allocated is limited to a percentage
 of available memory. Allocating too much will produce an error.
 
- # echo 1000000000000 > /debug/tracing/trace_entries
+ # echo 1000000000000 > /debug/tracing/buffer_size_kb
 -bash: echo: write error: Cannot allocate memory
- # cat /debug/tracing/trace_entries
+ # cat /debug/tracing/buffer_size_kb
 85
 
index e0f346d201edb70fae654c55f6be842c4465a5ff..2919a2e919388cd572a7e99559c90e92dc544a28 100644 (file)
@@ -750,6 +750,14 @@ and is between 256 and 4096 characters. It is defined in the file
                        parameter will force ia64_sal_cache_flush to call
                        ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
+       ftrace=[tracer]
+                       [ftrace] will set and start the specified tracer
+                       as early as possible in order to facilitate early
+                       boot debugging.
+
+       ftrace_dump_on_oops
+                       [ftrace] will dump the trace buffers on oops.
+
        gamecon.map[2|3]=
                        [HW,JOY] Multisystem joystick and NES/SNES/PSX pad
                        support via parallel port (up to 5 devices per port)
index 4ba4664ce5c315d024c3bc7fd515e44ac92d4653..9cb9138f7a79bcd67bf4ac2f69741e812b5a14c2 100644 (file)
@@ -71,35 +71,50 @@ Look at the current lock statistics:
 
 # less /proc/lock_stat
 
-01 lock_stat version 0.2
+01 lock_stat version 0.3
 02 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 03                               class name    con-bounces    contentions   waittime-min   waittime-max waittime-total    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total
 04 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 05
-06               &inode->i_data.tree_lock-W:            15          21657           0.18     1093295.30 11547131054.85             58          10415           0.16          87.51        6387.60
-07               &inode->i_data.tree_lock-R:             0              0           0.00           0.00           0.00          23302         231198           0.25           8.45       98023.38
-08               --------------------------
-09                 &inode->i_data.tree_lock              0          [<ffffffff8027c08f>] add_to_page_cache+0x5f/0x190
-10
-11 ...............................................................................................................................................................................................
-12
-13                              dcache_lock:          1037           1161           0.38          45.32         774.51           6611         243371           0.15         306.48       77387.24
-14                              -----------
-15                              dcache_lock            180          [<ffffffff802c0d7e>] sys_getcwd+0x11e/0x230
-16                              dcache_lock            165          [<ffffffff802c002a>] d_alloc+0x15a/0x210
-17                              dcache_lock             33          [<ffffffff8035818d>] _atomic_dec_and_lock+0x4d/0x70
-18                              dcache_lock              1          [<ffffffff802beef8>] shrink_dcache_parent+0x18/0x130
+06                          &mm->mmap_sem-W:           233            538 18446744073708       22924.27      607243.51           1342          45806           1.71        8595.89     1180582.34
+07                          &mm->mmap_sem-R:           205            587 18446744073708       28403.36      731975.00           1940         412426           0.58      187825.45     6307502.88
+08                          ---------------
+09                            &mm->mmap_sem            487          [<ffffffff8053491f>] do_page_fault+0x466/0x928
+10                            &mm->mmap_sem            179          [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
+11                            &mm->mmap_sem            279          [<ffffffff80210a57>] sys_mmap+0x75/0xce
+12                            &mm->mmap_sem             76          [<ffffffff802a490b>] sys_munmap+0x32/0x59
+13                          ---------------
+14                            &mm->mmap_sem            270          [<ffffffff80210a57>] sys_mmap+0x75/0xce
+15                            &mm->mmap_sem            431          [<ffffffff8053491f>] do_page_fault+0x466/0x928
+16                            &mm->mmap_sem            138          [<ffffffff802a490b>] sys_munmap+0x32/0x59
+17                            &mm->mmap_sem            145          [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
+18
+19 ...............................................................................................................................................................................................
+20
+21                              dcache_lock:           621            623           0.52         118.26        1053.02           6745          91930           0.29         316.29      118423.41
+22                              -----------
+23                              dcache_lock            179          [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
+24                              dcache_lock            113          [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
+25                              dcache_lock             99          [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
+26                              dcache_lock            104          [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
+27                              -----------
+28                              dcache_lock            192          [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
+29                              dcache_lock             98          [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
+30                              dcache_lock             72          [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
+31                              dcache_lock            112          [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
 
 This excerpt shows the first two lock class statistics. Line 01 shows the
 output version - each time the format changes this will be updated. Line 02-04
-show the header with column descriptions. Lines 05-10 and 13-18 show the actual
+show the header with column descriptions. Lines 05-18 and 20-31 show the actual
 statistics. These statistics come in two parts; the actual stats separated by a
-short separator (line 08, 14) from the contention points.
+short separator (line 08, 13) from the contention points.
 
-The first lock (05-10) is a read/write lock, and shows two lines above the
+The first lock (05-18) is a read/write lock, and shows two lines above the
 short separator. The contention points don't match the column descriptors,
-they have two: contentions and [<IP>] symbol.
+they have two: contentions and [<IP>] symbol. The second set of contention
+points are the points we're contending with.
 
+The integer part of the time values is in us.
 
 View the top contending locks:
 
index 089f6138fcd94249a6444ca3a932a50c263098e1..d2b3d0e91b26d21423d8202668ac8d512558e7dd 100644 (file)
@@ -51,11 +51,16 @@ to call) for the specific marker through marker_probe_register() and can be
 activated by calling marker_arm(). Marker deactivation can be done by calling
 marker_disarm() as many times as marker_arm() has been called. Removing a probe
 is done through marker_probe_unregister(); it will disarm the probe.
-marker_synchronize_unregister() must be called before the end of the module exit
-function to make sure there is no caller left using the probe. This, and the
-fact that preemption is disabled around the probe call, make sure that probe
-removal and module unload are safe. See the "Probe example" section below for a
-sample probe module.
+
+marker_synchronize_unregister() must be called between probe unregistration and
+the first occurrence of
+- the end of module exit function,
+  to make sure there is no caller left using the probe;
+- the free of any resource used by the probes,
+  to make sure the probes wont be accessing invalid data.
+This, and the fact that preemption is disabled around the probe call, make sure
+that probe removal and module unload are safe. See the "Probe example" section
+below for a sample probe module.
 
 The marker mechanism supports inserting multiple instances of the same marker.
 Markers can be put in inline functions, inlined static functions, and
@@ -70,6 +75,20 @@ a printk warning which identifies the inconsistency:
 
 "Format mismatch for probe probe_name (format), marker (format)"
 
+Another way to use markers is to simply define the marker without generating any
+function call to actually call into the marker. This is useful in combination
+with tracepoint probes in a scheme like this :
+
+void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk);
+
+DEFINE_MARKER_TP(marker_eventname, tracepoint_name, probe_tracepoint_name,
+       "arg1 %u pid %d");
+
+notrace void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk)
+{
+       struct marker *marker = &GET_MARKER(kernel_irq_entry);
+       /* write data to trace buffers ... */
+}
 
 * Probe / marker example
 
index 941615a9769b91ea0cee31723023c600645db67d..d43dbcbd163b4d45e3479614149c780b5e12778f 100644 (file)
@@ -8,7 +8,7 @@ Context switch
 By default, the switch_to arch function is called with the runqueue
 locked. This is usually not a problem unless switch_to may need to
 take the runqueue lock. This is usually due to a wake up operation in
-the context switch. See include/asm-ia64/system.h for an example.
+the context switch. See arch/ia64/include/asm/system.h for an example.
 
 To request the scheduler call switch_to with the runqueue unlocked,
 you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
@@ -23,7 +23,7 @@ disabled. Interrupts may be enabled over the call if it is likely to
 introduce a significant interrupt latency by adding the line
 `#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for
 unlocked context switches. This define also implies
-`__ARCH_WANT_UNLOCKED_CTXSW`. See include/asm-arm/system.h for an
+`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an
 example.
 
 
index 5d354e16749447c667934ecd4c8f4fe5ee3da36c..6f0a044f5b5e51e27f2bdae2cd6d68123eb98bd7 100644 (file)
@@ -3,28 +3,30 @@
                            Mathieu Desnoyers
 
 
-This document introduces Linux Kernel Tracepoints and their use. It provides
-examples of how to insert tracepoints in the kernel and connect probe functions
-to them and provides some examples of probe functions.
+This document introduces Linux Kernel Tracepoints and their use. It
+provides examples of how to insert tracepoints in the kernel and
+connect probe functions to them and provides some examples of probe
+functions.
 
 
 * Purpose of tracepoints
 
-A tracepoint placed in code provides a hook to call a function (probe) that you
-can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or
-"off" (no probe is attached). When a tracepoint is "off" it has no effect,
-except for adding a tiny time penalty (checking a condition for a branch) and
-space penalty (adding a few bytes for the function call at the end of the
-instrumented function and adds a data structure in a separate section).  When a
-tracepoint is "on", the function you provide is called each time the tracepoint
-is executed, in the execution context of the caller. When the function provided
-ends its execution, it returns to the caller (continuing from the tracepoint
-site).
+A tracepoint placed in code provides a hook to call a function (probe)
+that you can provide at runtime. A tracepoint can be "on" (a probe is
+connected to it) or "off" (no probe is attached). When a tracepoint is
+"off" it has no effect, except for adding a tiny time penalty
+(checking a condition for a branch) and space penalty (adding a few
+bytes for the function call at the end of the instrumented function
+and adds a data structure in a separate section).  When a tracepoint
+is "on", the function you provide is called each time the tracepoint
+is executed, in the execution context of the caller. When the function
+provided ends its execution, it returns to the caller (continuing from
+the tracepoint site).
 
 You can put tracepoints at important locations in the code. They are
 lightweight hooks that can pass an arbitrary number of parameters,
-which prototypes are described in a tracepoint declaration placed in a header
-file.
+which prototypes are described in a tracepoint declaration placed in a
+header file.
 
 They can be used for tracing and performance accounting.
 
@@ -42,14 +44,16 @@ In include/trace/subsys.h :
 
 #include <linux/tracepoint.h>
 
-DEFINE_TRACE(subsys_eventname,
-       TPPTOTO(int firstarg, struct task_struct *p),
+DECLARE_TRACE(subsys_eventname,
+       TPPROTO(int firstarg, struct task_struct *p),
        TPARGS(firstarg, p));
 
 In subsys/file.c (where the tracing statement must be added) :
 
 #include <trace/subsys.h>
 
+DEFINE_TRACE(subsys_eventname);
+
 void somefct(void)
 {
        ...
@@ -61,31 +65,41 @@ Where :
 - subsys_eventname is an identifier unique to your event
     - subsys is the name of your subsystem.
     - eventname is the name of the event to trace.
-- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the function
-  called by this tracepoint.
-- TPARGS(firstarg, p) are the parameters names, same as found in the prototype.
 
-Connecting a function (probe) to a tracepoint is done by providing a probe
-(function to call) for the specific tracepoint through
-register_trace_subsys_eventname().  Removing a probe is done through
-unregister_trace_subsys_eventname(); it will remove the probe sure there is no
-caller left using the probe when it returns. Probe removal is preempt-safe
-because preemption is disabled around the probe call. See the "Probe example"
-section below for a sample probe module.
-
-The tracepoint mechanism supports inserting multiple instances of the same
-tracepoint, but a single definition must be made of a given tracepoint name over
-all the kernel to make sure no type conflict will occur. Name mangling of the
-tracepoints is done using the prototypes to make sure typing is correct.
-Verification of probe type correctness is done at the registration site by the
-compiler. Tracepoints can be put in inline functions, inlined static functions,
-and unrolled loops as well as regular functions.
-
-The naming scheme "subsys_event" is suggested here as a convention intended
-to limit collisions. Tracepoint names are global to the kernel: they are
-considered as being the same whether they are in the core kernel image or in
-modules.
+- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the
+  function called by this tracepoint.
 
+- TPARGS(firstarg, p) are the parameters names, same as found in the
+  prototype.
+
+Connecting a function (probe) to a tracepoint is done by providing a
+probe (function to call) for the specific tracepoint through
+register_trace_subsys_eventname().  Removing a probe is done through
+unregister_trace_subsys_eventname(); it will remove the probe.
+
+tracepoint_synchronize_unregister() must be called before the end of
+the module exit function to make sure there is no caller left using
+the probe. This, and the fact that preemption is disabled around the
+probe call, make sure that probe removal and module unload are safe.
+See the "Probe example" section below for a sample probe module.
+
+The tracepoint mechanism supports inserting multiple instances of the
+same tracepoint, but a single definition must be made of a given
+tracepoint name over all the kernel to make sure no type conflict will
+occur. Name mangling of the tracepoints is done using the prototypes
+to make sure typing is correct. Verification of probe type correctness
+is done at the registration site by the compiler. Tracepoints can be
+put in inline functions, inlined static functions, and unrolled loops
+as well as regular functions.
+
+The naming scheme "subsys_event" is suggested here as a convention
+intended to limit collisions. Tracepoint names are global to the
+kernel: they are considered as being the same whether they are in the
+core kernel image or in modules.
+
+If the tracepoint has to be used in kernel modules, an
+EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be
+used to export the defined tracepoints.
 
 * Probe / tracepoint example
 
index 544c69af8168b27baf40a32b34f334bad1d106a4..547e90951cec07a92bd31d6b9cf37f293612383e 100644 (file)
@@ -45,7 +45,6 @@ extern struct cpuinfo_alpha cpu_data[NR_CPUS];
 #define raw_smp_processor_id() (current_thread_info()->cpu)
 
 extern int smp_num_cpus;
-#define cpu_possible_map       cpu_present_map
 
 extern void arch_send_call_function_single_ipi(int cpu);
 extern void arch_send_call_function_ipi(cpumask_t mask);
index c626a821cdcb9eacc089121b52d7afa6f3f5fe4f..d0f1620007f7de83d72f76257fb778791518d587 100644 (file)
@@ -55,7 +55,7 @@ int irq_select_affinity(unsigned int irq)
        last_cpu = cpu;
 
        irq_desc[irq].affinity = cpumask_of_cpu(cpu);
-       irq_desc[irq].chip->set_affinity(irq, cpumask_of_cpu(cpu));
+       irq_desc[irq].chip->set_affinity(irq, cpumask_of(cpu));
        return 0;
 }
 #endif /* CONFIG_SMP */
index 351407e07e71a715a654936b744e498f2cbfb2e0..f238370c907d9f4c86ecbed9da7332ccf08908c3 100644 (file)
@@ -94,6 +94,7 @@ common_shutdown_1(void *generic_ptr)
                flags |= 0x00040000UL; /* "remain halted" */
                *pflags = flags;
                cpu_clear(cpuid, cpu_present_map);
+               cpu_clear(cpuid, cpu_possible_map);
                halt();
        }
 #endif
@@ -120,6 +121,7 @@ common_shutdown_1(void *generic_ptr)
 #ifdef CONFIG_SMP
        /* Wait for the secondaries to halt. */
        cpu_clear(boot_cpuid, cpu_present_map);
+       cpu_clear(boot_cpuid, cpu_possible_map);
        while (cpus_weight(cpu_present_map))
                barrier();
 #endif
index cf7da10097bb29d5f8fb69aaefc9107a3419783a..d953e510f68d71ff417206e98c76dc9c13be45ad 100644 (file)
@@ -70,11 +70,6 @@ enum ipi_message_type {
 /* Set to a secondary's cpuid when it comes online.  */
 static int smp_secondary_alive __devinitdata = 0;
 
-/* Which cpus ids came online.  */
-cpumask_t cpu_online_map;
-
-EXPORT_SYMBOL(cpu_online_map);
-
 int smp_num_probed;            /* Internal processor count */
 int smp_num_cpus = 1;          /* Number that came online.  */
 EXPORT_SYMBOL(smp_num_cpus);
@@ -440,6 +435,7 @@ setup_smp(void)
                                ((char *)cpubase + i*hwrpb->processor_size);
                        if ((cpu->flags & 0x1cc) == 0x1cc) {
                                smp_num_probed++;
+                               cpu_set(i, cpu_possible_map);
                                cpu_set(i, cpu_present_map);
                                cpu->pal_revision = boot_cpu_palrev;
                        }
@@ -473,6 +469,7 @@ smp_prepare_cpus(unsigned int max_cpus)
 
        /* Nothing to do on a UP box, or when told not to.  */
        if (smp_num_probed == 1 || max_cpus == 0) {
+               cpu_possible_map = cpumask_of_cpu(boot_cpuid);
                cpu_present_map = cpumask_of_cpu(boot_cpuid);
                printk(KERN_INFO "SMP mode deactivated.\n");
                return;
index c71b0fd7a61f8748b857463c03854006ec5b91a1..ab44c164d9d47e95ae80d84bd96fd39b9cf927a9 100644 (file)
@@ -177,19 +177,19 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
 }
 
 static void
-dp264_set_affinity(unsigned int irq, cpumask_t affinity)
+dp264_set_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
        spin_lock(&dp264_irq_lock);
-       cpu_set_irq_affinity(irq, affinity);
+       cpu_set_irq_affinity(irq, *affinity);
        tsunami_update_irq_hw(cached_irq_mask);
        spin_unlock(&dp264_irq_lock);
 }
 
 static void
-clipper_set_affinity(unsigned int irq, cpumask_t affinity)
+clipper_set_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
        spin_lock(&dp264_irq_lock);
-       cpu_set_irq_affinity(irq - 16, affinity);
+       cpu_set_irq_affinity(irq - 16, *affinity);
        tsunami_update_irq_hw(cached_irq_mask);
        spin_unlock(&dp264_irq_lock);
 }
index 52c91ccc164866d3c08ed24ed6984d7b53e35c65..27f840a4ad3d7ec0949ac92a55ef5c7c1a3b9280 100644 (file)
@@ -158,10 +158,10 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
 }
 
 static void
-titan_set_irq_affinity(unsigned int irq, cpumask_t affinity)
+titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
        spin_lock(&titan_irq_lock);
-       titan_cpu_set_irq_affinity(irq - 16, affinity);
+       titan_cpu_set_irq_affinity(irq - 16, *affinity);
        titan_update_irq_hw(titan_cached_irq_mask);
        spin_unlock(&titan_irq_lock);
 }
index 7fc9860a97d79b5dcf73cb97c6d3de9ca9530af3..c6884ba1d5ed0b91122f45072f97c99cda52a23d 100644 (file)
@@ -109,11 +109,11 @@ static void gic_unmask_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void gic_set_cpu(unsigned int irq, cpumask_t mask_val)
+static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
 {
        void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3);
        unsigned int shift = (irq % 4) * 8;
-       unsigned int cpu = first_cpu(mask_val);
+       unsigned int cpu = cpumask_first(mask_val);
        u32 val;
 
        spin_lock(&irq_controller_lock);
index 2f3eb795fa6e99a4fd1cde9911ab99070f233aad..7141cee1fab71e8b131d03e0d8e6dbfc043dbb85 100644 (file)
@@ -174,7 +174,7 @@ static void route_irq(struct irq_desc *desc, unsigned int irq, unsigned int cpu)
        pr_debug("IRQ%u: moving from cpu%u to cpu%u\n", irq, desc->cpu, cpu);
 
        spin_lock_irq(&desc->lock);
-       desc->chip->set_affinity(irq, cpumask_of_cpu(cpu));
+       desc->chip->set_affinity(irq, cpumask_of(cpu));
        spin_unlock_irq(&desc->lock);
 }
 
index e42a749a56dd5c85abc823e2666ff7b4683c2da0..bd905c0a73651f85d54fffb25146b50d7d267368 100644 (file)
 #include <asm/tlbflush.h>
 #include <asm/ptrace.h>
 
-/*
- * bitmask of present and online CPUs.
- * The present bitmask indicates that the CPU is physically present.
- * The online bitmask indicates that the CPU is up and running.
- */
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
-cpumask_t cpu_online_map;
-EXPORT_SYMBOL(cpu_online_map);
-
 /*
  * as from 2.5, kernels no longer have an init_tasks structure
  * so we need some other way of telling a new secondary core
index a72e798a2a40eede144f22d85eb5204f0114ea18..72f51d39202c7a22d49a393657255d283b339889 100644 (file)
@@ -169,7 +169,6 @@ static struct clock_event_device clkevt = {
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .shift          = 32,
        .rating         = 150,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = clkevt32k_next_event,
        .set_mode       = clkevt32k_mode,
 };
@@ -197,7 +196,7 @@ void __init at91rm9200_timer_init(void)
        clkevt.mult = div_sc(AT91_SLOW_CLOCK, NSEC_PER_SEC, clkevt.shift);
        clkevt.max_delta_ns = clockevent_delta2ns(AT91_ST_ALMV, &clkevt);
        clkevt.min_delta_ns = clockevent_delta2ns(2, &clkevt) + 1;
-       clkevt.cpumask = cpumask_of_cpu(0);
+       clkevt.cpumask = cpumask_of(0);
        clockevents_register_device(&clkevt);
 
        /* register clocksource */
index 122fd77ed580399b632451b114b66a9458c97239..b63e1d5f1badc60ad3573febf6d9ff557f770c64 100644 (file)
@@ -91,7 +91,6 @@ static struct clock_event_device pit_clkevt = {
        .features       = CLOCK_EVT_FEAT_PERIODIC,
        .shift          = 32,
        .rating         = 100,
-       .cpumask        = CPU_MASK_CPU0,
        .set_mode       = pit_clkevt_mode,
 };
 
@@ -173,6 +172,7 @@ static void __init at91sam926x_pit_init(void)
 
        /* Set up and register clockevents */
        pit_clkevt.mult = div_sc(pit_rate, NSEC_PER_SEC, pit_clkevt.shift);
+       pit_clkevt.cpumask = cpumask_of(0);
        clockevents_register_device(&pit_clkevt);
 }
 
index 3b9a296b5c4ba43ee703b7ccafce24fdd6067d44..f8bcd29d17a624a6a1d762b0b6ec4c82ffcf59a2 100644 (file)
@@ -322,7 +322,7 @@ static void __init davinci_timer_init(void)
        clockevent_davinci.min_delta_ns =
                clockevent_delta2ns(1, &clockevent_davinci);
 
-       clockevent_davinci.cpumask = cpumask_of_cpu(0);
+       clockevent_davinci.cpumask = cpumask_of(0);
        clockevents_register_device(&clockevent_davinci);
 }
 
index a11765f5f23b111c1f21a1a4edb3f498462c1a16..aff0ebcfa847a5422051c866e4e48f34109b7f76 100644 (file)
@@ -184,7 +184,7 @@ static int __init imx_clockevent_init(unsigned long rate)
        clockevent_imx.min_delta_ns =
                clockevent_delta2ns(0xf, &clockevent_imx);
 
-       clockevent_imx.cpumask = cpumask_of_cpu(0);
+       clockevent_imx.cpumask = cpumask_of(0);
 
        clockevents_register_device(&clockevent_imx);
 
index 7766f469456be0025ea8bc353dd45ad61c530e31..f4656d2ac8a85c3bbb899d43ec7f6b6b96b74c66 100644 (file)
@@ -487,7 +487,7 @@ static int __init ixp4xx_clockevent_init(void)
                clockevent_delta2ns(0xfffffffe, &clockevent_ixp4xx);
        clockevent_ixp4xx.min_delta_ns =
                clockevent_delta2ns(0xf, &clockevent_ixp4xx);
-       clockevent_ixp4xx.cpumask = cpumask_of_cpu(0);
+       clockevent_ixp4xx.cpumask = cpumask_of(0);
 
        clockevents_register_device(&clockevent_ixp4xx);
        return 0;
index 345a14cb73c38debc9e6bf55caac2b56700dea1c..444d9c0f5ca68c30ce2a50af7bdc83ef8d0785b0 100644 (file)
@@ -182,7 +182,7 @@ static void __init msm_timer_init(void)
                        clockevent_delta2ns(0xf0000000 >> clock->shift, ce);
                /* 4 gets rounded down to 3 */
                ce->min_delta_ns = clockevent_delta2ns(4, ce);
-               ce->cpumask = cpumask_of_cpu(0);
+               ce->cpumask = cpumask_of(0);
 
                cs->mult = clocksource_hz2mult(clock->freq, cs->shift);
                res = clocksource_register(cs);
index a63424d083d95186f7b5aabbe5c55cbc4b1302ad..41df697217692e3562668af05cf2ccf9153670f7 100644 (file)
@@ -173,7 +173,7 @@ static void __init ns9360_timer_init(void)
        ns9360_clockevent_device.min_delta_ns =
                clockevent_delta2ns(1, &ns9360_clockevent_device);
 
-       ns9360_clockevent_device.cpumask = cpumask_of_cpu(0);
+       ns9360_clockevent_device.cpumask = cpumask_of(0);
        clockevents_register_device(&ns9360_clockevent_device);
 
        setup_irq(IRQ_NS9360_TIMER0 + TIMER_CLOCKEVENT,
index 2cf7e32bd293796e216a81794c2ecc785bba1297..495a32c287b49c01944f9217e94f137f7cf69d6f 100644 (file)
@@ -173,7 +173,7 @@ static __init void omap_init_mpu_timer(unsigned long rate)
        clockevent_mpu_timer1.min_delta_ns =
                clockevent_delta2ns(1, &clockevent_mpu_timer1);
 
-       clockevent_mpu_timer1.cpumask = cpumask_of_cpu(0);
+       clockevent_mpu_timer1.cpumask = cpumask_of(0);
        clockevents_register_device(&clockevent_mpu_timer1);
 }
 
index 705367ece1741b539f6838454d9cf2d14aa61bad..fd3f7396e16249f8a447876fe93bbc1f36bc049a 100644 (file)
@@ -187,7 +187,7 @@ static __init void omap_init_32k_timer(void)
        clockevent_32k_timer.min_delta_ns =
                clockevent_delta2ns(1, &clockevent_32k_timer);
 
-       clockevent_32k_timer.cpumask = cpumask_of_cpu(0);
+       clockevent_32k_timer.cpumask = cpumask_of(0);
        clockevents_register_device(&clockevent_32k_timer);
 }
 
index 589393bedade0477dbecb909d5fd82501a58203c..ae6036300f603521ce2b59267fee71d5f668e589 100644 (file)
@@ -120,7 +120,7 @@ static void __init omap2_gp_clockevent_init(void)
        clockevent_gpt.min_delta_ns =
                clockevent_delta2ns(1, &clockevent_gpt);
 
-       clockevent_gpt.cpumask = cpumask_of_cpu(0);
+       clockevent_gpt.cpumask = cpumask_of(0);
        clockevents_register_device(&clockevent_gpt);
 }
 
index f8a9a62959e5bfe9cb426e75560efb17a08f7194..bf3c9a4aad509fc8fde9a6858f38e65e14bf60b9 100644 (file)
@@ -122,7 +122,6 @@ static struct clock_event_device ckevt_pxa_osmr0 = {
        .features       = CLOCK_EVT_FEAT_ONESHOT,
        .shift          = 32,
        .rating         = 200,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = pxa_osmr0_set_next_event,
        .set_mode       = pxa_osmr0_set_mode,
 };
@@ -170,6 +169,7 @@ static void __init pxa_timer_init(void)
                clockevent_delta2ns(0x7fffffff, &ckevt_pxa_osmr0);
        ckevt_pxa_osmr0.min_delta_ns =
                clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_pxa_osmr0) + 1;
+       ckevt_pxa_osmr0.cpumask = cpumask_of(0);
 
        cksrc_pxa_oscr0.mult =
                clocksource_hz2mult(clock_tick_rate, cksrc_pxa_oscr0.shift);
index 2f04d54711e7b86ad204a0916ba012e5234d91fd..b07cb9b7adb15d5f4a6580ecc9315a401184ae25 100644 (file)
@@ -511,7 +511,7 @@ static struct clock_event_device timer0_clockevent =         {
        .set_mode       = timer_set_mode,
        .set_next_event = timer_set_next_event,
        .rating         = 300,
-       .cpumask        = CPU_MASK_ALL,
+       .cpumask        = cpu_all_mask,
 };
 
 static void __init realview_clockevents_init(unsigned int timer_irq)
index 44d178cd573342b5cc3578353c18b724e9b4f1e8..504961ef343c2ef0f0e6be6bc052b997d90405fd 100644 (file)
@@ -161,7 +161,7 @@ void __cpuinit local_timer_setup(unsigned int cpu)
        clk->set_mode           = local_timer_set_mode;
        clk->set_next_event     = local_timer_set_next_event;
        clk->irq                = IRQ_LOCALTIMER;
-       clk->cpumask            = cpumask_of_cpu(cpu);
+       clk->cpumask            = cpumask_of(cpu);
        clk->shift              = 20;
        clk->mult               = div_sc(mpcore_timer_rate, NSEC_PER_SEC, clk->shift);
        clk->max_delta_ns       = clockevent_delta2ns(0xffffffff, clk);
@@ -199,7 +199,7 @@ void __cpuinit local_timer_setup(unsigned int cpu)
        clk->rating             = 200;
        clk->set_mode           = dummy_timer_set_mode;
        clk->broadcast          = smp_timer_broadcast;
-       clk->cpumask            = cpumask_of_cpu(cpu);
+       clk->cpumask            = cpumask_of(cpu);
 
        clockevents_register_device(clk);
 }
index 24c0a4bae850ec273854557d50891bfe6a2ee46b..1cac4ac0b4b89e7af0dc56116817d163744a542c 100644 (file)
@@ -73,7 +73,6 @@ static struct clock_event_device ckevt_sa1100_osmr0 = {
        .features       = CLOCK_EVT_FEAT_ONESHOT,
        .shift          = 32,
        .rating         = 200,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = sa1100_osmr0_set_next_event,
        .set_mode       = sa1100_osmr0_set_mode,
 };
@@ -110,6 +109,7 @@ static void __init sa1100_timer_init(void)
                clockevent_delta2ns(0x7fffffff, &ckevt_sa1100_osmr0);
        ckevt_sa1100_osmr0.min_delta_ns =
                clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_sa1100_osmr0) + 1;
+       ckevt_sa1100_osmr0.cpumask = cpumask_of(0);
 
        cksrc_sa1100_oscr.mult =
                clocksource_hz2mult(CLOCK_TICK_RATE, cksrc_sa1100_oscr.shift);
index 565e0ba0d67e082b9481f6008d065302a5b0042f..a3f1933434e261d604c20f665a8b2b6dde450b0b 100644 (file)
@@ -965,7 +965,7 @@ static void __init versatile_timer_init(void)
        timer0_clockevent.min_delta_ns =
                clockevent_delta2ns(0xf, &timer0_clockevent);
 
-       timer0_clockevent.cpumask = cpumask_of_cpu(0);
+       timer0_clockevent.cpumask = cpumask_of(0);
        clockevents_register_device(&timer0_clockevent);
 }
 
index 4de366e8b4c5f7fdaa900343e218c6e2cd49fcc1..6d6bd5899240862771935c47f32007fb452aec48 100644 (file)
@@ -260,10 +260,10 @@ static void em_stop(void)
 static void em_route_irq(int irq, unsigned int cpu)
 {
        struct irq_desc *desc = irq_desc + irq;
-       cpumask_t mask = cpumask_of_cpu(cpu);
+       const struct cpumask *mask = cpumask_of(cpu);
 
        spin_lock_irq(&desc->lock);
-       desc->affinity = mask;
+       desc->affinity = *mask;
        desc->chip->set_affinity(irq, mask);
        spin_unlock_irq(&desc->lock);
 }
index fd28f5194f71ad3bd1a09b05545c2d01e8ea17c6..758a1293bcfaaefa2be4191f030eb74b8dcc0ce8 100644 (file)
@@ -190,7 +190,7 @@ static int __init mxc_clockevent_init(void)
        clockevent_mxc.min_delta_ns =
                        clockevent_delta2ns(0xff, &clockevent_mxc);
 
-       clockevent_mxc.cpumask = cpumask_of_cpu(0);
+       clockevent_mxc.cpumask = cpumask_of(0);
 
        clockevents_register_device(&clockevent_mxc);
 
index 544d6b327f3a1e3f383a343ba6571316b67b3886..6fa2923e6dca388c4e445f5ccda9b327bfd1e58a 100644 (file)
@@ -149,7 +149,6 @@ static struct clock_event_device orion_clkevt = {
        .features       = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC,
        .shift          = 32,
        .rating         = 300,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = orion_clkevt_next_event,
        .set_mode       = orion_clkevt_mode,
 };
@@ -199,5 +198,6 @@ void __init orion_time_init(unsigned int irq, unsigned int tclk)
        orion_clkevt.mult = div_sc(tclk, NSEC_PER_SEC, orion_clkevt.shift);
        orion_clkevt.max_delta_ns = clockevent_delta2ns(0xfffffffe, &orion_clkevt);
        orion_clkevt.min_delta_ns = clockevent_delta2ns(1, &orion_clkevt);
+       orion_clkevt.cpumask = cpumask_of(0);
        clockevents_register_device(&orion_clkevt);
 }
index 283481d74a5bf454e2b6966810df4133d70d5cfc..0ff46bf873b034192ec557511febfe6586930ece 100644 (file)
@@ -106,7 +106,6 @@ static struct clock_event_device comparator = {
        .features       = CLOCK_EVT_FEAT_ONESHOT,
        .shift          = 16,
        .rating         = 50,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = comparator_next_event,
        .set_mode       = comparator_mode,
 };
@@ -134,6 +133,7 @@ void __init time_init(void)
        comparator.mult = div_sc(counter_hz, NSEC_PER_SEC, comparator.shift);
        comparator.max_delta_ns = clockevent_delta2ns((u32)~0, &comparator);
        comparator.min_delta_ns = clockevent_delta2ns(50, &comparator) + 1;
+       comparator.cpumask = cpumask_of(0);
 
        sysreg_write(COMPARE, 0);
        timer_irqaction.dev_id = &comparator;
index e887efc86c29e8215d544dcba262128b65343008..0ed2badfd74613b3753c901d44f7da6cf9ebe8df 100644 (file)
@@ -162,7 +162,6 @@ static struct clock_event_device clockevent_bfin = {
        .name           = "bfin_core_timer",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .shift          = 32,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = bfin_timer_set_next_event,
        .set_mode       = bfin_timer_set_mode,
 };
@@ -193,6 +192,7 @@ static int __init bfin_clockevent_init(void)
        clockevent_bfin.mult = div_sc(timer_clk, NSEC_PER_SEC, clockevent_bfin.shift);
        clockevent_bfin.max_delta_ns = clockevent_delta2ns(-1, &clockevent_bfin);
        clockevent_bfin.min_delta_ns = clockevent_delta2ns(100, &clockevent_bfin);
+       clockevent_bfin.cpumask = cpumask_of(0);
        clockevents_register_device(&clockevent_bfin);
 
        return 0;
index 173c141ac9ba85fa04884f1ec6ab8cbf5aba14c8..295131fee710ebe97758fed49f71fc5c0a3b906c 100644 (file)
@@ -325,11 +325,11 @@ static void end_crisv32_irq(unsigned int irq)
 {
 }
 
-void set_affinity_crisv32_irq(unsigned int irq, cpumask_t dest)
+void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
 {
        unsigned long flags;
        spin_lock_irqsave(&irq_lock, flags);
-       irq_allocations[irq - FIRST_IRQ].mask = dest;
+       irq_allocations[irq - FIRST_IRQ].mask = *dest;
        spin_unlock_irqrestore(&irq_lock, flags);
 }
 
index 52e16c6436f9bac0733022622fa252d217b40369..9dac17334640a9cf3eda77c0c0a22b53e817f0a1 100644 (file)
 spinlock_t cris_atomic_locks[] = { [0 ... LOCK_COUNT - 1] = SPIN_LOCK_UNLOCKED};
 
 /* CPU masks */
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_online_map);
 cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
 EXPORT_SYMBOL(phys_cpu_present_map);
 
 /* Variables used during SMP boot */
index dba33aba3e95583e815becee7cfecca797d64b9b..c615a06dd757aa46ca2e35b4d141b1834fb4bbc2 100644 (file)
@@ -4,7 +4,6 @@
 #include <linux/cpumask.h>
 
 extern cpumask_t phys_cpu_present_map;
-extern cpumask_t cpu_possible_map;
 
 #define raw_smp_processor_id() (current_thread_info()->cpu)
 
index 6bd91ed7cd03a8366d35745a499ea89c9daf93a6..7fa8f615ba6e7edaec647959f8d4623fcab4bad5 100644 (file)
@@ -99,7 +99,7 @@ config GENERIC_IOMAP
        bool
        default y
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
        bool
        default y
 
index c2f58ff364e7144c43de3e49958bfac7b3b3b0ad..cc0a3182db3c0158d246f658065a0f0f5d426a5f 100644 (file)
@@ -22,7 +22,7 @@ hpsim_irq_noop (unsigned int irq)
 }
 
 static void
-hpsim_set_affinity_noop (unsigned int a, cpumask_t b)
+hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b)
 {
 }
 
index 12d96e0cd5134d9d95e01e6714b5b75179dcfcf7..21c402365d0ea7f86bf63cbf0009bb7c7e266642 100644 (file)
@@ -57,7 +57,6 @@ extern struct smp_boot_data {
 
 extern char no_int_routing __devinitdata;
 
-extern cpumask_t cpu_online_map;
 extern cpumask_t cpu_core_map[NR_CPUS];
 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
 extern int smp_num_siblings;
index 35bcb641c9e5731abb025c879d0acbb65a957ad6..a3cc9f65f954e7231a6fe0cba79ee5a0a22f5dc1 100644 (file)
@@ -55,7 +55,6 @@
 void build_cpu_to_node_map(void);
 
 #define SD_CPU_INIT (struct sched_domain) {            \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
@@ -80,7 +79,6 @@ void build_cpu_to_node_map(void);
 
 /* sched_domains SD_NODE_INIT for IA64 NUMA machines */
 #define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
index 5c4674ae8aea0279f1abba8cff8bdade994e0131..c8adecd5b4164a5e48264fc252c5dc8b43e6d9fd 100644 (file)
@@ -330,25 +330,25 @@ unmask_irq (unsigned int irq)
 
 
 static void
-iosapic_set_affinity (unsigned int irq, cpumask_t mask)
+iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 #ifdef CONFIG_SMP
        u32 high32, low32;
-       int dest, rte_index;
+       int cpu, dest, rte_index;
        int redir = (irq & IA64_IRQ_REDIRECTED) ? 1 : 0;
        struct iosapic_rte_info *rte;
        struct iosapic *iosapic;
 
        irq &= (~IA64_IRQ_REDIRECTED);
 
-       cpus_and(mask, mask, cpu_online_map);
-       if (cpus_empty(mask))
+       cpu = cpumask_first_and(cpu_online_mask, mask);
+       if (cpu >= nr_cpu_ids)
                return;
 
-       if (irq_prepare_move(irq, first_cpu(mask)))
+       if (irq_prepare_move(irq, cpu))
                return;
 
-       dest = cpu_physical_id(first_cpu(mask));
+       dest = cpu_physical_id(cpu);
 
        if (!iosapic_intr_info[irq].count)
                return;                 /* not an IOSAPIC interrupt */
index 7fd18f54c0568980238cc75a7810ea51de4a4308..0b6db53fedcf3b05a8417d71572143bb993752da 100644 (file)
@@ -133,7 +133,6 @@ unsigned int vectors_in_migration[NR_IRQS];
  */
 static void migrate_irqs(void)
 {
-       cpumask_t       mask;
        irq_desc_t *desc;
        int             irq, new_cpu;
 
@@ -152,15 +151,14 @@ static void migrate_irqs(void)
                if (desc->status == IRQ_PER_CPU)
                        continue;
 
-               cpus_and(mask, irq_desc[irq].affinity, cpu_online_map);
-               if (any_online_cpu(mask) == NR_CPUS) {
+               if (cpumask_any_and(&irq_desc[irq].affinity, cpu_online_mask)
+                   >= nr_cpu_ids) {
                        /*
                         * Save it for phase 2 processing
                         */
                        vectors_in_migration[irq] = irq;
 
                        new_cpu = any_online_cpu(cpu_online_map);
-                       mask = cpumask_of_cpu(new_cpu);
 
                        /*
                         * Al three are essential, currently WARN_ON.. maybe panic?
@@ -168,7 +166,8 @@ static void migrate_irqs(void)
                        if (desc->chip && desc->chip->disable &&
                                desc->chip->enable && desc->chip->set_affinity) {
                                desc->chip->disable(irq);
-                               desc->chip->set_affinity(irq, mask);
+                               desc->chip->set_affinity(irq,
+                                                        cpumask_of(new_cpu));
                                desc->chip->enable(irq);
                        } else {
                                WARN_ON((!(desc->chip) || !(desc->chip->disable) ||
index 702a09c132387577fa7d3b954a499feaa59705bc..890339339035b74814721c799a024fa6d184438a 100644 (file)
 static struct irq_chip ia64_msi_chip;
 
 #ifdef CONFIG_SMP
-static void ia64_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
+static void ia64_set_msi_irq_affinity(unsigned int irq,
+                                     const cpumask_t *cpu_mask)
 {
        struct msi_msg msg;
        u32 addr, data;
-       int cpu = first_cpu(cpu_mask);
+       int cpu = first_cpu(*cpu_mask);
 
        if (!cpu_online(cpu))
                return;
@@ -166,12 +167,11 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 #ifdef CONFIG_DMAR
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_cfg *cfg = irq_cfg + irq;
        struct msi_msg msg;
-       int cpu = first_cpu(mask);
-
+       int cpu = cpumask_first(mask);
 
        if (!cpu_online(cpu))
                return;
@@ -187,7 +187,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
        msg.address_lo |= MSI_ADDR_DESTID_CPU(cpu_physical_id(cpu));
 
        dmar_msi_write(irq, &msg);
-       irq_desc[irq].affinity = mask;
+       irq_desc[irq].affinity = *mask;
 }
 #endif /* CONFIG_SMP */
 
index 1dcbb85fc4ee9f05253d5d76b5e8033b39a07cf9..11463994a7d540b63de364b80d3077a5f51f6445 100644 (file)
@@ -131,12 +131,6 @@ struct task_struct *task_for_booting_cpu;
  */
 DEFINE_PER_CPU(int, cpu_state);
 
-/* Bitmasks of currently online, and possible CPUs */
-cpumask_t cpu_online_map;
-EXPORT_SYMBOL(cpu_online_map);
-cpumask_t cpu_possible_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_possible_map);
-
 cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
 EXPORT_SYMBOL(cpu_core_map);
 DEFINE_PER_CPU_SHARED_ALIGNED(cpumask_t, cpu_sibling_map);
@@ -688,7 +682,7 @@ int migrate_platform_irqs(unsigned int cpu)
 {
        int new_cpei_cpu;
        irq_desc_t *desc = NULL;
-       cpumask_t       mask;
+       const struct cpumask *mask;
        int             retval = 0;
 
        /*
@@ -701,7 +695,7 @@ int migrate_platform_irqs(unsigned int cpu)
                         * Now re-target the CPEI to a different processor
                         */
                        new_cpei_cpu = any_online_cpu(cpu_online_map);
-                       mask = cpumask_of_cpu(new_cpei_cpu);
+                       mask = cpumask_of(new_cpei_cpu);
                        set_cpei_target_cpu(new_cpei_cpu);
                        desc = irq_desc + ia64_cpe_irq;
                        /*
index c75b914f2d6bb587c413990663e864f573c9368a..a8d61a3e9a948c2fbc3cd33b2621f75737ab89b0 100644 (file)
@@ -219,7 +219,7 @@ static ssize_t show_shared_cpu_map(struct cache_info *this_leaf, char *buf)
        cpumask_t shared_cpu_map;
 
        cpus_and(shared_cpu_map, this_leaf->shared_cpu_map, cpu_online_map);
-       len = cpumask_scnprintf(buf, NR_CPUS+1, shared_cpu_map);
+       len = cpumask_scnprintf(buf, NR_CPUS+1, &shared_cpu_map);
        len += sprintf(buf+len, "\n");
        return len;
 }
index 0c66dbdd1d728d0cd826b865a0d0ffe0cdd8b2cc..66fd705e82c09ee1e1905356cf447526461c086a 100644 (file)
@@ -227,14 +227,14 @@ finish_up:
        return new_irq_info;
 }
 
-static void sn_set_affinity_irq(unsigned int irq, cpumask_t mask)
+static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
 {
        struct sn_irq_info *sn_irq_info, *sn_irq_info_safe;
        nasid_t nasid;
        int slice;
 
-       nasid = cpuid_to_nasid(first_cpu(mask));
-       slice = cpuid_to_slice(first_cpu(mask));
+       nasid = cpuid_to_nasid(cpumask_first(mask));
+       slice = cpuid_to_slice(cpumask_first(mask));
 
        list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe,
                                 sn_irq_lh[irq], list)
index 83f190ffe35078dd062266727c20025d00b3a7fa..ca553b0429ce34cfb00398701d32ab73c942eeaf 100644 (file)
@@ -151,7 +151,8 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry)
 }
 
 #ifdef CONFIG_SMP
-static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
+static void sn_set_msi_irq_affinity(unsigned int irq,
+                                   const struct cpumask *cpu_mask)
 {
        struct msi_msg msg;
        int slice;
@@ -164,7 +165,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
        struct sn_pcibus_provider *provider;
        unsigned int cpu;
 
-       cpu = first_cpu(cpu_mask);
+       cpu = cpumask_first(cpu_mask);
        sn_irq_info = sn_msi_info[irq].sn_irq_info;
        if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0)
                return;
@@ -204,7 +205,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq, cpumask_t cpu_mask)
        msg.address_lo = (u32)(bus_addr & 0x00000000ffffffff);
 
        write_msi_msg(irq, &msg);
-       irq_desc[irq].affinity = cpu_mask;
+       irq_desc[irq].affinity = *cpu_mask;
 }
 #endif /* CONFIG_SMP */
 
index dbaed4a638153bc43c5fa4d6e138039bb5cca8df..cabba332cc48322c99ee23eb1604ddd929f29d91 100644 (file)
@@ -10,6 +10,7 @@ config M32R
        default y
        select HAVE_IDE
        select HAVE_OPROFILE
+       select INIT_ALL_POSSIBLE
 
 config SBUS
        bool
@@ -273,7 +274,7 @@ config GENERIC_CALIBRATE_DELAY
        bool
        default y
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
         bool
         default y
 
index 39cb6da72dcbef3358d04e2b419a150f88c299ba..0f06b3722e960518bc275b1167b6948e5eb35914 100644 (file)
@@ -73,17 +73,11 @@ static unsigned int bsp_phys_id = -1;
 /* Bitmask of physically existing CPUs */
 physid_mask_t phys_cpu_present_map;
 
-/* Bitmask of currently online CPUs */
-cpumask_t cpu_online_map;
-EXPORT_SYMBOL(cpu_online_map);
-
 cpumask_t cpu_bootout_map;
 cpumask_t cpu_bootin_map;
 static cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
 EXPORT_SYMBOL(cpu_callout_map);
-cpumask_t cpu_possible_map = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_possible_map);
 
 /* Per CPU bogomips and other parameters */
 struct cpuinfo_m32r cpu_data[NR_CPUS] __cacheline_aligned;
index c5b916700b222194e68e148d3ad13115bd41625e..2a12e7fa97484edcd46370ca65754cced3455bea 100644 (file)
@@ -156,7 +156,7 @@ void hw_timer_init(void)
 {
        u32 imr;
 
-       cf_pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+       cf_pit_clockevent.cpumask = cpumask_of(smp_processor_id());
        cf_pit_clockevent.mult = div_sc(FREQ, NSEC_PER_SEC, 32);
        cf_pit_clockevent.max_delta_ns =
                clockevent_delta2ns(0xFFFF, &cf_pit_clockevent);
index f4af967a6b3081173b9183d8e735d133382072c5..a5255e7c79e004001a8f55c82e33d964198b403f 100644 (file)
@@ -653,7 +653,7 @@ config GENERIC_CMOS_UPDATE
        bool
        default y
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
        bool
        default y
 
index a58f0eecc68f91b5d5a0a1c2ce6d4617dc5ba42d..abc62aa744ace852888367ffce294fea809095f0 100644 (file)
@@ -49,7 +49,8 @@ static inline void smtc_im_ack_irq(unsigned int irq)
 #ifdef CONFIG_MIPS_MT_SMTC_IRQAFF
 #include <linux/cpumask.h>
 
-extern void plat_set_irq_affinity(unsigned int irq, cpumask_t affinity);
+extern void plat_set_irq_affinity(unsigned int irq,
+                                 const struct cpumask *affinity);
 extern void smtc_forward_irq(unsigned int irq);
 
 /*
index 7785bec732f267254bae3477b83a885e968bd928..1fb959f98982c70c192e1f1fe731cc3c92f849b9 100644 (file)
@@ -37,7 +37,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 
 /* sched_domains SD_NODE_INIT for SGI IP27 machines */
 #define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
index 0ff5b523ea77d8f6a5b7b2337969e414ce5ddaa8..86557b5d1b3f152932d9fe02744cad4eb4ec6316 100644 (file)
@@ -38,9 +38,6 @@ extern int __cpu_logical_map[NR_CPUS];
 #define SMP_RESCHEDULE_YOURSELF        0x1     /* XXX braindead */
 #define SMP_CALL_FUNCTION      0x2
 
-extern cpumask_t phys_cpu_present_map;
-#define cpu_possible_map       phys_cpu_present_map
-
 extern void asmlinkage smp_bootstrap(void);
 
 /*
index d7f8a782aae4d9a3200c307f383be28ed83b2b18..03965cb1b2527e23fd60a003284f51279b42a758 100644 (file)
@@ -146,7 +146,7 @@ void __init plat_time_init(void)
 
        BUG_ON(HZ != 100);
 
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        clockevents_register_device(cd);
        action->dev_id = cd;
        setup_irq(JAZZ_TIMER_IRQ, action);
index 0a57f86945f1e5be7b85553cc4839ba7348d2371..b820661678b0b948e7fb56e8332a28277df07a98 100644 (file)
@@ -126,7 +126,7 @@ void __cpuinit sb1480_clockevent_init(void)
        cd->min_delta_ns        = clockevent_delta2ns(2, cd);
        cd->rating              = 200;
        cd->irq                 = irq;
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = sibyte_next_event;
        cd->set_mode            = sibyte_set_mode;
        clockevents_register_device(cd);
@@ -148,6 +148,6 @@ void __cpuinit sb1480_clockevent_init(void)
        action->name    = name;
        action->dev_id  = cd;
 
-       irq_set_affinity(irq, cpumask_of_cpu(cpu));
+       irq_set_affinity(irq, cpumask_of(cpu));
        setup_irq(irq, action);
 }
index df4acb68bfb57be98fb38298826a677e15c6920e..1ada45ea07003d426cf591b2d639d45ffda0c53b 100644 (file)
@@ -88,7 +88,6 @@ static void ds1287_event_handler(struct clock_event_device *dev)
 static struct clock_event_device ds1287_clockevent = {
        .name           = "ds1287",
        .features       = CLOCK_EVT_FEAT_PERIODIC,
-       .cpumask        = CPU_MASK_CPU0,
        .set_next_event = ds1287_set_next_event,
        .set_mode       = ds1287_set_mode,
        .event_handler  = ds1287_event_handler,
@@ -122,6 +121,7 @@ int __init ds1287_clockevent_init(int irq)
        clockevent_set_clock(cd, 32768);
        cd->max_delta_ns = clockevent_delta2ns(0x7fffffff, cd);
        cd->min_delta_ns = clockevent_delta2ns(0x300, cd);
+       cd->cpumask = cpumask_of(0);
 
        clockevents_register_device(&ds1287_clockevent);
 
index 6e2f58520afbdba949fb839ebfd66df78e4df364..e9b787feedcb7595fb763e057cfb18a06fb1ede1 100644 (file)
@@ -96,7 +96,6 @@ static void gt641xx_timer0_event_handler(struct clock_event_device *dev)
 static struct clock_event_device gt641xx_timer0_clockevent = {
        .name           = "gt641xx-timer0",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-       .cpumask        = CPU_MASK_CPU0,
        .irq            = GT641XX_TIMER0_IRQ,
        .set_next_event = gt641xx_timer0_set_next_event,
        .set_mode       = gt641xx_timer0_set_mode,
@@ -132,6 +131,7 @@ static int __init gt641xx_timer0_clockevent_init(void)
        clockevent_set_clock(cd, gt641xx_base_clock);
        cd->max_delta_ns = clockevent_delta2ns(0x7fffffff, cd);
        cd->min_delta_ns = clockevent_delta2ns(0x300, cd);
+       cd->cpumask = cpumask_of(0);
 
        clockevents_register_device(&gt641xx_timer0_clockevent);
 
index 4a4c59f2737a5d0bcdc8fc48f4f8e28ea217f528..e1ec83b68031b447ffc2e46f85f8519de6b56b7f 100644 (file)
@@ -195,7 +195,7 @@ int __cpuinit mips_clockevent_init(void)
 
        cd->rating              = 300;
        cd->irq                 = irq;
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = mips_next_event;
        cd->set_mode            = mips_set_clock_mode;
        cd->event_handler       = mips_event_handler;
index 63ac3ad462bc61cf2aeafe817a6ea1255c3991a0..a2eebaafda52c224ee385193bcb998984d9d790d 100644 (file)
@@ -125,7 +125,7 @@ void __cpuinit sb1250_clockevent_init(void)
        cd->min_delta_ns        = clockevent_delta2ns(2, cd);
        cd->rating              = 200;
        cd->irq                 = irq;
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = sibyte_next_event;
        cd->set_mode            = sibyte_set_mode;
        clockevents_register_device(cd);
@@ -147,6 +147,6 @@ void __cpuinit sb1250_clockevent_init(void)
        action->name    = name;
        action->dev_id  = cd;
 
-       irq_set_affinity(irq, cpumask_of_cpu(cpu));
+       irq_set_affinity(irq, cpumask_of(cpu));
        setup_irq(irq, action);
 }
index 5162fe4b59528ac9ae24fe894c6076f8af89c530..6d45e24db5bfce9a8f620f22cfdfabb0e662a342 100644 (file)
@@ -292,7 +292,7 @@ int __cpuinit mips_clockevent_init(void)
 
        cd->rating              = 300;
        cd->irq                 = irq;
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = mips_next_event;
        cd->set_mode            = mips_set_clock_mode;
        cd->event_handler       = mips_event_handler;
index b5fc4eb412d2e53b3544f0d67f032cecf4933095..eccf7d6096bd7b8cbdd9d2c4d2dd7ed2d4742a7a 100644 (file)
@@ -112,7 +112,6 @@ static struct clock_event_device txx9tmr_clock_event_device = {
        .name           = "TXx9",
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .rating         = 200,
-       .cpumask        = CPU_MASK_CPU0,
        .set_mode       = txx9tmr_set_mode,
        .set_next_event = txx9tmr_set_next_event,
 };
@@ -150,6 +149,7 @@ void __init txx9_clockevent_init(unsigned long baseaddr, int irq,
                clockevent_delta2ns(0xffffffff >> (32 - TXX9_TIMER_BITS), cd);
        cd->min_delta_ns = clockevent_delta2ns(0xf, cd);
        cd->irq = irq;
+       cd->cpumask = cpumask_of(0),
        clockevents_register_device(cd);
        setup_irq(irq, &txx9tmr_irq);
        printk(KERN_INFO "TXx9: clockevent device at 0x%lx, irq %d\n",
index b6ac55162b9a8e533063af37b73677fd6137b442..f4d187825f96406612d16a6016ab8defcf997658 100644 (file)
@@ -115,7 +115,7 @@ void __init setup_pit_timer(void)
         * Start pit with the boot cpu mask and make it global after the
         * IO_APIC has been initialized.
         */
-       cd->cpumask = cpumask_of_cpu(cpu);
+       cd->cpumask = cpumask_of(cpu);
        clockevent_set_clock(cd, CLOCK_TICK_RATE);
        cd->max_delta_ns = clockevent_delta2ns(0x7FFF, cd);
        cd->min_delta_ns = clockevent_delta2ns(0xF, cd);
index f0a4bb19e096ab7ddde894a44111fd5fca72b0b6..494a49a317e9b78fb137714423a64ef0661a411f 100644 (file)
@@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq)
 
 static DEFINE_SPINLOCK(gic_lock);
 
-static void gic_set_affinity(unsigned int irq, cpumask_t cpumask)
+static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
        cpumask_t       tmp = CPU_MASK_NONE;
        unsigned long   flags;
@@ -164,7 +164,7 @@ static void gic_set_affinity(unsigned int irq, cpumask_t cpumask)
        pr_debug(KERN_DEBUG "%s called\n", __func__);
        irq -= _irqbase;
 
-       cpus_and(tmp, cpumask, cpu_online_map);
+       cpumask_and(&tmp, cpumask, cpu_online_mask);
        if (cpus_empty(tmp))
                return;
 
@@ -187,7 +187,7 @@ static void gic_set_affinity(unsigned int irq, cpumask_t cpumask)
                set_bit(irq, pcpu_masks[first_cpu(tmp)].pcpu_mask);
 
        }
-       irq_desc[irq].affinity = cpumask;
+       irq_desc[irq].affinity = *cpumask;
        spin_unlock_irqrestore(&gic_lock, flags);
 
 }
index ca476c4f62a57d12d839975334662254824a3b2f..f27beca4b26d22c67366b1be5d1f263637ad21c3 100644 (file)
@@ -51,10 +51,10 @@ static int __init allowcpus(char *str)
        int len;
 
        cpus_clear(cpu_allow_map);
-       if (cpulist_parse(str, cpu_allow_map) == 0) {
+       if (cpulist_parse(str, &cpu_allow_map) == 0) {
                cpu_set(0, cpu_allow_map);
                cpus_and(cpu_possible_map, cpu_possible_map, cpu_allow_map);
-               len = cpulist_scnprintf(buf, sizeof(buf)-1, cpu_possible_map);
+               len = cpulist_scnprintf(buf, sizeof(buf)-1, &cpu_possible_map);
                buf[len] = '\0';
                pr_debug("Allowable CPUs: %s\n", buf);
                return 1;
@@ -226,7 +226,7 @@ void __init cmp_smp_setup(void)
 
        for (i = 1; i < NR_CPUS; i++) {
                if (amon_cpu_avail(i)) {
-                       cpu_set(i, phys_cpu_present_map);
+                       cpu_set(i, cpu_possible_map);
                        __cpu_number_map[i]     = ++ncpu;
                        __cpu_logical_map[ncpu] = i;
                }
index 87a1816c1f4589cdb90c88896830b16b77b84f58..6f7ee5ac46ee5d5f1544967d3e1f3c94ea1fc7df 100644 (file)
@@ -70,7 +70,7 @@ static unsigned int __init smvp_vpe_init(unsigned int tc, unsigned int mvpconf0,
                write_vpe_c0_vpeconf0(tmp);
 
                /* Record this as available CPU */
-               cpu_set(tc, phys_cpu_present_map);
+               cpu_set(tc, cpu_possible_map);
                __cpu_number_map[tc]    = ++ncpu;
                __cpu_logical_map[ncpu] = tc;
        }
index 8bf88faf5afdcb5e7eca6923fa5f4d41f988880e..3da94704f81645027d246cf6aea5be80119eddfa 100644 (file)
 #include <asm/mipsmtregs.h>
 #endif /* CONFIG_MIPS_MT_SMTC */
 
-cpumask_t phys_cpu_present_map;                /* Bitmask of available CPUs */
 volatile cpumask_t cpu_callin_map;     /* Bitmask of started secondaries */
-cpumask_t cpu_online_map;              /* Bitmask of currently online CPUs */
 int __cpu_number_map[NR_CPUS];         /* Map physical to logical */
 int __cpu_logical_map[NR_CPUS];                /* Map logical to physical */
 
-EXPORT_SYMBOL(phys_cpu_present_map);
-EXPORT_SYMBOL(cpu_online_map);
-
 extern void cpu_idle(void);
 
 /* Number of TCs (or siblings in Intel speak) per CPU core */
@@ -195,7 +190,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 /* preload SMP state for boot cpu */
 void __devinit smp_prepare_boot_cpu(void)
 {
-       cpu_set(0, phys_cpu_present_map);
+       cpu_set(0, cpu_possible_map);
        cpu_set(0, cpu_online_map);
        cpu_set(0, cpu_callin_map);
 }
index 897fb2b4751c95715af0b798d79db5cf899a5e5d..b6cca01ff82b309e803105c1230957e943c87df8 100644 (file)
@@ -290,7 +290,7 @@ static void smtc_configure_tlb(void)
  * possibly leave some TCs/VPEs as "slave" processors.
  *
  * Use c0_MVPConf0 to find out how many TCs are available, setting up
- * phys_cpu_present_map and the logical/physical mappings.
+ * cpu_possible_map and the logical/physical mappings.
  */
 
 int __init smtc_build_cpu_map(int start_cpu_slot)
@@ -304,7 +304,7 @@ int __init smtc_build_cpu_map(int start_cpu_slot)
         */
        ntcs = ((read_c0_mvpconf0() & MVPCONF0_PTC) >> MVPCONF0_PTC_SHIFT) + 1;
        for (i=start_cpu_slot; i<NR_CPUS && i<ntcs; i++) {
-               cpu_set(i, phys_cpu_present_map);
+               cpu_set(i, cpu_possible_map);
                __cpu_number_map[i] = i;
                __cpu_logical_map[i] = i;
        }
@@ -521,7 +521,7 @@ void smtc_prepare_cpus(int cpus)
         * Pull any physically present but unused TCs out of circulation.
         */
        while (tc < (((val & MVPCONF0_PTC) >> MVPCONF0_PTC_SHIFT) + 1)) {
-               cpu_clear(tc, phys_cpu_present_map);
+               cpu_clear(tc, cpu_possible_map);
                cpu_clear(tc, cpu_present_map);
                tc++;
        }
index f84a46a8ae6e5261d0c8669b2726ed353cda7dc3..aabd7274507b875e2726c246240d9e465a27bc9b 100644 (file)
@@ -114,9 +114,9 @@ struct plat_smp_ops msmtc_smp_ops = {
  */
 
 
-void plat_set_irq_affinity(unsigned int irq, cpumask_t affinity)
+void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 {
-       cpumask_t tmask = affinity;
+       cpumask_t tmask = *affinity;
        int cpu = 0;
        void smtc_set_irq_affinity(unsigned int irq, cpumask_t aff);
 
@@ -139,7 +139,7 @@ void plat_set_irq_affinity(unsigned int irq, cpumask_t affinity)
         * be made to forward to an offline "CPU".
         */
 
-       for_each_cpu_mask(cpu, affinity) {
+       for_each_cpu(cpu, affinity) {
                if ((cpu_data[cpu].vpe_id != 0) || !cpu_online(cpu))
                        cpu_clear(cpu, tmask);
        }
index 62f495b57f937c5710fdea3eb0f94471003191ae..cf293b2790981566fd802a1f982ba52339560bde 100644 (file)
@@ -102,6 +102,7 @@ __init void plat_time_init(void)
        unsigned int p;
        unsigned int pow2p;
 
+       pnx8xxx_clockevent.cpumask = cpu_none_mask;
        clockevents_register_device(&pnx8xxx_clockevent);
        clocksource_register(&pnx_clocksource);
 
index 3a7df647ca7739b9336cc22ade77c34d883d5252..f78c29b68d77015e114195154c76671e2ce7d1a6 100644 (file)
@@ -141,7 +141,7 @@ static void __cpuinit yos_boot_secondary(int cpu, struct task_struct *idle)
 }
 
 /*
- * Detect available CPUs, populate phys_cpu_present_map before smp_init
+ * Detect available CPUs, populate cpu_possible_map before smp_init
  *
  * We don't want to start the secondary CPU yet nor do we have a nice probing
  * feature in PMON so we just assume presence of the secondary core.
@@ -150,10 +150,10 @@ static void __init yos_smp_setup(void)
 {
        int i;
 
-       cpus_clear(phys_cpu_present_map);
+       cpus_clear(cpu_possible_map);
 
        for (i = 0; i < 2; i++) {
-               cpu_set(i, phys_cpu_present_map);
+               cpu_set(i, cpu_possible_map);
                __cpu_number_map[i]     = i;
                __cpu_logical_map[i]    = i;
        }
index ba5cdebeaf0d23d703ce93a163cdd1b030a6bdf6..5b47d6b65275ba6835043e8a050958ba014d0ed2 100644 (file)
@@ -76,7 +76,7 @@ static int do_cpumask(cnodeid_t cnode, nasid_t nasid, int highest)
                        /* Only let it join in if it's marked enabled */
                        if ((acpu->cpu_info.flags & KLINFO_ENABLE) &&
                            (tot_cpus_found != NR_CPUS)) {
-                               cpu_set(cpuid, phys_cpu_present_map);
+                               cpu_set(cpuid, cpu_possible_map);
                                alloc_cpupda(cpuid, tot_cpus_found);
                                cpus_found++;
                                tot_cpus_found++;
index 1327c2746fb73be5cedc9484df1bfdb8ab16ce1c..f024057a35f8bab0e40997cd2bc0f30c2b39fb8a 100644 (file)
@@ -134,7 +134,7 @@ void __cpuinit hub_rt_clock_event_init(void)
        cd->min_delta_ns        = clockevent_delta2ns(0x300, cd);
        cd->rating              = 200;
        cd->irq                 = irq;
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = rt_next_event;
        cd->set_mode            = rt_set_mode;
        clockevents_register_device(cd);
index a35818ed42639c8548537546d82eae9f35f23674..12b465d404dfc393e0996a248ff5e9a510aefe5e 100644 (file)
@@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq);
 static void disable_bcm1480_irq(unsigned int irq);
 static void ack_bcm1480_irq(unsigned int irq);
 #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, cpumask_t mask);
+static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_PCI
@@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq)
 }
 
 #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, cpumask_t mask)
+static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        int i = 0, old_cpu, cpu, int_on, k;
        u64 cur_ints;
@@ -117,11 +117,11 @@ static void bcm1480_set_affinity(unsigned int irq, cpumask_t mask)
        unsigned long flags;
        unsigned int irq_dirty;
 
-       if (cpus_weight(mask) != 1) {
+       if (cpumask_weight(mask) != 1) {
                printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
                return;
        }
-       i = first_cpu(mask);
+       i = cpumask_first(mask);
 
        /* Convert logical CPU to physical CPU */
        cpu = cpu_logical_map(i);
index bd9eeb43ed0e1bd86a365322374ae9b9cfdfe25c..dddfda8e829461e8297ab44cfad6501e34641ca3 100644 (file)
@@ -136,7 +136,7 @@ static void __cpuinit bcm1480_boot_secondary(int cpu, struct task_struct *idle)
 
 /*
  * Use CFE to find out how many CPUs are available, setting up
- * phys_cpu_present_map and the logical/physical mappings.
+ * cpu_possible_map and the logical/physical mappings.
  * XXXKW will the boot CPU ever not be physical 0?
  *
  * Common setup before any secondaries are started
@@ -145,14 +145,14 @@ static void __init bcm1480_smp_setup(void)
 {
        int i, num;
 
-       cpus_clear(phys_cpu_present_map);
-       cpu_set(0, phys_cpu_present_map);
+       cpus_clear(cpu_possible_map);
+       cpu_set(0, cpu_possible_map);
        __cpu_number_map[0] = 0;
        __cpu_logical_map[0] = 0;
 
        for (i = 1, num = 0; i < NR_CPUS; i++) {
                if (cfe_cpu_stop(i) == 0) {
-                       cpu_set(i, phys_cpu_present_map);
+                       cpu_set(i, cpu_possible_map);
                        __cpu_number_map[i] = ++num;
                        __cpu_logical_map[num] = i;
                }
index a5158483986e878dd7d935a439d6c2b0c1f98d2e..808ac2959b8c497ff69f3bfe64090893ea57ca64 100644 (file)
@@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq);
 static void disable_sb1250_irq(unsigned int irq);
 static void ack_sb1250_irq(unsigned int irq);
 #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, cpumask_t mask);
+static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_SIBYTE_HAS_LDT
@@ -103,16 +103,16 @@ void sb1250_unmask_irq(int cpu, int irq)
 }
 
 #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, cpumask_t mask)
+static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        int i = 0, old_cpu, cpu, int_on;
        u64 cur_ints;
        struct irq_desc *desc = irq_desc + irq;
        unsigned long flags;
 
-       i = first_cpu(mask);
+       i = cpumask_first(mask);
 
-       if (cpus_weight(mask) > 1) {
+       if (cpumask_weight(mask) > 1) {
                printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
                return;
        }
index 0734b933e969240054bd54e8f05f65608ba20f4c..5950a288a7daab17939664417fadacd68ca28d7b 100644 (file)
@@ -124,7 +124,7 @@ static void __cpuinit sb1250_boot_secondary(int cpu, struct task_struct *idle)
 
 /*
  * Use CFE to find out how many CPUs are available, setting up
- * phys_cpu_present_map and the logical/physical mappings.
+ * cpu_possible_map and the logical/physical mappings.
  * XXXKW will the boot CPU ever not be physical 0?
  *
  * Common setup before any secondaries are started
@@ -133,14 +133,14 @@ static void __init sb1250_smp_setup(void)
 {
        int i, num;
 
-       cpus_clear(phys_cpu_present_map);
-       cpu_set(0, phys_cpu_present_map);
+       cpus_clear(cpu_possible_map);
+       cpu_set(0, cpu_possible_map);
        __cpu_number_map[0] = 0;
        __cpu_logical_map[0] = 0;
 
        for (i = 1, num = 0; i < NR_CPUS; i++) {
                if (cfe_cpu_stop(i) == 0) {
-                       cpu_set(i, phys_cpu_present_map);
+                       cpu_set(i, cpu_possible_map);
                        __cpu_number_map[i] = ++num;
                        __cpu_logical_map[num] = i;
                }
index 796e3ce28720ace81719f4114382ded6df96ba79..69f5f88711cca8721cbc97ad4fda96e6682ecffd 100644 (file)
@@ -80,7 +80,7 @@ static void __init sni_a20r_timer_setup(void)
        struct irqaction *action = &a20r_irqaction;
        unsigned int cpu = smp_processor_id();
 
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        clockevents_register_device(cd);
        action->dev_id = cd;
        setup_irq(SNI_A20R_IRQ_TIMER, &a20r_irqaction);
index 644a70b1b04e4fd9ecf16d2e73d6e22ea5227115..aacf11d33723edc80aa3638ba5256e83d0c74bf2 100644 (file)
@@ -11,6 +11,7 @@ config PARISC
        select HAVE_OPROFILE
        select RTC_CLASS
        select RTC_DRV_PARISC
+       select INIT_ALL_POSSIBLE
        help
          The PA-RISC microprocessor is designed by Hewlett-Packard and used
          in many of their workstations & servers (HP9000 700 and 800 series,
index 23ef950df0080a11d0e8c6390d52d948c778ba99..4cea935e2f99d1d51436fdb815e357dea54a4e31 100644 (file)
@@ -131,12 +131,12 @@ int cpu_check_affinity(unsigned int irq, cpumask_t *dest)
        return 0;
 }
 
-static void cpu_set_affinity_irq(unsigned int irq, cpumask_t dest)
+static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
 {
-       if (cpu_check_affinity(irq, &dest))
+       if (cpu_check_affinity(irq, dest))
                return;
 
-       irq_desc[irq].affinity = dest;
+       irq_desc[irq].affinity = *dest;
 }
 #endif
 
index d47f3975c9c6b683d4e0cfaf21dcf9156d68988d..80bc000523face7176458cf26c452f461ecd6413 100644 (file)
@@ -67,21 +67,6 @@ static volatile int cpu_now_booting __read_mostly = 0;       /* track which CPU is boo
 
 static int parisc_max_cpus __read_mostly = 1;
 
-/* online cpus are ones that we've managed to bring up completely
- * possible cpus are all valid cpu 
- * present cpus are all detected cpu
- *
- * On startup we bring up the "possible" cpus. Since we discover
- * CPUs later, we add them as hotplug, so the possible cpu mask is
- * empty in the beginning.
- */
-
-cpumask_t cpu_online_map   __read_mostly = CPU_MASK_NONE;      /* Bitmap of online CPUs */
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;       /* Bitmap of Present CPUs */
-
-EXPORT_SYMBOL(cpu_online_map);
-EXPORT_SYMBOL(cpu_possible_map);
-
 DEFINE_PER_CPU(spinlock_t, ipi_lock) = SPIN_LOCK_UNLOCKED;
 
 enum ipi_message_type {
index 525c13a4de93ede9cd9451569d41cd20760b3636..adb23ea1c1ef1b2ed2965efb0d0a0da3cff429fe 100644 (file)
@@ -141,7 +141,7 @@ config GENERIC_NVRAM
        bool
        default y if PPC32
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
        bool
        default y
 
index b298f7a631e6cd9620094c72da4c82832afcdb43..e5f2ae8362f7ea8e15fcaca975fefa6e3662010b 100644 (file)
@@ -7,7 +7,19 @@
 
 #ifndef __ASSEMBLY__
 extern void _mcount(void);
-#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+       /* reloction of mcount call site is the same as the address */
+       return addr;
+}
+
+struct dyn_arch_ftrace {
+       struct module *mod;
+};
+#endif /*  CONFIG_DYNAMIC_FTRACE */
+#endif /* __ASSEMBLY__ */
 
 #endif
 
index e5f14b13ccf0ff2c11ef8392b6697750608ca919..08454880a2c047258da91b4968f080f4e38fc4af 100644 (file)
@@ -34,11 +34,19 @@ struct mod_arch_specific {
 #ifdef __powerpc64__
        unsigned int stubs_section;     /* Index of stubs section in module */
        unsigned int toc_section;       /* What section is the TOC? */
-#else
+#ifdef CONFIG_DYNAMIC_FTRACE
+       unsigned long toc;
+       unsigned long tramp;
+#endif
+
+#else /* powerpc64 */
        /* Indices of PLT sections within module. */
        unsigned int core_plt_section;
        unsigned int init_plt_section;
+#ifdef CONFIG_DYNAMIC_FTRACE
+       unsigned long tramp;
 #endif
+#endif /* powerpc64 */
 
        /* List of BUG addresses, source line numbers and filenames */
        struct list_head bug_list;
@@ -68,6 +76,12 @@ struct mod_arch_specific {
 #    endif     /* MODULE */
 #endif
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+#    ifdef MODULE
+       asm(".section .ftrace.tramp,\"ax\",@nobits; .align 3; .previous");
+#    endif     /* MODULE */
+#endif
+
 
 struct exception_table_entry;
 void sort_ex_table(struct exception_table_entry *start,
index c32da6f9799957a1ce3439b8f9e269bd887621f2..373fca394a54a1a1841d06d2ebec20440aa4ca7d 100644 (file)
@@ -48,7 +48,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 
 /* sched_domains SD_NODE_INIT for PPC64 machines */
 #define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
index 92673b43858d067d6ccfab7a1cc083018d4a5c4f..d17edb4a2f9d57490cc8847b881b07fc3f06d7b9 100644 (file)
@@ -17,6 +17,7 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog
 CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog
 CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog
+CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog
 
 ifdef CONFIG_DYNAMIC_FTRACE
 # dynamic ftrace setup.
index 7ecc0d1855c3342d341a8b00ea9cdbd9f51d1b42..6f7eb7e00c79eb3df9c6eba5ad61087e36427015 100644 (file)
@@ -1162,39 +1162,17 @@ machine_check_in_rtas:
 #ifdef CONFIG_DYNAMIC_FTRACE
 _GLOBAL(mcount)
 _GLOBAL(_mcount)
-       stwu    r1,-48(r1)
-       stw     r3, 12(r1)
-       stw     r4, 16(r1)
-       stw     r5, 20(r1)
-       stw     r6, 24(r1)
-       mflr    r3
-       stw     r7, 28(r1)
-       mfcr    r5
-       stw     r8, 32(r1)
-       stw     r9, 36(r1)
-       stw     r10,40(r1)
-       stw     r3, 44(r1)
-       stw     r5, 8(r1)
-       subi    r3, r3, MCOUNT_INSN_SIZE
-       .globl mcount_call
-mcount_call:
-       bl      ftrace_stub
-       nop
-       lwz     r6, 8(r1)
-       lwz     r0, 44(r1)
-       lwz     r3, 12(r1)
+       /*
+        * It is required that _mcount on PPC32 must preserve the
+        * link register. But we have r0 to play with. We use r0
+        * to push the return address back to the caller of mcount
+        * into the ctr register, restore the link register and
+        * then jump back using the ctr register.
+        */
+       mflr    r0
        mtctr   r0
-       lwz     r4, 16(r1)
-       mtcr    r6
-       lwz     r5, 20(r1)
-       lwz     r6, 24(r1)
-       lwz     r0, 52(r1)
-       lwz     r7, 28(r1)
-       lwz     r8, 32(r1)
+       lwz     r0, 4(r1)
        mtlr    r0
-       lwz     r9, 36(r1)
-       lwz     r10,40(r1)
-       addi    r1, r1, 48
        bctr
 
 _GLOBAL(ftrace_caller)
index e0bcf93542867d79e20c1d5951397813535545a2..383ed6eb00850d2a7ab89b3b54b4677a3b9b1478 100644 (file)
@@ -894,18 +894,6 @@ _GLOBAL(enter_prom)
 #ifdef CONFIG_DYNAMIC_FTRACE
 _GLOBAL(mcount)
 _GLOBAL(_mcount)
-       /* Taken from output of objdump from lib64/glibc */
-       mflr    r3
-       stdu    r1, -112(r1)
-       std     r3, 128(r1)
-       subi    r3, r3, MCOUNT_INSN_SIZE
-       .globl mcount_call
-mcount_call:
-       bl      ftrace_stub
-       nop
-       ld      r0, 128(r1)
-       mtlr    r0
-       addi    r1, r1, 112
        blr
 
 _GLOBAL(ftrace_caller)
index f4b006ed0ab1ef183a0b0593520be182f4144c1f..5355244c99ff934abde18ce75336fe9a37689bcb 100644 (file)
@@ -9,22 +9,30 @@
 
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
 #include <linux/ftrace.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
 #include <linux/list.h>
 
 #include <asm/cacheflush.h>
+#include <asm/code-patching.h>
 #include <asm/ftrace.h>
 
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(fmt , ...)      do { } while (0)
+#endif
 
-static unsigned int ftrace_nop = 0x60000000;
+static unsigned int ftrace_nop = PPC_NOP_INSTR;
 
 #ifdef CONFIG_PPC32
 # define GET_ADDR(addr) addr
 #else
 /* PowerPC64's functions are data that points to the functions */
-# define GET_ADDR(addr) *(unsigned long *)addr
+# define GET_ADDR(addr) (*(unsigned long *)addr)
 #endif
 
 
@@ -33,12 +41,12 @@ static unsigned int ftrace_calc_offset(long ip, long addr)
        return (int)(addr - ip);
 }
 
-unsigned char *ftrace_nop_replace(void)
+static unsigned char *ftrace_nop_replace(void)
 {
        return (char *)&ftrace_nop;
 }
 
-unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 {
        static unsigned int op;
 
@@ -68,49 +76,422 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 # define _ASM_PTR      " .long "
 #endif
 
-int
+static int
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
                   unsigned char *new_code)
 {
-       unsigned replaced;
-       unsigned old = *(unsigned *)old_code;
-       unsigned new = *(unsigned *)new_code;
-       int faulted = 0;
+       unsigned char replaced[MCOUNT_INSN_SIZE];
 
        /*
         * Note: Due to modules and __init, code can
         *  disappear and change, we need to protect against faulting
-        *  as well as code changing.
+        *  as well as code changing. We do this by using the
+        *  probe_kernel_* functions.
         *
         * No real locking needed, this code is run through
-        * kstop_machine.
+        * kstop_machine, or before SMP starts.
         */
-       asm volatile (
-               "1: lwz         %1, 0(%2)\n"
-               "   cmpw        %1, %5\n"
-               "   bne         2f\n"
-               "   stwu        %3, 0(%2)\n"
-               "2:\n"
-               ".section .fixup, \"ax\"\n"
-               "3:     li %0, 1\n"
-               "       b 2b\n"
-               ".previous\n"
-               ".section __ex_table,\"a\"\n"
-               _ASM_ALIGN "\n"
-               _ASM_PTR "1b, 3b\n"
-               ".previous"
-               : "=r"(faulted), "=r"(replaced)
-               : "r"(ip), "r"(new),
-                 "0"(faulted), "r"(old)
-               : "memory");
-
-       if (replaced != old && replaced != new)
-               faulted = 2;
-
-       if (!faulted)
-               flush_icache_range(ip, ip + 8);
-
-       return faulted;
+
+       /* read the text we want to modify */
+       if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
+               return -EFAULT;
+
+       /* Make sure it is what we expect it to be */
+       if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+               return -EINVAL;
+
+       /* replace the text with the new text */
+       if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+               return -EPERM;
+
+       flush_icache_range(ip, ip + 8);
+
+       return 0;
+}
+
+/*
+ * Helper functions that are the same for both PPC64 and PPC32.
+ */
+static int test_24bit_addr(unsigned long ip, unsigned long addr)
+{
+
+       /* use the create_branch to verify that this offset can be branched */
+       return create_branch((unsigned int *)ip, addr, 0);
+}
+
+static int is_bl_op(unsigned int op)
+{
+       return (op & 0xfc000003) == 0x48000001;
+}
+
+static unsigned long find_bl_target(unsigned long ip, unsigned int op)
+{
+       static int offset;
+
+       offset = (op & 0x03fffffc);
+       /* make it signed */
+       if (offset & 0x02000000)
+               offset |= 0xfe000000;
+
+       return ip + (long)offset;
+}
+
+#ifdef CONFIG_PPC64
+static int
+__ftrace_make_nop(struct module *mod,
+                 struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned int op;
+       unsigned int jmp[5];
+       unsigned long ptr;
+       unsigned long ip = rec->ip;
+       unsigned long tramp;
+       int offset;
+
+       /* read where this goes */
+       if (probe_kernel_read(&op, (void *)ip, sizeof(int)))
+               return -EFAULT;
+
+       /* Make sure that that this is still a 24bit jump */
+       if (!is_bl_op(op)) {
+               printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
+               return -EINVAL;
+       }
+
+       /* lets find where the pointer goes */
+       tramp = find_bl_target(ip, op);
+
+       /*
+        * On PPC64 the trampoline looks like:
+        * 0x3d, 0x82, 0x00, 0x00,    addis   r12,r2, <high>
+        * 0x39, 0x8c, 0x00, 0x00,    addi    r12,r12, <low>
+        *   Where the bytes 2,3,6 and 7 make up the 32bit offset
+        *   to the TOC that holds the pointer.
+        *   to jump to.
+        * 0xf8, 0x41, 0x00, 0x28,    std     r2,40(r1)
+        * 0xe9, 0x6c, 0x00, 0x20,    ld      r11,32(r12)
+        *   The actually address is 32 bytes from the offset
+        *   into the TOC.
+        * 0xe8, 0x4c, 0x00, 0x28,    ld      r2,40(r12)
+        */
+
+       DEBUGP("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc);
+
+       /* Find where the trampoline jumps to */
+       if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
+               printk(KERN_ERR "Failed to read %lx\n", tramp);
+               return -EFAULT;
+       }
+
+       DEBUGP(" %08x %08x", jmp[0], jmp[1]);
+
+       /* verify that this is what we expect it to be */
+       if (((jmp[0] & 0xffff0000) != 0x3d820000) ||
+           ((jmp[1] & 0xffff0000) != 0x398c0000) ||
+           (jmp[2] != 0xf8410028) ||
+           (jmp[3] != 0xe96c0020) ||
+           (jmp[4] != 0xe84c0028)) {
+               printk(KERN_ERR "Not a trampoline\n");
+               return -EINVAL;
+       }
+
+       offset = (unsigned)((unsigned short)jmp[0]) << 16 |
+               (unsigned)((unsigned short)jmp[1]);
+
+       DEBUGP(" %x ", offset);
+
+       /* get the address this jumps too */
+       tramp = mod->arch.toc + offset + 32;
+       DEBUGP("toc: %lx", tramp);
+
+       if (probe_kernel_read(jmp, (void *)tramp, 8)) {
+               printk(KERN_ERR "Failed to read %lx\n", tramp);
+               return -EFAULT;
+       }
+
+       DEBUGP(" %08x %08x\n", jmp[0], jmp[1]);
+
+       ptr = ((unsigned long)jmp[0] << 32) + jmp[1];
+
+       /* This should match what was called */
+       if (ptr != GET_ADDR(addr)) {
+               printk(KERN_ERR "addr does not match %lx\n", ptr);
+               return -EINVAL;
+       }
+
+       /*
+        * We want to nop the line, but the next line is
+        *  0xe8, 0x41, 0x00, 0x28   ld r2,40(r1)
+        * This needs to be turned to a nop too.
+        */
+       if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE))
+               return -EFAULT;
+
+       if (op != 0xe8410028) {
+               printk(KERN_ERR "Next line is not ld! (%08x)\n", op);
+               return -EINVAL;
+       }
+
+       /*
+        * Milton Miller pointed out that we can not blindly do nops.
+        * If a task was preempted when calling a trace function,
+        * the nops will remove the way to restore the TOC in r2
+        * and the r2 TOC will get corrupted.
+        */
+
+       /*
+        * Replace:
+        *   bl <tramp>  <==== will be replaced with "b 1f"
+        *   ld r2,40(r1)
+        *  1:
+        */
+       op = 0x48000008;        /* b +8 */
+
+       if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+               return -EPERM;
+
+
+       flush_icache_range(ip, ip + 8);
+
+       return 0;
+}
+
+#else /* !PPC64 */
+static int
+__ftrace_make_nop(struct module *mod,
+                 struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned int op;
+       unsigned int jmp[4];
+       unsigned long ip = rec->ip;
+       unsigned long tramp;
+
+       if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
+               return -EFAULT;
+
+       /* Make sure that that this is still a 24bit jump */
+       if (!is_bl_op(op)) {
+               printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
+               return -EINVAL;
+       }
+
+       /* lets find where the pointer goes */
+       tramp = find_bl_target(ip, op);
+
+       /*
+        * On PPC32 the trampoline looks like:
+        *  0x3d, 0x60, 0x00, 0x00  lis r11,sym@ha
+        *  0x39, 0x6b, 0x00, 0x00  addi r11,r11,sym@l
+        *  0x7d, 0x69, 0x03, 0xa6  mtctr r11
+        *  0x4e, 0x80, 0x04, 0x20  bctr
+        */
+
+       DEBUGP("ip:%lx jumps to %lx", ip, tramp);
+
+       /* Find where the trampoline jumps to */
+       if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
+               printk(KERN_ERR "Failed to read %lx\n", tramp);
+               return -EFAULT;
+       }
+
+       DEBUGP(" %08x %08x ", jmp[0], jmp[1]);
+
+       /* verify that this is what we expect it to be */
+       if (((jmp[0] & 0xffff0000) != 0x3d600000) ||
+           ((jmp[1] & 0xffff0000) != 0x396b0000) ||
+           (jmp[2] != 0x7d6903a6) ||
+           (jmp[3] != 0x4e800420)) {
+               printk(KERN_ERR "Not a trampoline\n");
+               return -EINVAL;
+       }
+
+       tramp = (jmp[1] & 0xffff) |
+               ((jmp[0] & 0xffff) << 16);
+       if (tramp & 0x8000)
+               tramp -= 0x10000;
+
+       DEBUGP(" %x ", tramp);
+
+       if (tramp != addr) {
+               printk(KERN_ERR
+                      "Trampoline location %08lx does not match addr\n",
+                      tramp);
+               return -EINVAL;
+       }
+
+       op = PPC_NOP_INSTR;
+
+       if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+               return -EPERM;
+
+       flush_icache_range(ip, ip + 8);
+
+       return 0;
+}
+#endif /* PPC64 */
+
+int ftrace_make_nop(struct module *mod,
+                   struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned char *old, *new;
+       unsigned long ip = rec->ip;
+
+       /*
+        * If the calling address is more that 24 bits away,
+        * then we had to use a trampoline to make the call.
+        * Otherwise just update the call site.
+        */
+       if (test_24bit_addr(ip, addr)) {
+               /* within range */
+               old = ftrace_call_replace(ip, addr);
+               new = ftrace_nop_replace();
+               return ftrace_modify_code(ip, old, new);
+       }
+
+       /*
+        * Out of range jumps are called from modules.
+        * We should either already have a pointer to the module
+        * or it has been passed in.
+        */
+       if (!rec->arch.mod) {
+               if (!mod) {
+                       printk(KERN_ERR "No module loaded addr=%lx\n",
+                              addr);
+                       return -EFAULT;
+               }
+               rec->arch.mod = mod;
+       } else if (mod) {
+               if (mod != rec->arch.mod) {
+                       printk(KERN_ERR
+                              "Record mod %p not equal to passed in mod %p\n",
+                              rec->arch.mod, mod);
+                       return -EINVAL;
+               }
+               /* nothing to do if mod == rec->arch.mod */
+       } else
+               mod = rec->arch.mod;
+
+       return __ftrace_make_nop(mod, rec, addr);
+
+}
+
+#ifdef CONFIG_PPC64
+static int
+__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned int op[2];
+       unsigned long ip = rec->ip;
+
+       /* read where this goes */
+       if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2))
+               return -EFAULT;
+
+       /*
+        * It should be pointing to two nops or
+        *  b +8; ld r2,40(r1)
+        */
+       if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) &&
+           ((op[0] != PPC_NOP_INSTR) || (op[1] != PPC_NOP_INSTR))) {
+               printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]);
+               return -EINVAL;
+       }
+
+       /* If we never set up a trampoline to ftrace_caller, then bail */
+       if (!rec->arch.mod->arch.tramp) {
+               printk(KERN_ERR "No ftrace trampoline\n");
+               return -EINVAL;
+       }
+
+       /* create the branch to the trampoline */
+       op[0] = create_branch((unsigned int *)ip,
+                             rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
+       if (!op[0]) {
+               printk(KERN_ERR "REL24 out of range!\n");
+               return -EINVAL;
+       }
+
+       /* ld r2,40(r1) */
+       op[1] = 0xe8410028;
+
+       DEBUGP("write to %lx\n", rec->ip);
+
+       if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2))
+               return -EPERM;
+
+       flush_icache_range(ip, ip + 8);
+
+       return 0;
+}
+#else
+static int
+__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned int op;
+       unsigned long ip = rec->ip;
+
+       /* read where this goes */
+       if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
+               return -EFAULT;
+
+       /* It should be pointing to a nop */
+       if (op != PPC_NOP_INSTR) {
+               printk(KERN_ERR "Expected NOP but have %x\n", op);
+               return -EINVAL;
+       }
+
+       /* If we never set up a trampoline to ftrace_caller, then bail */
+       if (!rec->arch.mod->arch.tramp) {
+               printk(KERN_ERR "No ftrace trampoline\n");
+               return -EINVAL;
+       }
+
+       /* create the branch to the trampoline */
+       op = create_branch((unsigned int *)ip,
+                          rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
+       if (!op) {
+               printk(KERN_ERR "REL24 out of range!\n");
+               return -EINVAL;
+       }
+
+       DEBUGP("write to %lx\n", rec->ip);
+
+       if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
+               return -EPERM;
+
+       flush_icache_range(ip, ip + 8);
+
+       return 0;
+}
+#endif /* CONFIG_PPC64 */
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned char *old, *new;
+       unsigned long ip = rec->ip;
+
+       /*
+        * If the calling address is more that 24 bits away,
+        * then we had to use a trampoline to make the call.
+        * Otherwise just update the call site.
+        */
+       if (test_24bit_addr(ip, addr)) {
+               /* within range */
+               old = ftrace_nop_replace();
+               new = ftrace_call_replace(ip, addr);
+               return ftrace_modify_code(ip, old, new);
+       }
+
+       /*
+        * Out of range jumps are called from modules.
+        * Being that we are converting from nop, it had better
+        * already have a module defined.
+        */
+       if (!rec->arch.mod) {
+               printk(KERN_ERR "No module loaded\n");
+               return -EINVAL;
+       }
+
+       return __ftrace_make_call(rec, addr);
 }
 
 int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -128,10 +509,10 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
 
 int __init ftrace_dyn_arch_init(void *data)
 {
-       /* This is running in kstop_machine */
+       /* caller expects data to be zero */
+       unsigned long *p = data;
 
-       ftrace_mcount_set(data);
+       *p = 0;
 
        return 0;
 }
-
index 31982d05d81a8814d73dfc8b5a562cd90b66e5d3..88d9c1d5e5fb0aea1e74757ae9ac1133e9f352ef 100644 (file)
@@ -69,10 +69,15 @@ void cpu_idle(void)
                                smp_mb();
                                local_irq_disable();
 
+                               /* Don't trace irqs off for idle */
+                               stop_critical_timings();
+
                                /* check again after disabling irqs */
                                if (!need_resched() && !cpu_should_die())
                                        ppc_md.power_save();
 
+                               start_critical_timings();
+
                                local_irq_enable();
                                set_thread_flag(TIF_POLLING_NRFLAG);
 
index ac222d0ab12e86406bcbc6247adb3b8150ed49ee..23b8b5e36f98b9afb5cf970e20b4a77ab5785627 100644 (file)
@@ -237,7 +237,7 @@ void fixup_irqs(cpumask_t map)
                        mask = map;
                }
                if (irq_desc[irq].chip->set_affinity)
-                       irq_desc[irq].chip->set_affinity(irq, mask);
+                       irq_desc[irq].chip->set_affinity(irq, &mask);
                else if (irq_desc[irq].action && !(warned++))
                        printk("Cannot set affinity for irq %i\n", irq);
        }
index 2df91a03462a26109d49b9fc82b852df76c14b34..f832773fc28e940f82ed0dcecc6104d5e6e71939 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/ftrace.h>
 #include <linux/cache.h>
 #include <linux/bug.h>
 #include <linux/sort.h>
@@ -53,6 +54,9 @@ static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)
                        r_addend = rela[i].r_addend;
                }
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+       _count_relocs++;        /* add one for ftrace_caller */
+#endif
        return _count_relocs;
 }
 
@@ -306,5 +310,11 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
                        return -ENOEXEC;
                }
        }
+#ifdef CONFIG_DYNAMIC_FTRACE
+       module->arch.tramp =
+               do_plt_call(module->module_core,
+                           (unsigned long)ftrace_caller,
+                           sechdrs, module);
+#endif
        return 0;
 }
index 1af2377e49929dc367d49b4588fd2ba6d23d7e91..8992b031a7b6a6cc54433453b6b991d1610cb5d6 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/moduleloader.h>
 #include <linux/err.h>
 #include <linux/vmalloc.h>
+#include <linux/ftrace.h>
 #include <linux/bug.h>
 #include <asm/module.h>
 #include <asm/firmware.h>
@@ -163,6 +164,11 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
                }
        }
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+       /* make the trampoline to the ftrace_caller */
+       relocs++;
+#endif
+
        DEBUGP("Looks like a total of %lu stubs, max\n", relocs);
        return relocs * sizeof(struct ppc64_stub_entry);
 }
@@ -441,5 +447,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
                }
        }
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+       me->arch.toc = my_r2(sechdrs, me);
+       me->arch.tramp = stub_for_addr(sechdrs,
+                                      (unsigned long)ftrace_caller,
+                                      me);
+#endif
+
        return 0;
 }
index ff9f7010097d1568943329a15d8c04458beddc4c..d1165566f06487a6860e8d14e3586f7230d6b66d 100644 (file)
 int smp_hw_index[NR_CPUS];
 struct thread_info *secondary_ti;
 
-cpumask_t cpu_possible_map = CPU_MASK_NONE;
-cpumask_t cpu_online_map = CPU_MASK_NONE;
 DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
 DEFINE_PER_CPU(cpumask_t, cpu_core_map) = CPU_MASK_NONE;
 
-EXPORT_SYMBOL(cpu_online_map);
-EXPORT_SYMBOL(cpu_possible_map);
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 
index e2ee66b5831d9f86cf41704784863cd74bbc3c91..6f39d35d6f556f51fa8db03a3fd43a683f0da798 100644 (file)
@@ -869,7 +869,7 @@ static void register_decrementer_clockevent(int cpu)
        struct clock_event_device *dec = &per_cpu(decrementers, cpu).event;
 
        *dec = decrementer_clockevent;
-       dec->cpumask = cpumask_of_cpu(cpu);
+       dec->cpumask = cpumask_of(cpu);
 
        printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n",
               dec->name, dec->mult, dec->shift, cpu);
index d69912c07ce732c937412dac1286ba0223ded96c..8db35278a4b43643c2cf42b280b79ad9d33be7bc 100644 (file)
@@ -6,6 +6,9 @@ ifeq ($(CONFIG_PPC64),y)
 EXTRA_CFLAGS           += -mno-minimal-toc
 endif
 
+CFLAGS_REMOVE_code-patching.o = -pg
+CFLAGS_REMOVE_feature-fixups.o = -pg
+
 obj-y                  := string.o alloc.o \
                           checksum_$(CONFIG_WORD_SIZE).o
 obj-$(CONFIG_PPC32)    += div64.o copy_32.o crtsavres.o
index e1904774a70fecd21673f6e899d89251905f91d3..424b335a71c83e360b59bfdebd7939d62070b360 100644 (file)
@@ -332,7 +332,7 @@ static void xics_eoi_lpar(unsigned int virq)
        lpar_xirr_info_set((0xff << 24) | irq);
 }
 
-static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
+static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 {
        unsigned int irq;
        int status;
@@ -845,7 +845,7 @@ void xics_migrate_irqs_away(void)
 
                /* Reset affinity to all cpus */
                irq_desc[virq].affinity = CPU_MASK_ALL;
-               desc->chip->set_affinity(virq, CPU_MASK_ALL);
+               desc->chip->set_affinity(virq, cpu_all_mask);
 unlock:
                spin_unlock_irqrestore(&desc->lock, flags);
        }
index 1890fb085cded9bdcc66c61214a4058b579c98e6..5d7f9f0c93c323ba6955203496c61d7979fa7ff9 100644 (file)
@@ -817,7 +817,7 @@ static void mpic_end_ipi(unsigned int irq)
 
 #endif /* CONFIG_SMP */
 
-void mpic_set_affinity(unsigned int irq, cpumask_t cpumask)
+void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
        struct mpic *mpic = mpic_from_irq(irq);
        unsigned int src = mpic_irq_to_hw(irq);
@@ -829,7 +829,7 @@ void mpic_set_affinity(unsigned int irq, cpumask_t cpumask)
        } else {
                cpumask_t tmp;
 
-               cpus_and(tmp, cpumask, cpu_online_map);
+               cpumask_and(&tmp, cpumask, cpu_online_mask);
 
                mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION),
                               mpic_physmask(cpus_addr(tmp)[0]));
index 6209c62a426d1177cb574a1bf92f0dc6f7ed7b43..3cef2af10f4254daea1ff69769117ffb1f8a23b3 100644 (file)
@@ -36,6 +36,6 @@ static inline int mpic_pasemi_msi_init(struct mpic *mpic)
 
 extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type);
 extern void mpic_set_vector(unsigned int virq, unsigned int vector);
-extern void mpic_set_affinity(unsigned int irq, cpumask_t cpumask);
+extern void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 
 #endif /* _POWERPC_SYSDEV_MPIC_H */
index 8116a3328a19dd9b0cc79bbc3cd5b32a10e0234a..b4aa5869c7f9c5b9eef1ce41bd56d0b7eeb4677b 100644 (file)
@@ -75,6 +75,7 @@ config S390
        select HAVE_KRETPROBES
        select HAVE_KVM if 64BIT
        select HAVE_ARCH_TRACEHOOK
+       select INIT_ALL_POSSIBLE
 
 source "init/Kconfig"
 
index b5595688a47777d72eb803dd00fd8a15b35dee66..f03914b8ed2f85942416266fa15758a20bb0b5eb 100644 (file)
 struct _lowcore *lowcore_ptr[NR_CPUS];
 EXPORT_SYMBOL(lowcore_ptr);
 
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_online_map);
-
-cpumask_t cpu_possible_map = CPU_MASK_ALL;
-EXPORT_SYMBOL(cpu_possible_map);
-
 static struct task_struct *current_set[NR_CPUS];
 
 static u8 smp_cpu_type;
index eccefbbff887dc199b04ec715d25ec9c2ec1201c..f5bd141c84434b7a5376b134eeee75a684507575 100644 (file)
@@ -154,7 +154,7 @@ void init_cpu_timer(void)
        cd->min_delta_ns        = 1;
        cd->max_delta_ns        = LONG_MAX;
        cd->rating              = 400;
-       cd->cpumask             = cpumask_of_cpu(cpu);
+       cd->cpumask             = cpumask_of(cpu);
        cd->set_next_event      = s390_next_event;
        cd->set_mode            = s390_set_mode;
 
index a947899dcba1d55f56f8e00af05ab738293df6a3..bf96f1b5c6ec20b2539c8c2a1cf4ce46ee1e54ec 100644 (file)
@@ -212,7 +212,7 @@ static void update_cpu_core_map(void)
                cpu_core_map[cpu] = cpu_coregroup_map(cpu);
 }
 
-void arch_update_cpu_topology(void)
+int arch_update_cpu_topology(void)
 {
        struct tl_info *info = tl_info;
        struct sys_device *sysdev;
@@ -221,7 +221,7 @@ void arch_update_cpu_topology(void)
        if (!machine_has_topology) {
                update_cpu_core_map();
                topology_update_polarization_simple();
-               return;
+               return 0;
        }
        stsi(info, 15, 1, 2);
        tl_to_cores(info);
@@ -230,6 +230,7 @@ void arch_update_cpu_topology(void)
                sysdev = get_cpu_sysdev(cpu);
                kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
        }
+       return 1;
 }
 
 static void topology_work_fn(struct work_struct *work)
index 85b660c17eb070e230b7ce3e0dcac1c7f7ab27c8..c24e9c6a173661a307e2b2f90b9dc3ea961551ed 100644 (file)
@@ -31,7 +31,7 @@ enum {
 };
 
 void smp_message_recv(unsigned int msg);
-void smp_timer_broadcast(cpumask_t mask);
+void smp_timer_broadcast(const struct cpumask *mask);
 
 void local_timer_interrupt(void);
 void local_timer_setup(unsigned int cpu);
index 95f0085e098a6859d746581b3b4b616439062854..279d9cc4a007672a403dcfce9eb4d8fe5ab25114 100644 (file)
@@ -5,7 +5,6 @@
 
 /* sched_domains SD_NODE_INIT for sh machines */
 #define SD_NODE_INIT (struct sched_domain) {           \
-       .span                   = CPU_MASK_NONE,        \
        .parent                 = NULL,                 \
        .child                  = NULL,                 \
        .groups                 = NULL,                 \
index 3c5ad1660bbcdd6b0366934167b3dc879cdb026d..8f40274126142847f99e7c51f71e13a22813b2cf 100644 (file)
 int __cpu_number_map[NR_CPUS];         /* Map physical to logical */
 int __cpu_logical_map[NR_CPUS];                /* Map logical to physical */
 
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
-
-cpumask_t cpu_online_map;
-EXPORT_SYMBOL(cpu_online_map);
-
 static inline void __init smp_store_cpu_info(unsigned int cpu)
 {
        struct sh_cpuinfo *c = cpu_data + cpu;
@@ -190,11 +184,11 @@ void arch_send_call_function_single_ipi(int cpu)
        plat_send_ipi(cpu, SMP_MSG_FUNCTION_SINGLE);
 }
 
-void smp_timer_broadcast(cpumask_t mask)
+void smp_timer_broadcast(const struct cpumask *mask)
 {
        int cpu;
 
-       for_each_cpu_mask(cpu, mask)
+       for_each_cpu(cpu, mask)
                plat_send_ipi(cpu, SMP_MSG_TIMER);
 }
 
index c2317635230fda90f2b97ed2c7903b154a2ab175..96e8eaea1e62003603fd2343a09277022fb518a5 100644 (file)
@@ -51,7 +51,7 @@ void __cpuinit local_timer_setup(unsigned int cpu)
        clk->mult               = 1;
        clk->set_mode           = dummy_timer_set_mode;
        clk->broadcast          = smp_timer_broadcast;
-       clk->cpumask            = cpumask_of_cpu(cpu);
+       clk->cpumask            = cpumask_of(cpu);
 
        clockevents_register_device(clk);
 }
index 3c61ddd4d43e0459c3cd4355133a486f328bea50..0db3f95103368fa4c236e4666b38c828d25ac3a2 100644 (file)
@@ -263,7 +263,7 @@ static int tmu_timer_init(void)
        tmu0_clockevent.min_delta_ns =
                        clockevent_delta2ns(1, &tmu0_clockevent);
 
-       tmu0_clockevent.cpumask = cpumask_of_cpu(0);
+       tmu0_clockevent.cpumask = cpumask_of(0);
 
        clockevents_register_device(&tmu0_clockevent);
 
index a8180e546a48f87a6d5b9a7123d9ed91a3b51a56..8408d9d2a662264c0f71184c22fd62f4dc2adb92 100644 (file)
@@ -29,8 +29,6 @@
  */
 
 extern unsigned char boot_cpu_id;
-extern cpumask_t phys_cpu_present_map;
-#define cpu_possible_map phys_cpu_present_map
 
 typedef void (*smpfunc_t)(unsigned long, unsigned long, unsigned long,
                       unsigned long, unsigned long);
index e396c1f17a922deaef7b03a0751d06f1345770fd..1e5ac4e282e1285030aaa43380b91c01b7bcff48 100644 (file)
@@ -39,8 +39,6 @@ volatile unsigned long cpu_callin_map[NR_CPUS] __cpuinitdata = {0,};
 unsigned char boot_cpu_id = 0;
 unsigned char boot_cpu_id4 = 0; /* boot_cpu_id << 2 */
 
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
 cpumask_t smp_commenced_mask = CPU_MASK_NONE;
 
 /* The only guaranteed locking primitive available on all Sparc
@@ -334,7 +332,7 @@ void __init smp_setup_cpu_possible_map(void)
        instance = 0;
        while (!cpu_find_by_instance(instance, NULL, &mid)) {
                if (mid < NR_CPUS) {
-                       cpu_set(mid, phys_cpu_present_map);
+                       cpu_set(mid, cpu_possible_map);
                        cpu_set(mid, cpu_present_map);
                }
                instance++;
@@ -354,7 +352,7 @@ void __init smp_prepare_boot_cpu(void)
 
        current_thread_info()->cpu = cpuid;
        cpu_set(cpuid, cpu_online_map);
-       cpu_set(cpuid, phys_cpu_present_map);
+       cpu_set(cpuid, cpu_possible_map);
 }
 
 int __cpuinit __cpu_up(unsigned int cpu)
index b0dfff84865365a595e39283e480b1ccbd9eb850..32d11a5fe3a86f9e5f60acfefff09a2dd0d98c41 100644 (file)
@@ -113,10 +113,6 @@ EXPORT_PER_CPU_SYMBOL(__cpu_data);
 #ifdef CONFIG_SMP
 /* IRQ implementation. */
 EXPORT_SYMBOL(synchronize_irq);
-
-/* CPU online map and active count. */
-EXPORT_SYMBOL(cpu_online_map);
-EXPORT_SYMBOL(phys_cpu_present_map);
 #endif
 
 EXPORT_SYMBOL(__udelay);
index 52fc836f464d979655c330b43b15736fc6c840a2..4aaf18e83c8c24766c4a18ae5a3716f7a6249402 100644 (file)
@@ -312,7 +312,8 @@ static void sun4u_irq_enable(unsigned int virt_irq)
        }
 }
 
-static void sun4u_set_affinity(unsigned int virt_irq, cpumask_t mask)
+static void sun4u_set_affinity(unsigned int virt_irq,
+                              const struct cpumask *mask)
 {
        sun4u_irq_enable(virt_irq);
 }
@@ -362,7 +363,8 @@ static void sun4v_irq_enable(unsigned int virt_irq)
                       ino, err);
 }
 
-static void sun4v_set_affinity(unsigned int virt_irq, cpumask_t mask)
+static void sun4v_set_affinity(unsigned int virt_irq,
+                              const struct cpumask *mask)
 {
        unsigned int ino = virt_irq_table[virt_irq].dev_ino;
        unsigned long cpuid = irq_choose_cpu(virt_irq);
@@ -429,7 +431,8 @@ static void sun4v_virq_enable(unsigned int virt_irq)
                       dev_handle, dev_ino, err);
 }
 
-static void sun4v_virt_set_affinity(unsigned int virt_irq, cpumask_t mask)
+static void sun4v_virt_set_affinity(unsigned int virt_irq,
+                                   const struct cpumask *mask)
 {
        unsigned long cpuid, dev_handle, dev_ino;
        int err;
@@ -788,7 +791,7 @@ void fixup_irqs(void)
                    !(irq_desc[irq].status & IRQ_PER_CPU)) {
                        if (irq_desc[irq].chip->set_affinity)
                                irq_desc[irq].chip->set_affinity(irq,
-                                       irq_desc[irq].affinity);
+                                       &irq_desc[irq].affinity);
                }
                spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
        }
index 0f616ae3246c5a9bc4be334f52c2879fead932f2..df2efb7fc14c864b37fb0273adaadb7be4cf975e 100644 (file)
@@ -780,7 +780,7 @@ out:
        if (nid != -1) {
                cpumask_t numa_mask = node_to_cpumask(nid);
 
-               irq_set_affinity(irq, numa_mask);
+               irq_set_affinity(irq, &numa_mask);
        }
 
        return irq;
index 2e680f34f727fa61f5defef92e63b8144e365d70..0d0cd815e83e505b24e4b12a14f276fa37be892a 100644 (file)
@@ -288,7 +288,7 @@ static int bringup_one_msi_queue(struct pci_pbm_info *pbm,
        if (nid != -1) {
                cpumask_t numa_mask = node_to_cpumask(nid);
 
-               irq_set_affinity(irq, numa_mask);
+               irq_set_affinity(irq, &numa_mask);
        }
        err = request_irq(irq, sparc64_msiq_interrupt, 0,
                          "MSIQ",
index f500b0618bb0d3b4badcdee2c5c8cb20225c97e9..a97b8822c22ca029acc8ea1fe0fc6cc90c88efd0 100644 (file)
 
 int sparc64_multi_core __read_mostly;
 
-cpumask_t cpu_possible_map __read_mostly = CPU_MASK_NONE;
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE;
 DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
 cpumask_t cpu_core_map[NR_CPUS] __read_mostly =
        { [0 ... NR_CPUS-1] = CPU_MASK_NONE };
 
-EXPORT_SYMBOL(cpu_possible_map);
-EXPORT_SYMBOL(cpu_online_map);
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 EXPORT_SYMBOL(cpu_core_map);
 
index 141da375909129dea0ab0fffc9d5359d85e9a039..9df8f095a8b11a59e448bec001ac6aec852c480c 100644 (file)
@@ -763,7 +763,7 @@ void __devinit setup_sparc64_timer(void)
        sevt = &__get_cpu_var(sparc64_events);
 
        memcpy(sevt, &sparc64_clockevent, sizeof(*sevt));
-       sevt->cpumask = cpumask_of_cpu(smp_processor_id());
+       sevt->cpumask = cpumask_of(smp_processor_id());
 
        clockevents_register_device(sevt);
 }
index 753346e2cdfd64d5d472f3388014952f5d0ca4ff..ae5f94d6317d584c051fbcea83533431e4b635d7 100644 (file)
@@ -11,21 +11,21 @@ extern int get_signals(void);
 extern void block_signals(void);
 extern void unblock_signals(void);
 
-#define local_save_flags(flags) do { typecheck(unsigned long, flags); \
+#define raw_local_save_flags(flags) do { typecheck(unsigned long, flags); \
                                     (flags) = get_signals(); } while(0)
-#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \
+#define raw_local_irq_restore(flags) do { typecheck(unsigned long, flags); \
                                      set_signals(flags); } while(0)
 
-#define local_irq_save(flags) do { local_save_flags(flags); \
-                                   local_irq_disable(); } while(0)
+#define raw_local_irq_save(flags) do { raw_local_save_flags(flags); \
+                                   raw_local_irq_disable(); } while(0)
 
-#define local_irq_enable() unblock_signals()
-#define local_irq_disable() block_signals()
+#define raw_local_irq_enable() unblock_signals()
+#define raw_local_irq_disable() block_signals()
 
 #define irqs_disabled()                 \
 ({                                      \
         unsigned long flags;            \
-        local_save_flags(flags);        \
+        raw_local_save_flags(flags);        \
         (flags == 0);                   \
 })
 
index 045772142844690f2471d9330b836da594c945f8..98351c78bc814de7062dc5e9c213810ee4811de5 100644 (file)
@@ -25,13 +25,6 @@ DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 #include "irq_user.h"
 #include "os.h"
 
-/* CPU online map, set by smp_boot_cpus */
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-cpumask_t cpu_possible_map = CPU_MASK_NONE;
-
-EXPORT_SYMBOL(cpu_online_map);
-EXPORT_SYMBOL(cpu_possible_map);
-
 /* Per CPU bogomips and other parameters
  * The only piece used here is the ipi pipe, which is set before SMP is
  * started and never changed.
index 47f04f4a3464a918850b67573e1a67a7cf5c3dec..b13a87a3ec95cf7754fe0216660a2695e40b4d1d 100644 (file)
@@ -50,7 +50,7 @@ static int itimer_next_event(unsigned long delta,
 static struct clock_event_device itimer_clockevent = {
        .name           = "itimer",
        .rating         = 250,
-       .cpumask        = CPU_MASK_ALL,
+       .cpumask        = cpu_all_mask,
        .features       = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
        .set_mode       = itimer_set_mode,
        .set_next_event = itimer_next_event,
index ac22bb7719f730e6b8d12b305ec9e08952a513a4..0ca2eb7573cd21a2d7298371345d2c32bbc0b0b4 100644 (file)
@@ -29,11 +29,14 @@ config X86
        select HAVE_FTRACE_MCOUNT_RECORD
        select HAVE_DYNAMIC_FTRACE
        select HAVE_FUNCTION_TRACER
+       select HAVE_FUNCTION_GRAPH_TRACER
+       select HAVE_FUNCTION_TRACE_MCOUNT_TEST
        select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
        select HAVE_ARCH_KGDB if !X86_VOYAGER
        select HAVE_ARCH_TRACEHOOK
        select HAVE_GENERIC_DMA_COHERENT if X86_32
        select HAVE_EFFICIENT_UNALIGNED_ACCESS
+       select USER_STACKTRACE_SUPPORT
 
 config ARCH_DEFCONFIG
        string
@@ -238,6 +241,25 @@ config X86_HAS_BOOT_CPU_ID
        def_bool y
        depends on X86_VOYAGER
 
+config SPARSE_IRQ
+       bool "Support sparse irq numbering"
+       depends on PCI_MSI || HT_IRQ
+       default y
+       help
+         This enables support for sparse irq, esp for msi/msi-x. You may need
+         if you have lots of cards supports msi-x installed.
+
+         If you don't know what to do here, say Y.
+
+config NUMA_MIGRATE_IRQ_DESC
+       bool "Move irq desc when changing irq smp_affinity"
+       depends on SPARSE_IRQ && SMP
+       default n
+       help
+         This enables moving irq_desc to cpu/node that irq will use handled.
+
+         If you don't know what to do here, say N.
+
 config X86_FIND_SMP_CONFIG
        def_bool y
        depends on X86_MPPARSE || X86_VOYAGER
@@ -367,10 +389,10 @@ config X86_RDC321X
          as R-8610-(G).
          If you don't have one of these chips, you should say N here.
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
        def_bool y
        prompt "Single-depth WCHAN output"
-       depends on X86_32
+       depends on X86
        help
          Calculate simpler /proc/<PID>/wchan values. If this option
          is disabled then wchan values will recurse back to the
@@ -465,10 +487,6 @@ config X86_CYCLONE_TIMER
        def_bool y
        depends on X86_GENERICARCH
 
-config ES7000_CLUSTERED_APIC
-       def_bool y
-       depends on SMP && X86_ES7000 && MPENTIUMIII
-
 source "arch/x86/Kconfig.cpu"
 
 config HPET_TIMER
@@ -582,19 +600,20 @@ config IOMMU_HELPER
 
 config MAXSMP
        bool "Configure Maximum number of SMP Processors and NUMA Nodes"
-       depends on X86_64 && SMP && BROKEN
+       depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
+       select CPUMASK_OFFSTACK
        default n
        help
          Configure maximum number of CPUS and NUMA Nodes for this architecture.
          If unsure, say N.
 
 config NR_CPUS
-       int "Maximum number of CPUs (2-512)" if !MAXSMP
-       range 2 512
-       depends on SMP
+       int "Maximum number of CPUs" if SMP && !MAXSMP
+       range 2 512 if SMP && !MAXSMP
+       default "1" if !SMP
        default "4096" if MAXSMP
-       default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
-       default "8"
+       default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
+       default "8" if SMP
        help
          This allows you to specify the maximum number of CPUs which this
          kernel will support.  The maximum supported value is 512 and the
@@ -1632,13 +1651,6 @@ config APM_ALLOW_INTS
          many of the newer IBM Thinkpads.  If you experience hangs when you
          suspend, try setting this to Y.  Otherwise, say N.
 
-config APM_REAL_MODE_POWER_OFF
-       bool "Use real mode APM BIOS call to power off"
-       help
-         Use real mode APM BIOS calls to switch off the computer. This is
-         a work-around for a number of buggy BIOSes. Switch this option on if
-         your computer crashes instead of powering off properly.
-
 endif # APM
 
 source "arch/x86/kernel/cpu/cpufreq/Kconfig"
index b815664fe3700b77aa031fab7711be4128c858d1..85a78575956cb0bdf473c4694f29eb7d497a154e 100644 (file)
@@ -515,6 +515,7 @@ config CPU_SUP_UMC_32
 config X86_DS
        def_bool X86_PTRACE_BTS
        depends on X86_DEBUGCTLMSR
+       select HAVE_HW_BRANCH_TRACER
 
 config X86_PTRACE_BTS
        bool "Branch Trace Store"
index 2a3dfbd5e677b548e5e75f43da69e934b90a7eff..fa013f529b746564fd04695bd4a12fe65c401dca 100644 (file)
@@ -186,14 +186,10 @@ config IOMMU_LEAK
          Add a simple leak tracer to the IOMMU code. This is useful when you
          are debugging a buggy device driver that leaks IOMMU mappings.
 
-config MMIOTRACE_HOOKS
-       bool
-
 config MMIOTRACE
        bool "Memory mapped IO tracing"
        depends on DEBUG_KERNEL && PCI
        select TRACING
-       select MMIOTRACE_HOOKS
        help
          Mmiotrace traces Memory Mapped I/O access and is meant for
          debugging and reverse engineering. It is called from the ioremap
index 3b1510b4fc5741290d5660bb4c30f34bc70f6371..25caa0738af5f9d99399d3c2f36405bc62d7115b 100644 (file)
@@ -193,6 +193,7 @@ extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
 static inline void lapic_shutdown(void) { }
 #define local_apic_timer_c2_ok         1
 static inline void init_apic_mappings(void) { }
+static inline void disable_local_APIC(void) { }
 
 #endif /* !CONFIG_X86_LOCAL_APIC */
 
index 1d9543b9d35824c259a1110005866be3d1f54a4e..976399debb3f1511a6b2c8faedf26c491fc97ce1 100644 (file)
@@ -9,12 +9,12 @@ static inline int apic_id_registered(void)
        return (1);
 }
 
-static inline cpumask_t target_cpus(void)
+static inline const cpumask_t *target_cpus(void)
 {
 #ifdef CONFIG_SMP
-        return cpu_online_map;
+       return &cpu_online_map;
 #else
-        return cpumask_of_cpu(0);
+       return &cpumask_of_cpu(0);
 #endif
 }
 
@@ -24,8 +24,6 @@ static inline cpumask_t target_cpus(void)
 #define INT_DELIVERY_MODE      (dest_Fixed)
 #define INT_DEST_MODE          (0)    /* phys delivery to target proc */
 #define NO_BALANCE_IRQ         (0)
-#define WAKE_SECONDARY_VIA_INIT
-
 
 static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
 {
@@ -81,7 +79,7 @@ static inline int apicid_to_node(int logical_apicid)
 
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
-       if (mps_cpu < NR_CPUS)
+       if (mps_cpu < nr_cpu_ids)
                return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
 
        return BAD_APICID;
@@ -96,7 +94,7 @@ extern u8 cpu_2_logical_apicid[];
 /* Mapping from cpu number to logical apicid */
 static inline int cpu_to_logical_apicid(int cpu)
 {
-       if (cpu >= NR_CPUS)
+       if (cpu >= nr_cpu_ids)
                return BAD_APICID;
        return cpu_physical_id(cpu);
 }
@@ -121,16 +119,32 @@ static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
 }
 
 /* As we are using single CPU as destination, pick only one CPU here */
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 {
        int cpu;
        int apicid;     
 
-       cpu = first_cpu(cpumask);
+       cpu = first_cpu(*cpumask);
        apicid = cpu_to_logical_apicid(cpu);
        return apicid;
 }
 
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                 const struct cpumask *andmask)
+{
+       int cpu;
+
+       /*
+        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * May as well be the first.
+        */
+       cpu = cpumask_any_and(cpumask, andmask);
+       if (cpu < nr_cpu_ids)
+               return cpu_to_logical_apicid(cpu);
+
+       return BAD_APICID;
+}
+
 static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
 {
        return cpuid_apic >> index_msb;
index 9404c535b7ecbbf6f496d0cd80f4eb266a529208..27fcd01b3ae6054ddf5c1b5c4145b4f9d913576e 100644 (file)
@@ -1,25 +1,22 @@
 #ifndef __ASM_MACH_IPI_H
 #define __ASM_MACH_IPI_H
 
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
 
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const struct cpumask *mask, int vector)
 {
        send_IPI_mask_sequence(mask, vector);
 }
 
 static inline void send_IPI_allbutself(int vector)
 {
-       cpumask_t mask = cpu_online_map;
-       cpu_clear(smp_processor_id(), mask);
-
-       if (!cpus_empty(mask))
-               send_IPI_mask(mask, vector);
+       send_IPI_mask_allbutself(cpu_online_mask, vector);
 }
 
 static inline void send_IPI_all(int vector)
 {
-       send_IPI_mask(cpu_online_map, vector);
+       send_IPI_mask(cpu_online_mask, vector);
 }
 
 #endif /* __ASM_MACH_IPI_H */
index a95008457ea430ffbe7fdde3fc6a5fd57778ce79..99b6c39774a44179fcf3554a096449962167d490 100644 (file)
@@ -7,13 +7,12 @@
  *
  * It manages:
  * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - buffer overflow handling (to be done)
  * - buffer access
  *
  * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * - get_task_struct on all traced tasks
+ * - current is allowed to trace tasks
  *
  *
  * Copyright (C) 2007-2008 Intel Corporation.
 
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/err.h>
 
 
 #ifdef CONFIG_X86_DS
 
 struct task_struct;
+struct ds_tracer;
+struct bts_tracer;
+struct pebs_tracer;
+
+typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
+typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
 
 /*
  * Request BTS or PEBS
@@ -38,60 +44,62 @@ struct task_struct;
  * Due to alignement constraints, the actual buffer may be slightly
  * smaller than the requested or provided buffer.
  *
- * Returns 0 on success; -Eerrno otherwise
+ * Returns a pointer to a tracer structure on success, or
+ * ERR_PTR(errcode) on failure.
+ *
+ * The interrupt threshold is independent from the overflow callback
+ * to allow users to use their own overflow interrupt handling mechanism.
  *
  * task: the task to request recording for;
  *       NULL for per-cpu recording on the current cpu
  * base: the base pointer for the (non-pageable) buffer;
- *       NULL if buffer allocation requested
- * size: the size of the requested or provided buffer
+ * size: the size of the provided buffer in bytes
  * ovfl: pointer to a function to be called on buffer overflow;
  *       NULL if cyclic buffer requested
+ * th: the interrupt threshold in records from the end of the buffer;
+ *     -1 if no interrupt threshold is requested.
  */
-typedef void (*ds_ovfl_callback_t)(struct task_struct *);
-extern int ds_request_bts(struct task_struct *task, void *base, size_t size,
-                         ds_ovfl_callback_t ovfl);
-extern int ds_request_pebs(struct task_struct *task, void *base, size_t size,
-                          ds_ovfl_callback_t ovfl);
+extern struct bts_tracer *ds_request_bts(struct task_struct *task,
+                                        void *base, size_t size,
+                                        bts_ovfl_callback_t ovfl, size_t th);
+extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+                                          void *base, size_t size,
+                                          pebs_ovfl_callback_t ovfl,
+                                          size_t th);
 
 /*
  * Release BTS or PEBS resources
  *
- * Frees buffers allocated on ds_request.
- *
  * Returns 0 on success; -Eerrno otherwise
  *
- * task: the task to release resources for;
- *       NULL to release resources for the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  */
-extern int ds_release_bts(struct task_struct *task);
-extern int ds_release_pebs(struct task_struct *task);
+extern int ds_release_bts(struct bts_tracer *tracer);
+extern int ds_release_pebs(struct pebs_tracer *tracer);
 
 /*
- * Return the (array) index of the write pointer.
+ * Get the (array) index of the write pointer.
  * (assuming an array of BTS/PEBS records)
  *
- * Returns -Eerrno on error
+ * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
- * pos (out): if not NULL, will hold the result
+ * tracer: the tracer handle returned from ds_request_~()
+ * pos (out): will hold the result
  */
-extern int ds_get_bts_index(struct task_struct *task, size_t *pos);
-extern int ds_get_pebs_index(struct task_struct *task, size_t *pos);
+extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos);
+extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos);
 
 /*
- * Return the (array) index one record beyond the end of the array.
+ * Get the (array) index one record beyond the end of the array.
  * (assuming an array of BTS/PEBS records)
  *
- * Returns -Eerrno on error
+ * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
- * pos (out): if not NULL, will hold the result
+ * tracer: the tracer handle returned from ds_request_~()
+ * pos (out): will hold the result
  */
-extern int ds_get_bts_end(struct task_struct *task, size_t *pos);
-extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);
+extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos);
+extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos);
 
 /*
  * Provide a pointer to the BTS/PEBS record at parameter index.
@@ -102,14 +110,13 @@ extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);
  *
  * Returns the size of a single record on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  * index: the index of the requested record
  * record (out): pointer to the requested record
  */
-extern int ds_access_bts(struct task_struct *task,
+extern int ds_access_bts(struct bts_tracer *tracer,
                         size_t index, const void **record);
-extern int ds_access_pebs(struct task_struct *task,
+extern int ds_access_pebs(struct pebs_tracer *tracer,
                          size_t index, const void **record);
 
 /*
@@ -129,38 +136,24 @@ extern int ds_access_pebs(struct task_struct *task,
  *
  * Returns the number of bytes written or -Eerrno.
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  * buffer: the buffer to write
  * size: the size of the buffer
  */
-extern int ds_write_bts(struct task_struct *task,
+extern int ds_write_bts(struct bts_tracer *tracer,
                        const void *buffer, size_t size);
-extern int ds_write_pebs(struct task_struct *task,
+extern int ds_write_pebs(struct pebs_tracer *tracer,
                         const void *buffer, size_t size);
 
-/*
- * Same as ds_write_bts/pebs, but omit ownership checks.
- *
- * This is needed to have some other task than the owner of the
- * BTS/PEBS buffer or the parameter task itself write into the
- * respective buffer.
- */
-extern int ds_unchecked_write_bts(struct task_struct *task,
-                                 const void *buffer, size_t size);
-extern int ds_unchecked_write_pebs(struct task_struct *task,
-                                  const void *buffer, size_t size);
-
 /*
  * Reset the write pointer of the BTS/PEBS buffer.
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  */
-extern int ds_reset_bts(struct task_struct *task);
-extern int ds_reset_pebs(struct task_struct *task);
+extern int ds_reset_bts(struct bts_tracer *tracer);
+extern int ds_reset_pebs(struct pebs_tracer *tracer);
 
 /*
  * Clear the BTS/PEBS buffer and reset the write pointer.
@@ -168,33 +161,30 @@ extern int ds_reset_pebs(struct task_struct *task);
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_~()
  */
-extern int ds_clear_bts(struct task_struct *task);
-extern int ds_clear_pebs(struct task_struct *task);
+extern int ds_clear_bts(struct bts_tracer *tracer);
+extern int ds_clear_pebs(struct pebs_tracer *tracer);
 
 /*
  * Provide the PEBS counter reset value.
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_pebs()
  * value (out): the counter reset value
  */
-extern int ds_get_pebs_reset(struct task_struct *task, u64 *value);
+extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value);
 
 /*
  * Set the PEBS counter reset value.
  *
  * Returns 0 on success; -Eerrno on error
  *
- * task: the task to access;
- *       NULL to access the current cpu
+ * tracer: the tracer handle returned from ds_request_pebs()
  * value: the new counter reset value
  */
-extern int ds_set_pebs_reset(struct task_struct *task, u64 value);
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
 
 /*
  * Initialization
@@ -207,17 +197,13 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
 /*
  * The DS context - part of struct thread_struct.
  */
+#define MAX_SIZEOF_DS (12 * 8)
+
 struct ds_context {
        /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
-       unsigned char *ds;
+       unsigned char ds[MAX_SIZEOF_DS];
        /* the owner of the BTS and PEBS configuration, respectively */
-       struct task_struct *owner[2];
-       /* buffer overflow notification function for BTS and PEBS */
-       ds_ovfl_callback_t callback[2];
-       /* the original buffer address */
-       void *buffer[2];
-       /* the number of allocated pages for on-request allocated buffers */
-       unsigned int pages[2];
+       struct ds_tracer  *owner[2];
        /* use count */
        unsigned long count;
        /* a pointer to the context location inside the thread_struct
index 94826cf874557acedd7afe248daa588a0c6383dd..cc70c1c78ca47545bb7b927f878d3cb02275f142 100644 (file)
@@ -8,7 +8,9 @@ enum reboot_type {
        BOOT_BIOS = 'b',
 #endif
        BOOT_ACPI = 'a',
-       BOOT_EFI = 'e'
+       BOOT_EFI = 'e',
+       BOOT_CF9 = 'p',
+       BOOT_CF9_COND = 'q',
 };
 
 extern enum reboot_type reboot_type;
index 380f0b4f17edfcf5c1e1440baed7231e07cdb684..ba8423c5363faf5231ea1b01cc7757186b2bac18 100644 (file)
@@ -9,31 +9,27 @@ static inline int apic_id_registered(void)
                return (1);
 }
 
-static inline cpumask_t target_cpus(void)
+static inline const cpumask_t *target_cpus_cluster(void)
 {
-#if defined CONFIG_ES7000_CLUSTERED_APIC
-       return CPU_MASK_ALL;
-#else
-       return cpumask_of_cpu(smp_processor_id());
-#endif
+       return &CPU_MASK_ALL;
 }
 
-#if defined CONFIG_ES7000_CLUSTERED_APIC
-#define APIC_DFR_VALUE         (APIC_DFR_CLUSTER)
-#define INT_DELIVERY_MODE      (dest_LowestPrio)
-#define INT_DEST_MODE          (1)    /* logical delivery broadcast to all procs */
-#define NO_BALANCE_IRQ         (1)
-#undef  WAKE_SECONDARY_VIA_INIT
-#define WAKE_SECONDARY_VIA_MIP
-#else
+static inline const cpumask_t *target_cpus(void)
+{
+       return &cpumask_of_cpu(smp_processor_id());
+}
+
+#define APIC_DFR_VALUE_CLUSTER         (APIC_DFR_CLUSTER)
+#define INT_DELIVERY_MODE_CLUSTER      (dest_LowestPrio)
+#define INT_DEST_MODE_CLUSTER          (1) /* logical delivery broadcast to all procs */
+#define NO_BALANCE_IRQ_CLUSTER         (1)
+
 #define APIC_DFR_VALUE         (APIC_DFR_FLAT)
 #define INT_DELIVERY_MODE      (dest_Fixed)
 #define INT_DEST_MODE          (0)    /* phys delivery to target procs */
 #define NO_BALANCE_IRQ         (0)
 #undef  APIC_DEST_LOGICAL
 #define APIC_DEST_LOGICAL      0x0
-#define WAKE_SECONDARY_VIA_INIT
-#endif
 
 static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
 {
@@ -60,6 +56,16 @@ static inline unsigned long calculate_ldr(int cpu)
  * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
  * document number 292116).  So here it goes...
  */
+static inline void init_apic_ldr_cluster(void)
+{
+       unsigned long val;
+       int cpu = smp_processor_id();
+
+       apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER);
+       val = calculate_ldr(cpu);
+       apic_write(APIC_LDR, val);
+}
+
 static inline void init_apic_ldr(void)
 {
        unsigned long val;
@@ -70,17 +76,14 @@ static inline void init_apic_ldr(void)
        apic_write(APIC_LDR, val);
 }
 
-#ifndef CONFIG_X86_GENERICARCH
-extern void enable_apic_mode(void);
-#endif
-
 extern int apic_version [MAX_APICS];
 static inline void setup_apic_routing(void)
 {
        int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
-       printk("Enabling APIC mode:  %s.  Using %d I/O APICs, target cpus %lx\n",
+       printk("Enabling APIC mode:  %s. Using %d I/O APICs, target cpus %lx\n",
                (apic_version[apic] == 0x14) ?
-               "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]);
+                       "Physical Cluster" : "Logical Cluster",
+                       nr_ioapics, cpus_addr(*target_cpus())[0]);
 }
 
 static inline int multi_timer_check(int apic, int irq)
@@ -98,7 +101,7 @@ static inline int cpu_present_to_apicid(int mps_cpu)
 {
        if (!mps_cpu)
                return boot_cpu_physical_apicid;
-       else if (mps_cpu < NR_CPUS)
+       else if (mps_cpu < nr_cpu_ids)
                return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
        else
                return BAD_APICID;
@@ -118,9 +121,9 @@ extern u8 cpu_2_logical_apicid[];
 static inline int cpu_to_logical_apicid(int cpu)
 {
 #ifdef CONFIG_SMP
-       if (cpu >= NR_CPUS)
-              return BAD_APICID;
-       return (int)cpu_2_logical_apicid[cpu];
+       if (cpu >= nr_cpu_ids)
+               return BAD_APICID;
+       return (int)cpu_2_logical_apicid[cpu];
 #else
        return logical_smp_processor_id();
 #endif
@@ -144,16 +147,87 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid)
        return (1);
 }
 
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int
+cpu_mask_to_apicid_cluster(const struct cpumask *cpumask)
+{
+       int num_bits_set;
+       int cpus_found = 0;
+       int cpu;
+       int apicid;
+
+       num_bits_set = cpumask_weight(cpumask);
+       /* Return id to all */
+       if (num_bits_set == NR_CPUS)
+               return 0xFF;
+       /*
+        * The cpus in the mask must all be on the apic cluster.  If are not
+        * on the same apicid cluster return default value of TARGET_CPUS.
+        */
+       cpu = cpumask_first(cpumask);
+       apicid = cpu_to_logical_apicid(cpu);
+       while (cpus_found < num_bits_set) {
+               if (cpumask_test_cpu(cpu, cpumask)) {
+                       int new_apicid = cpu_to_logical_apicid(cpu);
+                       if (apicid_cluster(apicid) !=
+                                       apicid_cluster(new_apicid)){
+                               printk ("%s: Not a valid mask!\n", __func__);
+                               return 0xFF;
+                       }
+                       apicid = new_apicid;
+                       cpus_found++;
+               }
+               cpu++;
+       }
+       return apicid;
+}
+
+static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 {
        int num_bits_set;
        int cpus_found = 0;
        int cpu;
        int apicid;
 
-       num_bits_set = cpus_weight(cpumask);
+       num_bits_set = cpus_weight(*cpumask);
        /* Return id to all */
        if (num_bits_set == NR_CPUS)
+               return cpu_to_logical_apicid(0);
+       /*
+        * The cpus in the mask must all be on the apic cluster.  If are not
+        * on the same apicid cluster return default value of TARGET_CPUS.
+        */
+       cpu = first_cpu(*cpumask);
+       apicid = cpu_to_logical_apicid(cpu);
+       while (cpus_found < num_bits_set) {
+               if (cpu_isset(cpu, *cpumask)) {
+                       int new_apicid = cpu_to_logical_apicid(cpu);
+                       if (apicid_cluster(apicid) !=
+                                       apicid_cluster(new_apicid)){
+                               printk ("%s: Not a valid mask!\n", __func__);
+                               return cpu_to_logical_apicid(0);
+                       }
+                       apicid = new_apicid;
+                       cpus_found++;
+               }
+               cpu++;
+       }
+       return apicid;
+}
+
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                 const struct cpumask *andmask)
+{
+       int num_bits_set;
+       int num_bits_set2;
+       int cpus_found = 0;
+       int cpu;
+       int apicid = 0;
+
+       num_bits_set = cpumask_weight(cpumask);
+       num_bits_set2 = cpumask_weight(andmask);
+       num_bits_set = min(num_bits_set, num_bits_set2);
+       /* Return id to all */
+       if (num_bits_set >= nr_cpu_ids)
 #if defined CONFIG_ES7000_CLUSTERED_APIC
                return 0xFF;
 #else
@@ -163,14 +237,17 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
         * The cpus in the mask must all be on the apic cluster.  If are not
         * on the same apicid cluster return default value of TARGET_CPUS.
         */
-       cpu = first_cpu(cpumask);
+       cpu = cpumask_first_and(cpumask, andmask);
        apicid = cpu_to_logical_apicid(cpu);
+
        while (cpus_found < num_bits_set) {
-               if (cpu_isset(cpu, cpumask)) {
+               if (cpumask_test_cpu(cpu, cpumask) &&
+                   cpumask_test_cpu(cpu, andmask)) {
                        int new_apicid = cpu_to_logical_apicid(cpu);
                        if (apicid_cluster(apicid) !=
-                                       apicid_cluster(new_apicid)){
-                               printk ("%s: Not a valid mask!\n", __func__);
+                                       apicid_cluster(new_apicid)) {
+                               printk(KERN_WARNING
+                                       "%s: Not a valid mask!\n", __func__);
 #if defined CONFIG_ES7000_CLUSTERED_APIC
                                return 0xFF;
 #else
index 632a955fcc0a937b870f4be05322943928ccc754..7e8ed24d4b8a9895e824c0de34982e8844521827 100644 (file)
@@ -1,24 +1,22 @@
 #ifndef __ASM_ES7000_IPI_H
 #define __ASM_ES7000_IPI_H
 
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
 
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const struct cpumask *mask, int vector)
 {
        send_IPI_mask_sequence(mask, vector);
 }
 
 static inline void send_IPI_allbutself(int vector)
 {
-       cpumask_t mask = cpu_online_map;
-       cpu_clear(smp_processor_id(), mask);
-       if (!cpus_empty(mask))
-               send_IPI_mask(mask, vector);
+       send_IPI_mask_allbutself(cpu_online_mask, vector);
 }
 
 static inline void send_IPI_all(int vector)
 {
-       send_IPI_mask(cpu_online_map, vector);
+       send_IPI_mask(cpu_online_mask, vector);
 }
 
 #endif /* __ASM_ES7000_IPI_H */
index 3984934619136119dfd7d95045d5930ab415e917..78f0daaee436333d9162013c62324dfe66bb1b9b 100644 (file)
@@ -1,36 +1,12 @@
 #ifndef __ASM_ES7000_WAKECPU_H
 #define __ASM_ES7000_WAKECPU_H
 
-/*
- * This file copes with machines that wakeup secondary CPUs by the
- * INIT, INIT, STARTUP sequence.
- */
-
-#ifdef CONFIG_ES7000_CLUSTERED_APIC
-#define WAKE_SECONDARY_VIA_MIP
-#else
-#define WAKE_SECONDARY_VIA_INIT
-#endif
-
-#ifdef WAKE_SECONDARY_VIA_MIP
-extern int es7000_start_cpu(int cpu, unsigned long eip);
-static inline int
-wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
-{
-       int boot_error = 0;
-       boot_error = es7000_start_cpu(phys_apicid, start_eip);
-       return boot_error;
-}
-#endif
-
-#define TRAMPOLINE_LOW phys_to_virt(0x467)
-#define TRAMPOLINE_HIGH phys_to_virt(0x469)
-
-#define boot_cpu_apicid boot_cpu_physical_apicid
+#define TRAMPOLINE_PHYS_LOW    0x467
+#define TRAMPOLINE_PHYS_HIGH   0x469
 
 static inline void wait_for_init_deassert(atomic_t *deassert)
 {
-#ifdef WAKE_SECONDARY_VIA_INIT
+#ifndef CONFIG_ES7000_CLUSTERED_APIC
        while (!atomic_read(deassert))
                cpu_relax();
 #endif
@@ -50,9 +26,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
 {
 }
 
-#define inquire_remote_apic(apicid) do {               \
-               if (apic_verbosity >= APIC_DEBUG)       \
-                       __inquire_remote_apic(apicid);  \
-       } while (0)
+extern void __inquire_remote_apic(int apicid);
+
+static inline void inquire_remote_apic(int apicid)
+{
+       if (apic_verbosity >= APIC_DEBUG)
+               __inquire_remote_apic(apicid);
+}
 
 #endif /* __ASM_MACH_WAKECPU_H */
index 9e8bc29b8b17dd3739d7479af6920c3c627130cb..7e61b4ceb9a4c144f85a1eae55228e355ae390ae 100644 (file)
@@ -17,8 +17,40 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
         */
        return addr - 1;
 }
-#endif
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+struct dyn_arch_ftrace {
+       /* No extra data needed for x86 */
+};
+
+#endif /*  CONFIG_DYNAMIC_FTRACE */
+#endif /* __ASSEMBLY__ */
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifndef __ASSEMBLY__
+
+/*
+ * Stack of return addresses for functions
+ * of a thread.
+ * Used in struct thread_info
+ */
+struct ftrace_ret_stack {
+       unsigned long ret;
+       unsigned long func;
+       unsigned long long calltime;
+};
+
+/*
+ * Primary handler of a function return.
+ * It relays on ftrace_return_to_handler.
+ * Defined in entry32.S
+ */
+extern void return_to_handler(void);
+
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 #endif /* _ASM_X86_FTRACE_H */
index 5cbd4fcc06fd20425ce21ead1386c47d92d0b152..746f37a7963adf9b9a7d91e197c1e0c50ce7b68a 100644 (file)
@@ -2,6 +2,7 @@
 #define _ASM_X86_GENAPIC_32_H
 
 #include <asm/mpspec.h>
+#include <asm/atomic.h>
 
 /*
  * Generic APIC driver interface.
@@ -23,7 +24,7 @@ struct genapic {
        int (*probe)(void);
 
        int (*apic_id_registered)(void);
-       cpumask_t (*target_cpus)(void);
+       const struct cpumask *(*target_cpus)(void);
        int int_delivery_mode;
        int int_dest_mode;
        int ESR_DISABLE;
@@ -56,15 +57,27 @@ struct genapic {
 
        unsigned (*get_apic_id)(unsigned long x);
        unsigned long apic_id_mask;
-       unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
-       cpumask_t (*vector_allocation_domain)(int cpu);
+       unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
+       unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
+                                              const struct cpumask *andmask);
+       void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
 
 #ifdef CONFIG_SMP
        /* ipi */
-       void (*send_IPI_mask)(cpumask_t mask, int vector);
+       void (*send_IPI_mask)(const struct cpumask *mask, int vector);
+       void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
+                                        int vector);
        void (*send_IPI_allbutself)(int vector);
        void (*send_IPI_all)(int vector);
 #endif
+       int (*wakeup_cpu)(int apicid, unsigned long start_eip);
+       int trampoline_phys_low;
+       int trampoline_phys_high;
+       void (*wait_for_init_deassert)(atomic_t *deassert);
+       void (*smp_callin_clear_local_apic)(void);
+       void (*store_NMI_vector)(unsigned short *high, unsigned short *low);
+       void (*restore_NMI_vector)(unsigned short *high, unsigned short *low);
+       void (*inquire_remote_apic)(int apicid);
 };
 
 #define APICFUNC(x) .x = x,
@@ -105,16 +118,25 @@ struct genapic {
        APICFUNC(get_apic_id)                           \
        .apic_id_mask = APIC_ID_MASK,                   \
        APICFUNC(cpu_mask_to_apicid)                    \
-       APICFUNC(vector_allocation_domain)                      \
+       APICFUNC(cpu_mask_to_apicid_and)                \
+       APICFUNC(vector_allocation_domain)              \
        APICFUNC(acpi_madt_oem_check)                   \
        IPIFUNC(send_IPI_mask)                          \
        IPIFUNC(send_IPI_allbutself)                    \
        IPIFUNC(send_IPI_all)                           \
        APICFUNC(enable_apic_mode)                      \
        APICFUNC(phys_pkg_id)                           \
+       .trampoline_phys_low = TRAMPOLINE_PHYS_LOW,             \
+       .trampoline_phys_high = TRAMPOLINE_PHYS_HIGH,           \
+       APICFUNC(wait_for_init_deassert)                \
+       APICFUNC(smp_callin_clear_local_apic)           \
+       APICFUNC(store_NMI_vector)                      \
+       APICFUNC(restore_NMI_vector)                    \
+       APICFUNC(inquire_remote_apic)                   \
 }
 
 extern struct genapic *genapic;
+extern void es7000_update_genapic_to_cluster(void);
 
 enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
 #define get_uv_system_type()           UV_NONE
index 13c4e96199ea17d1e52430dac24a85f4291f50d7..adf32fb56aa638d2aa160d25986f7c93ac6d3ba0 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_GENAPIC_64_H
 #define _ASM_X86_GENAPIC_64_H
 
+#include <linux/cpumask.h>
+
 /*
  * Copyright 2004 James Cleverdon, IBM.
  * Subject to the GNU Public License, v.2
@@ -18,20 +20,26 @@ struct genapic {
        u32 int_delivery_mode;
        u32 int_dest_mode;
        int (*apic_id_registered)(void);
-       cpumask_t (*target_cpus)(void);
-       cpumask_t (*vector_allocation_domain)(int cpu);
+       const struct cpumask *(*target_cpus)(void);
+       void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
        void (*init_apic_ldr)(void);
        /* ipi */
-       void (*send_IPI_mask)(cpumask_t mask, int vector);
+       void (*send_IPI_mask)(const struct cpumask *mask, int vector);
+       void (*send_IPI_mask_allbutself)(const struct cpumask *mask,
+                                        int vector);
        void (*send_IPI_allbutself)(int vector);
        void (*send_IPI_all)(int vector);
        void (*send_IPI_self)(int vector);
        /* */
-       unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
+       unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
+       unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
+                                              const struct cpumask *andmask);
        unsigned int (*phys_pkg_id)(int index_msb);
        unsigned int (*get_apic_id)(unsigned long x);
        unsigned long (*set_apic_id)(unsigned int id);
        unsigned long apic_id_mask;
+       /* wakeup_secondary_cpu */
+       int (*wakeup_cpu)(int apicid, unsigned long start_eip);
 };
 
 extern struct genapic *genapic;
index 6afd9933a7dd96d96822a8ecb058cc2528ed09ac..25d527ca13622cdab95187ef505267a7cf5112a1 100644 (file)
@@ -188,17 +188,14 @@ extern void restore_IO_APIC_setup(void);
 extern void reinit_intr_remapped_IO_APIC(int);
 #endif
 
-extern int probe_nr_irqs(void);
+extern void probe_nr_irqs_gsi(void);
 
 #else  /* !CONFIG_X86_IO_APIC */
 #define io_apic_assign_pci_irqs 0
 static const int timer_through_8259 = 0;
-static inline void ioapic_init_mappings(void) { }
+static inline void ioapic_init_mappings(void)  { }
 
-static inline int probe_nr_irqs(void)
-{
-       return NR_IRQS;
-}
+static inline void probe_nr_irqs_gsi(void)     { }
 #endif
 
 #endif /* _ASM_X86_IO_APIC_H */
index f89dffb28aa93d3427c812ec059a538ac281aaef..c745a306f7d3572be1e5baa29a6cba7eca5b03c7 100644 (file)
@@ -117,7 +117,8 @@ static inline void __send_IPI_dest_field(unsigned int mask, int vector,
        native_apic_mem_write(APIC_ICR, cfg);
 }
 
-static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
+static inline void send_IPI_mask_sequence(const struct cpumask *mask,
+                                         int vector)
 {
        unsigned long flags;
        unsigned long query_cpu;
@@ -128,11 +129,29 @@ static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
         * - mbligh
         */
        local_irq_save(flags);
-       for_each_cpu_mask_nr(query_cpu, mask) {
+       for_each_cpu(query_cpu, mask) {
                __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu),
                                      vector, APIC_DEST_PHYSICAL);
        }
        local_irq_restore(flags);
 }
 
+static inline void send_IPI_mask_allbutself(const struct cpumask *mask,
+                                           int vector)
+{
+       unsigned long flags;
+       unsigned int query_cpu;
+       unsigned int this_cpu = smp_processor_id();
+
+       /* See Hack comment above */
+
+       local_irq_save(flags);
+       for_each_cpu(query_cpu, mask)
+               if (query_cpu != this_cpu)
+                       __send_IPI_dest_field(
+                               per_cpu(x86_cpu_to_apicid, query_cpu),
+                               vector, APIC_DEST_PHYSICAL);
+       local_irq_restore(flags);
+}
+
 #endif /* _ASM_X86_IPI_H */
index bae0eda954864aa62e90b6ec438b82c8785129dc..8766d30fb7461762d7ac3d721477c28ac185fad5 100644 (file)
@@ -37,7 +37,7 @@ extern int irqbalance_disable(char *str);
 
 #ifdef CONFIG_HOTPLUG_CPU
 #include <linux/cpumask.h>
-extern void fixup_irqs(cpumask_t map);
+extern void fixup_irqs(void);
 #endif
 
 extern unsigned int do_IRQ(struct pt_regs *regs);
index 0005adb0f941f5f564cbad634538db8b149f0f46..f7ff65032b9d66aee077fcc3ddbcddd40bd4f75d 100644 (file)
 #define LAST_VM86_IRQ          15
 #define invalid_vm86_irq(irq)  ((irq) < 3 || (irq) > 15)
 
+#define NR_IRQS_LEGACY         16
+
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
+
+#ifndef CONFIG_SPARSE_IRQ
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
 # else
 #  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
 # endif
+#else
+# if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
+#  define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
+# else
+#  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
+# endif
+#endif
 
 #elif defined(CONFIG_X86_VOYAGER)
 
index a1f22771a15a2d07a88d9e071b17ebb6860631ef..c61d8b2ab8b9d8ac00e8c514ee26028fe601eb86 100644 (file)
@@ -5,21 +5,8 @@
 # define PA_CONTROL_PAGE       0
 # define VA_CONTROL_PAGE       1
 # define PA_PGD                        2
-# define VA_PGD                        3
-# define PA_PTE_0              4
-# define VA_PTE_0              5
-# define PA_PTE_1              6
-# define VA_PTE_1              7
-# define PA_SWAP_PAGE          8
-# ifdef CONFIG_X86_PAE
-#  define PA_PMD_0             9
-#  define VA_PMD_0             10
-#  define PA_PMD_1             11
-#  define VA_PMD_1             12
-#  define PAGES_NR             13
-# else
-#  define PAGES_NR             9
-# endif
+# define PA_SWAP_PAGE          3
+# define PAGES_NR              4
 #else
 # define PA_CONTROL_PAGE       0
 # define VA_CONTROL_PAGE       1
@@ -170,6 +157,20 @@ relocate_kernel(unsigned long indirection_page,
                unsigned long start_address) ATTRIB_NORET;
 #endif
 
+#ifdef CONFIG_X86_32
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+       pgd_t *pgd;
+#ifdef CONFIG_X86_PAE
+       pmd_t *pmd0;
+       pmd_t *pmd1;
+#endif
+       pte_t *pte0;
+       pte_t *pte1;
+};
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_KEXEC_H */
index ff3a6c236c00c940d3cbc94d0895399fd25828f8..8863d978cb96f31a417d6abab49c6fcbe78232da 100644 (file)
@@ -8,12 +8,12 @@
 
 #define APIC_DFR_VALUE (APIC_DFR_FLAT)
 
-static inline cpumask_t target_cpus(void)
+static inline const struct cpumask *target_cpus(void)
 { 
 #ifdef CONFIG_SMP
-       return cpu_online_map;
+       return cpu_online_mask;
 #else
-       return cpumask_of_cpu(0);
+       return cpumask_of(0);
 #endif
 } 
 
@@ -28,15 +28,18 @@ static inline cpumask_t target_cpus(void)
 #define apic_id_registered (genapic->apic_id_registered)
 #define init_apic_ldr (genapic->init_apic_ldr)
 #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
+#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and)
 #define phys_pkg_id    (genapic->phys_pkg_id)
 #define vector_allocation_domain    (genapic->vector_allocation_domain)
 #define read_apic_id()  (GET_APIC_ID(apic_read(APIC_ID)))
 #define send_IPI_self (genapic->send_IPI_self)
+#define wakeup_secondary_cpu (genapic->wakeup_cpu)
 extern void setup_apic_routing(void);
 #else
 #define INT_DELIVERY_MODE dest_LowestPrio
 #define INT_DEST_MODE 1     /* logical delivery broadcast to all procs */
 #define TARGET_CPUS (target_cpus())
+#define wakeup_secondary_cpu wakeup_secondary_cpu_via_init
 /*
  * Set up the logical destination ID.
  *
@@ -59,9 +62,18 @@ static inline int apic_id_registered(void)
        return physid_isset(read_apic_id(), phys_cpu_present_map);
 }
 
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
-       return cpus_addr(cpumask)[0];
+       return cpumask_bits(cpumask)[0];
+}
+
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                 const struct cpumask *andmask)
+{
+       unsigned long mask1 = cpumask_bits(cpumask)[0];
+       unsigned long mask2 = cpumask_bits(andmask)[0];
+
+       return (unsigned int)(mask1 & mask2);
 }
 
 static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
@@ -86,7 +98,7 @@ static inline int apicid_to_node(int logical_apicid)
 #endif
 }
 
-static inline cpumask_t vector_allocation_domain(int cpu)
+static inline void vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
         /* Careful. Some cpus do not strictly honor the set of cpus
          * specified in the interrupt destination when using lowest
@@ -96,8 +108,7 @@ static inline cpumask_t vector_allocation_domain(int cpu)
          * deliver interrupts to the wrong hyperthread when only one
          * hyperthread was specified in the interrupt desitination.
          */
-        cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
-        return domain;
+       *retmask = (cpumask_t) { { [0] = APIC_ALL_CPUS } };
 }
 #endif
 
@@ -129,7 +140,7 @@ static inline int cpu_to_logical_apicid(int cpu)
 
 static inline int cpu_present_to_apicid(int mps_cpu)
 {
-       if (mps_cpu < NR_CPUS && cpu_present(mps_cpu))
+       if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
                return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
        else
                return BAD_APICID;
index fabca01ebacf29147f0174d00f220e7d64f8d32a..191312d155da7778dcc9cd9bab1b14da59349ff6 100644 (file)
@@ -4,7 +4,8 @@
 /* Avoid include hell */
 #define NMI_VECTOR 0x02
 
-void send_IPI_mask_bitmask(cpumask_t mask, int vector);
+void send_IPI_mask_bitmask(const struct cpumask *mask, int vector);
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
 void __send_IPI_shortcut(unsigned int shortcut, int vector);
 
 extern int no_broadcast;
@@ -12,28 +13,27 @@ extern int no_broadcast;
 #ifdef CONFIG_X86_64
 #include <asm/genapic.h>
 #define send_IPI_mask (genapic->send_IPI_mask)
+#define send_IPI_mask_allbutself (genapic->send_IPI_mask_allbutself)
 #else
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const struct cpumask *mask, int vector)
 {
        send_IPI_mask_bitmask(mask, vector);
 }
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
 #endif
 
 static inline void __local_send_IPI_allbutself(int vector)
 {
-       if (no_broadcast || vector == NMI_VECTOR) {
-               cpumask_t mask = cpu_online_map;
-
-               cpu_clear(smp_processor_id(), mask);
-               send_IPI_mask(mask, vector);
-       } else
+       if (no_broadcast || vector == NMI_VECTOR)
+               send_IPI_mask_allbutself(cpu_online_mask, vector);
+       else
                __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
 }
 
 static inline void __local_send_IPI_all(int vector)
 {
        if (no_broadcast || vector == NMI_VECTOR)
-               send_IPI_mask(cpu_online_map, vector);
+               send_IPI_mask(cpu_online_mask, vector);
        else
                __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
 }
index 9d80db91e9926f040445ebffeb44cac66ce8f4d3..ceb01366014629ba1c8220dff2a95bb313a2bf30 100644 (file)
@@ -1,17 +1,8 @@
 #ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
 #define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
 
-/* 
- * This file copes with machines that wakeup secondary CPUs by the
- * INIT, INIT, STARTUP sequence.
- */
-
-#define WAKE_SECONDARY_VIA_INIT
-
-#define TRAMPOLINE_LOW phys_to_virt(0x467)
-#define TRAMPOLINE_HIGH phys_to_virt(0x469)
-
-#define boot_cpu_apicid boot_cpu_physical_apicid
+#define TRAMPOLINE_PHYS_LOW (0x467)
+#define TRAMPOLINE_PHYS_HIGH (0x469)
 
 static inline void wait_for_init_deassert(atomic_t *deassert)
 {
@@ -33,9 +24,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
 {
 }
 
-#define inquire_remote_apic(apicid) do {               \
-               if (apic_verbosity >= APIC_DEBUG)       \
-                       __inquire_remote_apic(apicid);  \
-       } while (0)
+extern void __inquire_remote_apic(int apicid);
+
+static inline void inquire_remote_apic(int apicid)
+{
+       if (apic_verbosity >= APIC_DEBUG)
+               __inquire_remote_apic(apicid);
+}
 
 #endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */
index dbab36d64d48f2b15aa6be7dea09e9feb91f869f..23bf52103b8905e662e3b9e5491b89d5b8c8c933 100644 (file)
@@ -13,9 +13,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
        CMOS_WRITE(0xa, 0xf);
        local_flush_tlb();
        pr_debug("1.\n");
-       *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4;
+       *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+                                                                start_eip >> 4;
        pr_debug("2.\n");
-       *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf;
+       *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+                                                        start_eip & 0xf;
        pr_debug("3.\n");
 }
 
@@ -32,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
         */
        CMOS_WRITE(0, 0xf);
 
-       *((volatile long *) phys_to_virt(0x467)) = 0;
+       *((volatile long *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
 }
 
 static inline void __init smpboot_setup_io_apic(void)
index 5180bd7478fbc3d2cfc533f22e707a8099201e23..48553e958ad517fc72ca40eac6dc145c2fbc5f77 100644 (file)
 #define check_phys_apicid_present (genapic->check_phys_apicid_present)
 #define check_apicid_used (genapic->check_apicid_used)
 #define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
+#define cpu_mask_to_apicid_and (genapic->cpu_mask_to_apicid_and)
 #define vector_allocation_domain (genapic->vector_allocation_domain)
 #define enable_apic_mode (genapic->enable_apic_mode)
 #define phys_pkg_id (genapic->phys_pkg_id)
+#define wakeup_secondary_cpu (genapic->wakeup_cpu)
 
 extern void generic_bigsmp_probe(void);
 
diff --git a/arch/x86/include/asm/mach-generic/mach_wakecpu.h b/arch/x86/include/asm/mach-generic/mach_wakecpu.h
new file mode 100644 (file)
index 0000000..1ab16b1
--- /dev/null
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H
+#define _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H
+
+#define TRAMPOLINE_PHYS_LOW (genapic->trampoline_phys_low)
+#define TRAMPOLINE_PHYS_HIGH (genapic->trampoline_phys_high)
+#define wait_for_init_deassert (genapic->wait_for_init_deassert)
+#define smp_callin_clear_local_apic (genapic->smp_callin_clear_local_apic)
+#define store_NMI_vector (genapic->store_NMI_vector)
+#define restore_NMI_vector (genapic->restore_NMI_vector)
+#define inquire_remote_apic (genapic->inquire_remote_apic)
+
+#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */
index 0bf2a06b7a4e66a33629cc930f6d59c9c9907c92..c80f00d299656278baedc68be829650ec1b5c53c 100644 (file)
@@ -7,9 +7,9 @@
 
 #define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
 
-static inline cpumask_t target_cpus(void)
+static inline const cpumask_t *target_cpus(void)
 {
-       return CPU_MASK_ALL;
+       return &CPU_MASK_ALL;
 }
 
 #define NO_BALANCE_IRQ (1)
@@ -122,7 +122,13 @@ static inline void enable_apic_mode(void)
  * We use physical apicids here, not logical, so just return the default
  * physical broadcast to stop people from breaking us
  */
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
+{
+       return (int) 0xF;
+}
+
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                 const struct cpumask *andmask)
 {
        return (int) 0xF;
 }
index 935588d286cf85c0ada9dbde94d83e74e081cf94..a8374c652778f5c23424b7e82a3f2aee93c1632d 100644 (file)
@@ -1,25 +1,22 @@
 #ifndef __ASM_NUMAQ_IPI_H
 #define __ASM_NUMAQ_IPI_H
 
-void send_IPI_mask_sequence(cpumask_t, int vector);
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector);
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector);
 
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const struct cpumask *mask, int vector)
 {
        send_IPI_mask_sequence(mask, vector);
 }
 
 static inline void send_IPI_allbutself(int vector)
 {
-       cpumask_t mask = cpu_online_map;
-       cpu_clear(smp_processor_id(), mask);
-
-       if (!cpus_empty(mask))
-               send_IPI_mask(mask, vector);
+       send_IPI_mask_allbutself(cpu_online_mask, vector);
 }
 
 static inline void send_IPI_all(int vector)
 {
-       send_IPI_mask(cpu_online_map, vector);
+       send_IPI_mask(cpu_online_mask, vector);
 }
 
 #endif /* __ASM_NUMAQ_IPI_H */
index c577bda5b1c52c768047d1e664e0040356b57176..6f499df8eddbfd4246132bd570b87bfde68b4e00 100644 (file)
@@ -3,12 +3,8 @@
 
 /* This file copes with machines that wakeup secondary CPUs by NMIs */
 
-#define WAKE_SECONDARY_VIA_NMI
-
-#define TRAMPOLINE_LOW phys_to_virt(0x8)
-#define TRAMPOLINE_HIGH phys_to_virt(0xa)
-
-#define boot_cpu_apicid boot_cpu_logical_apicid
+#define TRAMPOLINE_PHYS_LOW (0x8)
+#define TRAMPOLINE_PHYS_HIGH (0xa)
 
 /* We don't do anything here because we use NMI's to boot instead */
 static inline void wait_for_init_deassert(atomic_t *deassert)
@@ -27,17 +23,23 @@ static inline void smp_callin_clear_local_apic(void)
 static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
 {
        printk("Storing NMI vector\n");
-       *high = *((volatile unsigned short *) TRAMPOLINE_HIGH);
-       *low = *((volatile unsigned short *) TRAMPOLINE_LOW);
+       *high =
+         *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH));
+       *low =
+         *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW));
 }
 
 static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
 {
        printk("Restoring NMI vector\n");
-       *((volatile unsigned short *) TRAMPOLINE_HIGH) = *high;
-       *((volatile unsigned short *) TRAMPOLINE_LOW) = *low;
+       *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
+                                                                *high;
+       *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
+                                                                *low;
 }
 
-#define inquire_remote_apic(apicid) {}
+static inline void inquire_remote_apic(int apicid)
+{
+}
 
 #endif /* __ASM_NUMAQ_WAKECPU_H */
index df7710354f851dc957c497b30a77f4c6468c01df..562d4fd31ba89abdfba10d1069621c19608ac983 100644 (file)
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_REBOOT_H
 #define _ASM_X86_REBOOT_H
 
+#include <linux/kdebug.h>
+
 struct pt_regs;
 
 struct machine_ops {
@@ -18,4 +20,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs);
 void native_machine_shutdown(void);
 void machine_real_restart(const unsigned char *code, int length);
 
+typedef void (*nmi_shootdown_cb)(int, struct die_args*);
+void nmi_shootdown_cpus(nmi_shootdown_cb callback);
+
 #endif /* _ASM_X86_REBOOT_H */
index f12d37237465df8b0a181d37685b212c0dd70cd1..294daeb3a006dbbe9eab0a659ac3708d802011fb 100644 (file)
@@ -16,6 +16,8 @@ static inline void visws_early_detect(void) { }
 static inline int is_visws_box(void) { return 0; }
 #endif
 
+extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
+extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip);
 /*
  * Any setup quirks to be performed?
  */
@@ -39,6 +41,7 @@ struct x86_quirks {
        void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
                                     unsigned short oemsize);
        int (*setup_ioapic_ids)(void);
+       int (*update_genapic)(void);
 };
 
 extern struct x86_quirks *x86_quirks;
index d12811ce51d92fc8d4be51c24981329ba2591aec..830b9fcb6427103689d005417f20ac19816b0766 100644 (file)
@@ -60,7 +60,7 @@ struct smp_ops {
        void (*cpu_die)(unsigned int cpu);
        void (*play_dead)(void);
 
-       void (*send_call_func_ipi)(cpumask_t mask);
+       void (*send_call_func_ipi)(const struct cpumask *mask);
        void (*send_call_func_single_ipi)(int cpu);
 };
 
@@ -125,7 +125,7 @@ static inline void arch_send_call_function_single_ipi(int cpu)
 
 static inline void arch_send_call_function_ipi(cpumask_t mask)
 {
-       smp_ops.send_call_func_ipi(mask);
+       smp_ops.send_call_func_ipi(&mask);
 }
 
 void cpu_disable_common(void);
@@ -138,7 +138,7 @@ void native_cpu_die(unsigned int cpu);
 void native_play_dead(void);
 void play_dead_common(void);
 
-void native_send_call_func_ipi(cpumask_t mask);
+void native_send_call_func_ipi(const struct cpumask *mask);
 void native_send_call_func_single_ipi(int cpu);
 
 extern void prefill_possible_map(void);
index 9b3070f1c2ac6fb80f0b53edf9b33b080af0f081..651a9384934128d7e24d85ffa51818687ef7b151 100644 (file)
 
 #define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
 
-static inline cpumask_t target_cpus(void)
+static inline const cpumask_t *target_cpus(void)
 {
        /* CPU_MASK_ALL (0xff) has undefined behaviour with
         * dest_LowestPrio mode logical clustered apic interrupt routing
         * Just start on cpu 0.  IRQ balancing will spread load
         */
-       return cpumask_of_cpu(0);
+       return &cpumask_of_cpu(0);
 }
 
 #define INT_DELIVERY_MODE (dest_LowestPrio)
@@ -137,14 +137,14 @@ static inline void enable_apic_mode(void)
 {
 }
 
-static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
+static inline unsigned int cpu_mask_to_apicid(const cpumask_t *cpumask)
 {
        int num_bits_set;
        int cpus_found = 0;
        int cpu;
        int apicid;
 
-       num_bits_set = cpus_weight(cpumask);
+       num_bits_set = cpus_weight(*cpumask);
        /* Return id to all */
        if (num_bits_set == NR_CPUS)
                return (int) 0xFF;
@@ -152,10 +152,10 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
         * The cpus in the mask must all be on the apic cluster.  If are not
         * on the same apicid cluster return default value of TARGET_CPUS.
         */
-       cpu = first_cpu(cpumask);
+       cpu = first_cpu(*cpumask);
        apicid = cpu_to_logical_apicid(cpu);
        while (cpus_found < num_bits_set) {
-               if (cpu_isset(cpu, cpumask)) {
+               if (cpu_isset(cpu, *cpumask)) {
                        int new_apicid = cpu_to_logical_apicid(cpu);
                        if (apicid_cluster(apicid) !=
                                        apicid_cluster(new_apicid)){
@@ -170,6 +170,45 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
        return apicid;
 }
 
+static inline unsigned int cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                 const struct cpumask *andmask)
+{
+       int num_bits_set;
+       int num_bits_set2;
+       int cpus_found = 0;
+       int cpu;
+       int apicid = 0;
+
+       num_bits_set = cpumask_weight(cpumask);
+       num_bits_set2 = cpumask_weight(andmask);
+       num_bits_set = min(num_bits_set, num_bits_set2);
+       /* Return id to all */
+       if (num_bits_set >= nr_cpu_ids)
+               return 0xFF;
+       /*
+        * The cpus in the mask must all be on the apic cluster.  If are not
+        * on the same apicid cluster return default value of TARGET_CPUS.
+        */
+       cpu = cpumask_first_and(cpumask, andmask);
+       apicid = cpu_to_logical_apicid(cpu);
+       while (cpus_found < num_bits_set) {
+               if (cpumask_test_cpu(cpu, cpumask)
+                   && cpumask_test_cpu(cpu, andmask)) {
+                       int new_apicid = cpu_to_logical_apicid(cpu);
+                       if (apicid_cluster(apicid) !=
+                                       apicid_cluster(new_apicid)) {
+                               printk(KERN_WARNING
+                                       "%s: Not a valid mask!\n", __func__);
+                               return 0xFF;
+                       }
+                       apicid = apicid | new_apicid;
+                       cpus_found++;
+               }
+               cpu++;
+       }
+       return apicid;
+}
+
 /* cpuid returns the value latched in the HW at reset, not the APIC ID
  * register's value.  For any box whose BIOS changes APIC IDs, like
  * clustered APIC systems, we must use hard_smp_processor_id.
index 53bd1e7bd7b448ac2d1f6ff3a25a496dc9c16358..a8a2c24f50ccfd34d8dca98cb399dcafc51a1c29 100644 (file)
@@ -1,9 +1,10 @@
 #ifndef __ASM_SUMMIT_IPI_H
 #define __ASM_SUMMIT_IPI_H
 
-void send_IPI_mask_sequence(cpumask_t mask, int vector);
+void send_IPI_mask_sequence(const cpumask_t *mask, int vector);
+void send_IPI_mask_allbutself(const cpumask_t *mask, int vector);
 
-static inline void send_IPI_mask(cpumask_t mask, int vector)
+static inline void send_IPI_mask(const cpumask_t *mask, int vector)
 {
        send_IPI_mask_sequence(mask, vector);
 }
@@ -14,12 +15,12 @@ static inline void send_IPI_allbutself(int vector)
        cpu_clear(smp_processor_id(), mask);
 
        if (!cpus_empty(mask))
-               send_IPI_mask(mask, vector);
+               send_IPI_mask(&mask, vector);
 }
 
 static inline void send_IPI_all(int vector)
 {
-       send_IPI_mask(cpu_online_map, vector);
+       send_IPI_mask(&cpu_online_map, vector);
 }
 
 #endif /* __ASM_SUMMIT_IPI_H */
index 2ed3f0f44ff73519a149b37c9bd658a26361c8de..07c3e4048991c757663c3de65338a85259a0cd2a 100644 (file)
@@ -314,6 +314,8 @@ extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
 
 void default_idle(void);
 
+void stop_this_cpu(void *dummy);
+
 /*
  * Force strict CPU ordering.
  * And yes, this is required on UP too when we're talking
index e44d379faad2b891245ef9cc2332dcaf8f8491c5..0921b4018c11d1a926c3eb083c55d9ba4f45370f 100644 (file)
@@ -20,6 +20,8 @@
 struct task_struct;
 struct exec_domain;
 #include <asm/processor.h>
+#include <asm/ftrace.h>
+#include <asm/atomic.h>
 
 struct thread_info {
        struct task_struct      *task;          /* main task structure */
index ff386ff50ed766d5d8c4e661d9ef2c3acef3162f..79e31e9dcdda9f924d17489dc1e0b134995c764b 100644 (file)
@@ -226,6 +226,8 @@ extern cpumask_t cpu_coregroup_map(int cpu);
 #define topology_core_id(cpu)                  (cpu_data(cpu).cpu_core_id)
 #define topology_core_siblings(cpu)            (per_cpu(cpu_core_map, cpu))
 #define topology_thread_siblings(cpu)          (per_cpu(cpu_sibling_map, cpu))
+#define topology_core_cpumask(cpu)             (&per_cpu(cpu_core_map, cpu))
+#define topology_thread_cpumask(cpu)           (&per_cpu(cpu_sibling_map, cpu))
 
 /* indicates that pointers to the topology cpumask_t maps are valid */
 #define arch_provides_topology_pointers                yes
index 35c54921b2e434cdd95bbd223077f2b1155d0f1f..99192bb55a53bf68afc30efeae89b9b40b5f4b4d 100644 (file)
@@ -157,6 +157,7 @@ extern int __get_user_bad(void);
        int __ret_gu;                                                   \
        unsigned long __val_gu;                                         \
        __chk_user_ptr(ptr);                                            \
+       might_fault();                                                  \
        switch (sizeof(*(ptr))) {                                       \
        case 1:                                                         \
                __get_user_x(1, __ret_gu, __val_gu, ptr);               \
@@ -241,6 +242,7 @@ extern void __put_user_8(void);
        int __ret_pu;                                           \
        __typeof__(*(ptr)) __pu_val;                            \
        __chk_user_ptr(ptr);                                    \
+       might_fault();                                          \
        __pu_val = x;                                           \
        switch (sizeof(*(ptr))) {                               \
        case 1:                                                 \
index d095a3aeea1b44d3063f0165c956d0a156c6ca38..5e06259e90e5a736539948ae22e1de25444ba485 100644 (file)
@@ -82,8 +82,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
 static __always_inline unsigned long __must_check
 __copy_to_user(void __user *to, const void *from, unsigned long n)
 {
-       might_sleep();
-       return __copy_to_user_inatomic(to, from, n);
+       might_fault();
+       return __copy_to_user_inatomic(to, from, n);
 }
 
 static __always_inline unsigned long
@@ -137,7 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 static __always_inline unsigned long
 __copy_from_user(void *to, const void __user *from, unsigned long n)
 {
-       might_sleep();
+       might_fault();
        if (__builtin_constant_p(n)) {
                unsigned long ret;
 
@@ -159,7 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
 static __always_inline unsigned long __copy_from_user_nocache(void *to,
                                const void __user *from, unsigned long n)
 {
-       might_sleep();
+       might_fault();
        if (__builtin_constant_p(n)) {
                unsigned long ret;
 
index f8cfd00db450f2e0f948ce7128a2d44867aebf59..84210c479fca83524c6cef4c6bc069bcff76e272 100644 (file)
@@ -29,6 +29,8 @@ static __always_inline __must_check
 int __copy_from_user(void *dst, const void __user *src, unsigned size)
 {
        int ret = 0;
+
+       might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic(dst, (__force void *)src, size);
        switch (size) {
@@ -71,6 +73,8 @@ static __always_inline __must_check
 int __copy_to_user(void __user *dst, const void *src, unsigned size)
 {
        int ret = 0;
+
+       might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic((__force void *)dst, src, size);
        switch (size) {
@@ -113,6 +117,8 @@ static __always_inline __must_check
 int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 {
        int ret = 0;
+
+       might_fault();
        if (!__builtin_constant_p(size))
                return copy_user_generic((__force void *)dst,
                                         (__force void *)src, size);
index b62a7667828eb77574128a8ca82b1f54cee28238..1cad9318d2172f16e3269dda122b093a54ef4107 100644 (file)
@@ -25,7 +25,7 @@ CFLAGS_tsc.o          := $(nostackp)
 
 obj-y                  := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
 obj-y                  += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y                  += time_$(BITS).o ioport.o ldt.o
+obj-y                  += time_$(BITS).o ioport.o ldt.o dumpstack.o
 obj-y                  += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
 obj-$(CONFIG_X86_VISWS)        += visws_quirks.o
 obj-$(CONFIG_X86_32)   += probe_roms_32.o
@@ -65,6 +65,7 @@ obj-$(CONFIG_X86_LOCAL_APIC)  += apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)      += io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE)   += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER)    += ftrace.o
 obj-$(CONFIG_KEXEC)            += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)            += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)       += crash_dump_$(BITS).o
index 4c51a2f8fd315900071de65e6ee3bee43ea2718f..65d0b72777ea099a99ced450bec404220af6e2e2 100644 (file)
@@ -1360,6 +1360,17 @@ static void __init acpi_process_madt(void)
                        disable_acpi();
                }
        }
+
+       /*
+        * ACPI supports both logical (e.g. Hyper-Threading) and physical
+        * processors, where MPS only supports physical.
+        */
+       if (acpi_lapic && acpi_ioapic)
+               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+                      "information\n");
+       else if (acpi_lapic)
+               printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+                      "configuration information\n");
 #endif
        return;
 }
index 16f94879b52578aba3e689d7a8adcccf59ab8ecb..edda4c00e3d2de97e9cc68bbc8c4fc98d0963847 100644 (file)
@@ -141,7 +141,7 @@ static int lapic_next_event(unsigned long delta,
                            struct clock_event_device *evt);
 static void lapic_timer_setup(enum clock_event_mode mode,
                              struct clock_event_device *evt);
-static void lapic_timer_broadcast(cpumask_t mask);
+static void lapic_timer_broadcast(const cpumask_t *mask);
 static void apic_pm_activate(void);
 
 /*
@@ -453,7 +453,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
 /*
  * Local APIC timer broadcast function
  */
-static void lapic_timer_broadcast(cpumask_t mask)
+static void lapic_timer_broadcast(const cpumask_t *mask)
 {
 #ifdef CONFIG_SMP
        send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
@@ -469,7 +469,7 @@ static void __cpuinit setup_APIC_timer(void)
        struct clock_event_device *levt = &__get_cpu_var(lapic_events);
 
        memcpy(levt, &lapic_clockevent, sizeof(*levt));
-       levt->cpumask = cpumask_of_cpu(smp_processor_id());
+       levt->cpumask = cpumask_of(smp_processor_id());
 
        clockevents_register_device(levt);
 }
@@ -1903,8 +1903,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
        }
 #endif
 
-       cpu_set(cpu, cpu_possible_map);
-       cpu_set(cpu, cpu_present_map);
+       set_cpu_possible(cpu, true);
+       set_cpu_present(cpu, true);
 }
 
 #ifdef CONFIG_X86_64
@@ -2106,7 +2106,7 @@ __cpuinit int apic_is_clustered_box(void)
        bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
        bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
 
-       for (i = 0; i < NR_CPUS; i++) {
+       for (i = 0; i < nr_cpu_ids; i++) {
                /* are we being called early in kernel startup? */
                if (bios_cpu_apicid) {
                        id = bios_cpu_apicid[i];
index 5145a6e72bbbf7f6a4e93d85aed20c769f0883dc..3a26525a3f31a03f3490f7e17be537a6fb3e7c7e 100644 (file)
@@ -391,11 +391,7 @@ static int power_off;
 #else
 static int power_off = 1;
 #endif
-#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
-static int realmode_power_off = 1;
-#else
 static int realmode_power_off;
-#endif
 #ifdef CONFIG_APM_ALLOW_INTS
 static int allow_ints = 1;
 #else
index 8e48c5d4467df61d3652a39b06b85e7389143c54..88ea02dcb622fe5bf3ef4f2d7a28fc31bdf938e0 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <linux/acpi.h>
 #include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
        unsigned int next_perf_state = 0; /* Index into perf table */
        unsigned int i;
        int result = 0;
+       struct power_trace it;
 
        dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 
@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
                }
        }
 
+       trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+
        switch (data->cpu_feature) {
        case SYSTEM_INTEL_MSR_CAPABLE:
                cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
index cce0b6118d550e015a34c5e1b0f9bc60de1f8b3d..816f27f289b10a416847a141b20b1ed9540a530b 100644 (file)
@@ -307,12 +307,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
                set_cpu_cap(c, X86_FEATURE_P4);
        if (c->x86 == 6)
                set_cpu_cap(c, X86_FEATURE_P3);
+#endif
 
        if (cpu_has_bts)
                ptrace_bts_init_intel(c);
 
-#endif
-
        detect_extended_topology(c);
        if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
                /*
index 3f46afbb1cf1946f52212ce96942429a9885cb04..fb7f946cb65ec139e6ec9c5ada4900c757482df3 100644 (file)
@@ -534,31 +534,16 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
        per_cpu(cpuid4_info, cpu) = NULL;
 }
 
-static int __cpuinit detect_cache_attributes(unsigned int cpu)
+static void get_cpu_leaves(void *_retval)
 {
-       struct _cpuid4_info     *this_leaf;
-       unsigned long           j;
-       int                     retval;
-       cpumask_t               oldmask;
-
-       if (num_cache_leaves == 0)
-               return -ENOENT;
-
-       per_cpu(cpuid4_info, cpu) = kzalloc(
-           sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
-       if (per_cpu(cpuid4_info, cpu) == NULL)
-               return -ENOMEM;
-
-       oldmask = current->cpus_allowed;
-       retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-       if (retval)
-               goto out;
+       int j, *retval = _retval, cpu = smp_processor_id();
 
        /* Do cpuid and store the results */
        for (j = 0; j < num_cache_leaves; j++) {
+               struct _cpuid4_info *this_leaf;
                this_leaf = CPUID4_INFO_IDX(cpu, j);
-               retval = cpuid4_cache_lookup(j, this_leaf);
-               if (unlikely(retval < 0)) {
+               *retval = cpuid4_cache_lookup(j, this_leaf);
+               if (unlikely(*retval < 0)) {
                        int i;
 
                        for (i = 0; i < j; i++)
@@ -567,9 +552,21 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
                }
                cache_shared_cpu_map_setup(cpu, j);
        }
-       set_cpus_allowed_ptr(current, &oldmask);
+}
+
+static int __cpuinit detect_cache_attributes(unsigned int cpu)
+{
+       int                     retval;
+
+       if (num_cache_leaves == 0)
+               return -ENOENT;
+
+       per_cpu(cpuid4_info, cpu) = kzalloc(
+           sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
+       if (per_cpu(cpuid4_info, cpu) == NULL)
+               return -ENOMEM;
 
-out:
+       smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
        if (retval) {
                kfree(per_cpu(cpuid4_info, cpu));
                per_cpu(cpuid4_info, cpu) = NULL;
@@ -626,8 +623,8 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
                cpumask_t *mask = &this_leaf->shared_cpu_map;
 
                n = type?
-                       cpulist_scnprintf(buf, len-2, *mask):
-                       cpumask_scnprintf(buf, len-2, *mask);
+                       cpulist_scnprintf(buf, len-2, mask) :
+                       cpumask_scnprintf(buf, len-2, mask);
                buf[n++] = '\n';
                buf[n] = '\0';
        }
index 5eb390a4b2e9159d980fcfcbc68f8d1d4f42c258..a1de80f368f13898043b4b93ea4599a9122d9838 100644 (file)
@@ -83,34 +83,41 @@ static DEFINE_PER_CPU(unsigned char, bank_map);     /* see which banks are on */
  * CPU Initialization
  */
 
+struct thresh_restart {
+       struct threshold_block *b;
+       int reset;
+       u16 old_limit;
+};
+
 /* must be called with correct cpu affinity */
-static void threshold_restart_bank(struct threshold_block *b,
-                                  int reset, u16 old_limit)
+static long threshold_restart_bank(void *_tr)
 {
+       struct thresh_restart *tr = _tr;
        u32 mci_misc_hi, mci_misc_lo;
 
-       rdmsr(b->address, mci_misc_lo, mci_misc_hi);
+       rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
 
-       if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
-               reset = 1;      /* limit cannot be lower than err count */
+       if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+               tr->reset = 1;  /* limit cannot be lower than err count */
 
-       if (reset) {            /* reset err count and overflow bit */
+       if (tr->reset) {                /* reset err count and overflow bit */
                mci_misc_hi =
                    (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
-                   (THRESHOLD_MAX - b->threshold_limit);
-       } else if (old_limit) { /* change limit w/o reset */
+                   (THRESHOLD_MAX - tr->b->threshold_limit);
+       } else if (tr->old_limit) {     /* change limit w/o reset */
                int new_count = (mci_misc_hi & THRESHOLD_MAX) +
-                   (old_limit - b->threshold_limit);
+                   (tr->old_limit - tr->b->threshold_limit);
                mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
                    (new_count & THRESHOLD_MAX);
        }
 
-       b->interrupt_enable ?
+       tr->b->interrupt_enable ?
            (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
            (mci_misc_hi &= ~MASK_INT_TYPE_HI);
 
        mci_misc_hi |= MASK_COUNT_EN_HI;
-       wrmsr(b->address, mci_misc_lo, mci_misc_hi);
+       wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+       return 0;
 }
 
 /* cpu init entry point, called from mce.c with preempt off */
@@ -120,6 +127,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
        unsigned int cpu = smp_processor_id();
        u8 lvt_off;
        u32 low = 0, high = 0, address = 0;
+       struct thresh_restart tr;
 
        for (bank = 0; bank < NR_BANKS; ++bank) {
                for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,7 +170,10 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
                        wrmsr(address, low, high);
 
                        threshold_defaults.address = address;
-                       threshold_restart_bank(&threshold_defaults, 0, 0);
+                       tr.b = &threshold_defaults;
+                       tr.reset = 0;
+                       tr.old_limit = 0;
+                       threshold_restart_bank(&tr);
                }
        }
 }
@@ -251,20 +262,6 @@ struct threshold_attr {
        ssize_t(*store) (struct threshold_block *, const char *, size_t count);
 };
 
-static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
-                                          cpumask_t *newmask)
-{
-       *oldmask = current->cpus_allowed;
-       cpus_clear(*newmask);
-       cpu_set(cpu, *newmask);
-       set_cpus_allowed_ptr(current, newmask);
-}
-
-static void affinity_restore(const cpumask_t *oldmask)
-{
-       set_cpus_allowed_ptr(current, oldmask);
-}
-
 #define SHOW_FIELDS(name)                                           \
 static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
 {                                                                   \
@@ -277,15 +274,16 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
                                      const char *buf, size_t count)
 {
        char *end;
-       cpumask_t oldmask, newmask;
+       struct thresh_restart tr;
        unsigned long new = simple_strtoul(buf, &end, 0);
        if (end == buf)
                return -EINVAL;
        b->interrupt_enable = !!new;
 
-       affinity_set(b->cpu, &oldmask, &newmask);
-       threshold_restart_bank(b, 0, 0);
-       affinity_restore(&oldmask);
+       tr.b = b;
+       tr.reset = 0;
+       tr.old_limit = 0;
+       work_on_cpu(b->cpu, threshold_restart_bank, &tr);
 
        return end - buf;
 }
@@ -294,8 +292,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
                                     const char *buf, size_t count)
 {
        char *end;
-       cpumask_t oldmask, newmask;
-       u16 old;
+       struct thresh_restart tr;
        unsigned long new = simple_strtoul(buf, &end, 0);
        if (end == buf)
                return -EINVAL;
@@ -303,34 +300,36 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
                new = THRESHOLD_MAX;
        if (new < 1)
                new = 1;
-       old = b->threshold_limit;
+       tr.old_limit = b->threshold_limit;
        b->threshold_limit = new;
+       tr.b = b;
+       tr.reset = 0;
 
-       affinity_set(b->cpu, &oldmask, &newmask);
-       threshold_restart_bank(b, 0, old);
-       affinity_restore(&oldmask);
+       work_on_cpu(b->cpu, threshold_restart_bank, &tr);
 
        return end - buf;
 }
 
-static ssize_t show_error_count(struct threshold_block *b, char *buf)
+static long local_error_count(void *_b)
 {
-       u32 high, low;
-       cpumask_t oldmask, newmask;
-       affinity_set(b->cpu, &oldmask, &newmask);
+       struct threshold_block *b = _b;
+       u32 low, high;
+
        rdmsr(b->address, low, high);
-       affinity_restore(&oldmask);
-       return sprintf(buf, "%x\n",
-                      (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
+       return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
+}
+
+static ssize_t show_error_count(struct threshold_block *b, char *buf)
+{
+       return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
 }
 
 static ssize_t store_error_count(struct threshold_block *b,
                                 const char *buf, size_t count)
 {
-       cpumask_t oldmask, newmask;
-       affinity_set(b->cpu, &oldmask, &newmask);
-       threshold_restart_bank(b, 1, 0);
-       affinity_restore(&oldmask);
+       struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
+
+       work_on_cpu(b->cpu, threshold_restart_bank, &tr);
        return 1;
 }
 
@@ -463,12 +462,19 @@ out_free:
        return err;
 }
 
+static long local_allocate_threshold_blocks(void *_bank)
+{
+       unsigned int *bank = _bank;
+
+       return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
+                                        MSR_IA32_MC0_MISC + *bank * 4);
+}
+
 /* symlinks sibling shared banks to first core.  first core owns dir/files. */
 static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 {
        int i, err = 0;
        struct threshold_bank *b = NULL;
-       cpumask_t oldmask, newmask;
        char name[32];
 
        sprintf(name, "threshold_bank%i", bank);
@@ -519,11 +525,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
        per_cpu(threshold_banks, cpu)[bank] = b;
 
-       affinity_set(cpu, &oldmask, &newmask);
-       err = allocate_threshold_blocks(cpu, bank, 0,
-                                       MSR_IA32_MC0_MISC + bank * 4);
-       affinity_restore(&oldmask);
-
+       err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
        if (err)
                goto out_free;
 
index 2685538179097a1e8e073ebf07cd49e898ef4f6e..d84a852e4cd76f4983bc617d674c7e59c0b3632d 100644 (file)
 
 #include <mach_ipi.h>
 
-/* This keeps a track of which one is crashing cpu. */
-static int crashing_cpu;
 
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
-static atomic_t waiting_for_crash_ipi;
 
-static int crash_nmi_callback(struct notifier_block *self,
-                       unsigned long val, void *data)
+static void kdump_nmi_callback(int cpu, struct die_args *args)
 {
        struct pt_regs *regs;
 #ifdef CONFIG_X86_32
        struct pt_regs fixed_regs;
 #endif
-       int cpu;
 
-       if (val != DIE_NMI_IPI)
-               return NOTIFY_OK;
-
-       regs = ((struct die_args *)data)->regs;
-       cpu = raw_smp_processor_id();
-
-       /* Don't do anything if this handler is invoked on crashing cpu.
-        * Otherwise, system will completely hang. Crashing cpu can get
-        * an NMI if system was initially booted with nmi_watchdog parameter.
-        */
-       if (cpu == crashing_cpu)
-               return NOTIFY_STOP;
-       local_irq_disable();
+       regs = args->regs;
 
 #ifdef CONFIG_X86_32
        if (!user_mode_vm(regs)) {
@@ -65,54 +48,19 @@ static int crash_nmi_callback(struct notifier_block *self,
        }
 #endif
        crash_save_cpu(regs, cpu);
-       disable_local_APIC();
-       atomic_dec(&waiting_for_crash_ipi);
-       /* Assume hlt works */
-       halt();
-       for (;;)
-               cpu_relax();
-
-       return 1;
-}
 
-static void smp_send_nmi_allbutself(void)
-{
-       cpumask_t mask = cpu_online_map;
-       cpu_clear(safe_smp_processor_id(), mask);
-       if (!cpus_empty(mask))
-               send_IPI_mask(mask, NMI_VECTOR);
+       disable_local_APIC();
 }
 
-static struct notifier_block crash_nmi_nb = {
-       .notifier_call = crash_nmi_callback,
-};
-
-static void nmi_shootdown_cpus(void)
+static void kdump_nmi_shootdown_cpus(void)
 {
-       unsigned long msecs;
-
-       atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
-       /* Would it be better to replace the trap vector here? */
-       if (register_die_notifier(&crash_nmi_nb))
-               return;         /* return what? */
-       /* Ensure the new callback function is set before sending
-        * out the NMI
-        */
-       wmb();
+       nmi_shootdown_cpus(kdump_nmi_callback);
 
-       smp_send_nmi_allbutself();
-
-       msecs = 1000; /* Wait at most a second for the other cpus to stop */
-       while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
-               mdelay(1);
-               msecs--;
-       }
-
-       /* Leave the nmi callback set */
        disable_local_APIC();
 }
+
 #else
-static void nmi_shootdown_cpus(void)
+static void kdump_nmi_shootdown_cpus(void)
 {
        /* There are no cpus to shootdown */
 }
@@ -131,9 +79,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
        /* The kernel is broken so disable interrupts */
        local_irq_disable();
 
-       /* Make a note of crashing cpu. Will be used in NMI callback.*/
-       crashing_cpu = safe_smp_processor_id();
-       nmi_shootdown_cpus();
+       kdump_nmi_shootdown_cpus();
        lapic_shutdown();
 #if defined(CONFIG_X86_IO_APIC)
        disable_IO_APIC();
index a2d1176c38ee0d59b3e47925a79d8aebdff73d20..19a8c2c0389f1ab87dfe41193e04a5509f257130 100644 (file)
@@ -7,13 +7,12 @@
  *
  * It manages:
  * - per-thread and per-cpu allocation of BTS and PEBS
- * - buffer memory allocation (optional)
- * - buffer overflow handling
+ * - buffer overflow handling (to be done)
  * - buffer access
  *
  * It assumes:
- * - get_task_struct on all parameter tasks
- * - current is allowed to trace parameter tasks
+ * - get_task_struct on all traced tasks
+ * - current is allowed to trace tasks
  *
  *
  * Copyright (C) 2007-2008 Intel Corporation.
@@ -28,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/kernel.h>
 
 
 /*
@@ -44,6 +44,33 @@ struct ds_configuration {
 };
 static struct ds_configuration ds_cfg;
 
+/*
+ * A BTS or PEBS tracer.
+ *
+ * This holds the configuration of the tracer and serves as a handle
+ * to identify tracers.
+ */
+struct ds_tracer {
+       /* the DS context (partially) owned by this tracer */
+       struct ds_context *context;
+       /* the buffer provided on ds_request() and its size in bytes */
+       void *buffer;
+       size_t size;
+};
+
+struct bts_tracer {
+       /* the common DS part */
+       struct ds_tracer ds;
+       /* buffer overflow notification function */
+       bts_ovfl_callback_t ovfl;
+};
+
+struct pebs_tracer {
+       /* the common DS part */
+       struct ds_tracer ds;
+       /* buffer overflow notification function */
+       pebs_ovfl_callback_t ovfl;
+};
 
 /*
  * Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -107,34 +134,13 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
        (*(unsigned long *)base) = value;
 }
 
+#define DS_ALIGNMENT (1 << 3)  /* BTS and PEBS buffer alignment */
 
-/*
- * Locking is done only for allocating BTS or PEBS resources and for
- * guarding context and buffer memory allocation.
- *
- * Most functions require the current task to own the ds context part
- * they are going to access. All the locking is done when validating
- * access to the context.
- */
-static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
 
 /*
- * Validate that the current task is allowed to access the BTS/PEBS
- * buffer of the parameter task.
- *
- * Returns 0, if access is granted; -Eerrno, otherwise.
+ * Locking is done only for allocating BTS or PEBS resources.
  */
-static inline int ds_validate_access(struct ds_context *context,
-                                    enum ds_qualifier qual)
-{
-       if (!context)
-               return -EPERM;
-
-       if (context->owner[qual] == current)
-               return 0;
-
-       return -EPERM;
-}
+static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
 
 
 /*
@@ -183,50 +189,12 @@ static inline int check_tracer(struct task_struct *task)
  *
  * Contexts are use-counted. They are allocated on first access and
  * deallocated when the last user puts the context.
- *
- * We distinguish between an allocating and a non-allocating get of a
- * context:
- * - the allocating get is used for requesting BTS/PEBS resources. It
- *   requires the caller to hold the global ds_lock.
- * - the non-allocating get is used for all other cases. A
- *   non-existing context indicates an error. It acquires and releases
- *   the ds_lock itself for obtaining the context.
- *
- * A context and its DS configuration are allocated and deallocated
- * together. A context always has a DS configuration of the
- * appropriate size.
  */
 static DEFINE_PER_CPU(struct ds_context *, system_context);
 
 #define this_system_context per_cpu(system_context, smp_processor_id())
 
-/*
- * Returns the pointer to the parameter task's context or to the
- * system-wide context, if task is NULL.
- *
- * Increases the use count of the returned context, if not NULL.
- */
 static inline struct ds_context *ds_get_context(struct task_struct *task)
-{
-       struct ds_context *context;
-       unsigned long irq;
-
-       spin_lock_irqsave(&ds_lock, irq);
-
-       context = (task ? task->thread.ds_ctx : this_system_context);
-       if (context)
-               context->count++;
-
-       spin_unlock_irqrestore(&ds_lock, irq);
-
-       return context;
-}
-
-/*
- * Same as ds_get_context, but allocates the context and it's DS
- * structure, if necessary; returns NULL; if out of memory.
- */
-static inline struct ds_context *ds_alloc_context(struct task_struct *task)
 {
        struct ds_context **p_context =
                (task ? &task->thread.ds_ctx : &this_system_context);
@@ -238,16 +206,9 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)
                if (!context)
                        return NULL;
 
-               context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
-               if (!context->ds) {
-                       kfree(context);
-                       return NULL;
-               }
-
                spin_lock_irqsave(&ds_lock, irq);
 
                if (*p_context) {
-                       kfree(context->ds);
                        kfree(context);
 
                        context = *p_context;
@@ -272,10 +233,6 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)
        return context;
 }
 
-/*
- * Decreases the use count of the parameter context, if not NULL.
- * Deallocates the context, if the use count reaches zero.
- */
 static inline void ds_put_context(struct ds_context *context)
 {
        unsigned long irq;
@@ -296,13 +253,6 @@ static inline void ds_put_context(struct ds_context *context)
        if (!context->task || (context->task == current))
                wrmsrl(MSR_IA32_DS_AREA, 0);
 
-       put_tracer(context->task);
-
-       /* free any leftover buffers from tracers that did not
-        * deallocate them properly. */
-       kfree(context->buffer[ds_bts]);
-       kfree(context->buffer[ds_pebs]);
-       kfree(context->ds);
        kfree(context);
  out:
        spin_unlock_irqrestore(&ds_lock, irq);
@@ -312,345 +262,342 @@ static inline void ds_put_context(struct ds_context *context)
 /*
  * Handle a buffer overflow
  *
- * task: the task whose buffers are overflowing;
- *       NULL for a buffer overflow on the current cpu
  * context: the ds context
  * qual: the buffer type
  */
-static void ds_overflow(struct task_struct *task, struct ds_context *context,
-                       enum ds_qualifier qual)
-{
-       if (!context)
-               return;
-
-       if (context->callback[qual])
-               (*context->callback[qual])(task);
-
-       /* todo: do some more overflow handling */
+static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
+{
+       switch (qual) {
+       case ds_bts: {
+               struct bts_tracer *tracer =
+                       container_of(context->owner[qual],
+                                    struct bts_tracer, ds);
+               if (tracer->ovfl)
+                       tracer->ovfl(tracer);
+       }
+               break;
+       case ds_pebs: {
+               struct pebs_tracer *tracer =
+                       container_of(context->owner[qual],
+                                    struct pebs_tracer, ds);
+               if (tracer->ovfl)
+                       tracer->ovfl(tracer);
+       }
+               break;
+       }
 }
 
 
-/*
- * Allocate a non-pageable buffer of the parameter size.
- * Checks the memory and the locked memory rlimit.
- *
- * Returns the buffer, if successful;
- *         NULL, if out of memory or rlimit exceeded.
- *
- * size: the requested buffer size in bytes
- * pages (out): if not NULL, contains the number of pages reserved
- */
-static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
+static void ds_install_ds_config(struct ds_context *context,
+                                enum ds_qualifier qual,
+                                void *base, size_t size, size_t ith)
 {
-       unsigned long rlim, vm, pgsz;
-       void *buffer;
-
-       pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-
-       rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
-       vm   = current->mm->total_vm  + pgsz;
-       if (rlim < vm)
-               return NULL;
+       unsigned long buffer, adj;
 
-       rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
-       vm   = current->mm->locked_vm  + pgsz;
-       if (rlim < vm)
-               return NULL;
+       /* adjust the buffer address and size to meet alignment
+        * constraints:
+        * - buffer is double-word aligned
+        * - size is multiple of record size
+        *
+        * We checked the size at the very beginning; we have enough
+        * space to do the adjustment.
+        */
+       buffer = (unsigned long)base;
 
-       buffer = kzalloc(size, GFP_KERNEL);
-       if (!buffer)
-               return NULL;
+       adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
+       buffer += adj;
+       size   -= adj;
 
-       current->mm->total_vm  += pgsz;
-       current->mm->locked_vm += pgsz;
+       size /= ds_cfg.sizeof_rec[qual];
+       size *= ds_cfg.sizeof_rec[qual];
 
-       if (pages)
-               *pages = pgsz;
+       ds_set(context->ds, qual, ds_buffer_base, buffer);
+       ds_set(context->ds, qual, ds_index, buffer);
+       ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
 
-       return buffer;
+       /* The value for 'no threshold' is -1, which will set the
+        * threshold outside of the buffer, just like we want it.
+        */
+       ds_set(context->ds, qual,
+              ds_interrupt_threshold, buffer + size - ith);
 }
 
-static int ds_request(struct task_struct *task, void *base, size_t size,
-                     ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
+static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
+                     struct task_struct *task,
+                     void *base, size_t size, size_t th)
 {
        struct ds_context *context;
-       unsigned long buffer, adj;
-       const unsigned long alignment = (1 << 3);
        unsigned long irq;
-       int error = 0;
+       int error;
 
+       error = -EOPNOTSUPP;
        if (!ds_cfg.sizeof_ds)
-               return -EOPNOTSUPP;
+               goto out;
+
+       error = -EINVAL;
+       if (!base)
+               goto out;
 
        /* we require some space to do alignment adjustments below */
-       if (size < (alignment + ds_cfg.sizeof_rec[qual]))
-               return -EINVAL;
+       error = -EINVAL;
+       if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+               goto out;
 
-       /* buffer overflow notification is not yet implemented */
-       if (ovfl)
-               return -EOPNOTSUPP;
+       if (th != (size_t)-1) {
+               th *= ds_cfg.sizeof_rec[qual];
+
+               error = -EINVAL;
+               if (size <= th)
+                       goto out;
+       }
 
+       tracer->buffer = base;
+       tracer->size = size;
 
-       context = ds_alloc_context(task);
+       error = -ENOMEM;
+       context = ds_get_context(task);
        if (!context)
-               return -ENOMEM;
+               goto out;
+       tracer->context = context;
+
 
        spin_lock_irqsave(&ds_lock, irq);
 
        error = -EPERM;
        if (!check_tracer(task))
                goto out_unlock;
-
        get_tracer(task);
 
-       error = -EALREADY;
-       if (context->owner[qual] == current)
-               goto out_put_tracer;
        error = -EPERM;
-       if (context->owner[qual] != NULL)
+       if (context->owner[qual])
                goto out_put_tracer;
-       context->owner[qual] = current;
+       context->owner[qual] = tracer;
 
        spin_unlock_irqrestore(&ds_lock, irq);
 
 
-       error = -ENOMEM;
-       if (!base) {
-               base = ds_allocate_buffer(size, &context->pages[qual]);
-               if (!base)
-                       goto out_release;
-
-               context->buffer[qual]   = base;
-       }
-       error = 0;
+       ds_install_ds_config(context, qual, base, size, th);
 
-       context->callback[qual] = ovfl;
-
-       /* adjust the buffer address and size to meet alignment
-        * constraints:
-        * - buffer is double-word aligned
-        * - size is multiple of record size
-        *
-        * We checked the size at the very beginning; we have enough
-        * space to do the adjustment.
-        */
-       buffer = (unsigned long)base;
-
-       adj = ALIGN(buffer, alignment) - buffer;
-       buffer += adj;
-       size   -= adj;
-
-       size /= ds_cfg.sizeof_rec[qual];
-       size *= ds_cfg.sizeof_rec[qual];
-
-       ds_set(context->ds, qual, ds_buffer_base, buffer);
-       ds_set(context->ds, qual, ds_index, buffer);
-       ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
-
-       if (ovfl) {
-               /* todo: select a suitable interrupt threshold */
-       } else
-               ds_set(context->ds, qual,
-                      ds_interrupt_threshold, buffer + size + 1);
-
-       /* we keep the context until ds_release */
-       return error;
-
- out_release:
-       context->owner[qual] = NULL;
-       ds_put_context(context);
-       put_tracer(task);
-       return error;
+       return 0;
 
  out_put_tracer:
-       spin_unlock_irqrestore(&ds_lock, irq);
-       ds_put_context(context);
        put_tracer(task);
-       return error;
-
  out_unlock:
        spin_unlock_irqrestore(&ds_lock, irq);
        ds_put_context(context);
+       tracer->context = NULL;
+ out:
        return error;
 }
 
-int ds_request_bts(struct task_struct *task, void *base, size_t size,
-                  ds_ovfl_callback_t ovfl)
+struct bts_tracer *ds_request_bts(struct task_struct *task,
+                                 void *base, size_t size,
+                                 bts_ovfl_callback_t ovfl, size_t th)
 {
-       return ds_request(task, base, size, ovfl, ds_bts);
-}
+       struct bts_tracer *tracer;
+       int error;
 
-int ds_request_pebs(struct task_struct *task, void *base, size_t size,
-                   ds_ovfl_callback_t ovfl)
-{
-       return ds_request(task, base, size, ovfl, ds_pebs);
+       /* buffer overflow notification is not yet implemented */
+       error = -EOPNOTSUPP;
+       if (ovfl)
+               goto out;
+
+       error = -ENOMEM;
+       tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+       if (!tracer)
+               goto out;
+       tracer->ovfl = ovfl;
+
+       error = ds_request(&tracer->ds, ds_bts, task, base, size, th);
+       if (error < 0)
+               goto out_tracer;
+
+       return tracer;
+
+ out_tracer:
+       kfree(tracer);
+ out:
+       return ERR_PTR(error);
 }
 
-static int ds_release(struct task_struct *task, enum ds_qualifier qual)
+struct pebs_tracer *ds_request_pebs(struct task_struct *task,
+                                   void *base, size_t size,
+                                   pebs_ovfl_callback_t ovfl, size_t th)
 {
-       struct ds_context *context;
+       struct pebs_tracer *tracer;
        int error;
 
-       context = ds_get_context(task);
-       error = ds_validate_access(context, qual);
-       if (error < 0)
+       /* buffer overflow notification is not yet implemented */
+       error = -EOPNOTSUPP;
+       if (ovfl)
                goto out;
 
-       kfree(context->buffer[qual]);
-       context->buffer[qual] = NULL;
+       error = -ENOMEM;
+       tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+       if (!tracer)
+               goto out;
+       tracer->ovfl = ovfl;
 
-       current->mm->total_vm  -= context->pages[qual];
-       current->mm->locked_vm -= context->pages[qual];
-       context->pages[qual] = 0;
-       context->owner[qual] = NULL;
+       error = ds_request(&tracer->ds, ds_pebs, task, base, size, th);
+       if (error < 0)
+               goto out_tracer;
 
-       /*
-        * we put the context twice:
-        *   once for the ds_get_context
-        *   once for the corresponding ds_request
-        */
-       ds_put_context(context);
+       return tracer;
+
+ out_tracer:
+       kfree(tracer);
  out:
-       ds_put_context(context);
-       return error;
+       return ERR_PTR(error);
 }
 
-int ds_release_bts(struct task_struct *task)
+static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual)
 {
-       return ds_release(task, ds_bts);
+       BUG_ON(tracer->context->owner[qual] != tracer);
+       tracer->context->owner[qual] = NULL;
+
+       put_tracer(tracer->context->task);
+       ds_put_context(tracer->context);
 }
 
-int ds_release_pebs(struct task_struct *task)
+int ds_release_bts(struct bts_tracer *tracer)
 {
-       return ds_release(task, ds_pebs);
+       if (!tracer)
+               return -EINVAL;
+
+       ds_release(&tracer->ds, ds_bts);
+       kfree(tracer);
+
+       return 0;
 }
 
-static int ds_get_index(struct task_struct *task, size_t *pos,
-                       enum ds_qualifier qual)
+int ds_release_pebs(struct pebs_tracer *tracer)
 {
-       struct ds_context *context;
-       unsigned long base, index;
-       int error;
+       if (!tracer)
+               return -EINVAL;
 
-       context = ds_get_context(task);
-       error = ds_validate_access(context, qual);
-       if (error < 0)
-               goto out;
+       ds_release(&tracer->ds, ds_pebs);
+       kfree(tracer);
+
+       return 0;
+}
+
+static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual)
+{
+       unsigned long base, index;
 
        base  = ds_get(context->ds, qual, ds_buffer_base);
        index = ds_get(context->ds, qual, ds_index);
 
-       error = ((index - base) / ds_cfg.sizeof_rec[qual]);
-       if (pos)
-               *pos = error;
- out:
-       ds_put_context(context);
-       return error;
+       return (index - base) / ds_cfg.sizeof_rec[qual];
 }
 
-int ds_get_bts_index(struct task_struct *task, size_t *pos)
+int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos)
 {
-       return ds_get_index(task, pos, ds_bts);
+       if (!tracer)
+               return -EINVAL;
+
+       if (!pos)
+               return -EINVAL;
+
+       *pos = ds_get_index(tracer->ds.context, ds_bts);
+
+       return 0;
 }
 
-int ds_get_pebs_index(struct task_struct *task, size_t *pos)
+int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos)
 {
-       return ds_get_index(task, pos, ds_pebs);
+       if (!tracer)
+               return -EINVAL;
+
+       if (!pos)
+               return -EINVAL;
+
+       *pos = ds_get_index(tracer->ds.context, ds_pebs);
+
+       return 0;
 }
 
-static int ds_get_end(struct task_struct *task, size_t *pos,
-                     enum ds_qualifier qual)
+static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual)
 {
-       struct ds_context *context;
-       unsigned long base, end;
-       int error;
-
-       context = ds_get_context(task);
-       error = ds_validate_access(context, qual);
-       if (error < 0)
-               goto out;
+       unsigned long base, max;
 
        base = ds_get(context->ds, qual, ds_buffer_base);
-       end  = ds_get(context->ds, qual, ds_absolute_maximum);
+       max  = ds_get(context->ds, qual, ds_absolute_maximum);
 
-       error = ((end - base) / ds_cfg.sizeof_rec[qual]);
-       if (pos)
-               *pos = error;
- out:
-       ds_put_context(context);
-       return error;
+       return (max - base) / ds_cfg.sizeof_rec[qual];
 }
 
-int ds_get_bts_end(struct task_struct *task, size_t *pos)
+int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos)
 {
-       return ds_get_end(task, pos, ds_bts);
+       if (!tracer)
+               return -EINVAL;
+
+       if (!pos)
+               return -EINVAL;
+
+       *pos = ds_get_end(tracer->ds.context, ds_bts);
+
+       return 0;
 }
 
-int ds_get_pebs_end(struct task_struct *task, size_t *pos)
+int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos)
 {
-       return ds_get_end(task, pos, ds_pebs);
+       if (!tracer)
+               return -EINVAL;
+
+       if (!pos)
+               return -EINVAL;
+
+       *pos = ds_get_end(tracer->ds.context, ds_pebs);
+
+       return 0;
 }
 
-static int ds_access(struct task_struct *task, size_t index,
-                    const void **record, enum ds_qualifier qual)
+static int ds_access(struct ds_context *context, enum ds_qualifier qual,
+                    size_t index, const void **record)
 {
-       struct ds_context *context;
        unsigned long base, idx;
-       int error;
 
        if (!record)
                return -EINVAL;
 
-       context = ds_get_context(task);
-       error = ds_validate_access(context, qual);
-       if (error < 0)
-               goto out;
-
        base = ds_get(context->ds, qual, ds_buffer_base);
        idx = base + (index * ds_cfg.sizeof_rec[qual]);
 
-       error = -EINVAL;
        if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
-               goto out;
+               return -EINVAL;
 
        *record = (const void *)idx;
-       error = ds_cfg.sizeof_rec[qual];
- out:
-       ds_put_context(context);
-       return error;
+
+       return ds_cfg.sizeof_rec[qual];
 }
 
-int ds_access_bts(struct task_struct *task, size_t index, const void **record)
+int ds_access_bts(struct bts_tracer *tracer, size_t index,
+                 const void **record)
 {
-       return ds_access(task, index, record, ds_bts);
+       if (!tracer)
+               return -EINVAL;
+
+       return ds_access(tracer->ds.context, ds_bts, index, record);
 }
 
-int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
+int ds_access_pebs(struct pebs_tracer *tracer, size_t index,
+                  const void **record)
 {
-       return ds_access(task, index, record, ds_pebs);
+       if (!tracer)
+               return -EINVAL;
+
+       return ds_access(tracer->ds.context, ds_pebs, index, record);
 }
 
-static int ds_write(struct task_struct *task, const void *record, size_t size,
-                   enum ds_qualifier qual, int force)
+static int ds_write(struct ds_context *context, enum ds_qualifier qual,
+                   const void *record, size_t size)
 {
-       struct ds_context *context;
-       int error;
+       int bytes_written = 0;
 
        if (!record)
                return -EINVAL;
 
-       error = -EPERM;
-       context = ds_get_context(task);
-       if (!context)
-               goto out;
-
-       if (!force) {
-               error = ds_validate_access(context, qual);
-               if (error < 0)
-                       goto out;
-       }
-
-       error = 0;
        while (size) {
                unsigned long base, index, end, write_end, int_th;
                unsigned long write_size, adj_write_size;
@@ -678,14 +625,14 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,
                        write_end = end;
 
                if (write_end <= index)
-                       goto out;
+                       break;
 
                write_size = min((unsigned long) size, write_end - index);
                memcpy((void *)index, record, write_size);
 
                record = (const char *)record + write_size;
-               size  -= write_size;
-               error += write_size;
+               size -= write_size;
+               bytes_written += write_size;
 
                adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
                adj_write_size *= ds_cfg.sizeof_rec[qual];
@@ -700,47 +647,32 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,
                ds_set(context->ds, qual, ds_index, index);
 
                if (index >= int_th)
-                       ds_overflow(task, context, qual);
+                       ds_overflow(context, qual);
        }
 
- out:
-       ds_put_context(context);
-       return error;
+       return bytes_written;
 }
 
-int ds_write_bts(struct task_struct *task, const void *record, size_t size)
+int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size)
 {
-       return ds_write(task, record, size, ds_bts, /* force = */ 0);
-}
+       if (!tracer)
+               return -EINVAL;
 
-int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
-{
-       return ds_write(task, record, size, ds_pebs, /* force = */ 0);
+       return ds_write(tracer->ds.context, ds_bts, record, size);
 }
 
-int ds_unchecked_write_bts(struct task_struct *task,
-                          const void *record, size_t size)
+int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size)
 {
-       return ds_write(task, record, size, ds_bts, /* force = */ 1);
-}
+       if (!tracer)
+               return -EINVAL;
 
-int ds_unchecked_write_pebs(struct task_struct *task,
-                           const void *record, size_t size)
-{
-       return ds_write(task, record, size, ds_pebs, /* force = */ 1);
+       return ds_write(tracer->ds.context, ds_pebs, record, size);
 }
 
-static int ds_reset_or_clear(struct task_struct *task,
-                            enum ds_qualifier qual, int clear)
+static void ds_reset_or_clear(struct ds_context *context,
+                             enum ds_qualifier qual, int clear)
 {
-       struct ds_context *context;
        unsigned long base, end;
-       int error;
-
-       context = ds_get_context(task);
-       error = ds_validate_access(context, qual);
-       if (error < 0)
-               goto out;
 
        base = ds_get(context->ds, qual, ds_buffer_base);
        end  = ds_get(context->ds, qual, ds_absolute_maximum);
@@ -749,70 +681,69 @@ static int ds_reset_or_clear(struct task_struct *task,
                memset((void *)base, 0, end - base);
 
        ds_set(context->ds, qual, ds_index, base);
-
-       error = 0;
- out:
-       ds_put_context(context);
-       return error;
 }
 
-int ds_reset_bts(struct task_struct *task)
+int ds_reset_bts(struct bts_tracer *tracer)
 {
-       return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
+       if (!tracer)
+               return -EINVAL;
+
+       ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0);
+
+       return 0;
 }
 
-int ds_reset_pebs(struct task_struct *task)
+int ds_reset_pebs(struct pebs_tracer *tracer)
 {
-       return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+       if (!tracer)
+               return -EINVAL;
+
+       ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0);
+
+       return 0;
 }
 
-int ds_clear_bts(struct task_struct *task)
+int ds_clear_bts(struct bts_tracer *tracer)
 {
-       return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
+       if (!tracer)
+               return -EINVAL;
+
+       ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1);
+
+       return 0;
 }
 
-int ds_clear_pebs(struct task_struct *task)
+int ds_clear_pebs(struct pebs_tracer *tracer)
 {
-       return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
+       if (!tracer)
+               return -EINVAL;
+
+       ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1);
+
+       return 0;
 }
 
-int ds_get_pebs_reset(struct task_struct *task, u64 *value)
+int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value)
 {
-       struct ds_context *context;
-       int error;
+       if (!tracer)
+               return -EINVAL;
 
        if (!value)
                return -EINVAL;
 
-       context = ds_get_context(task);
-       error = ds_validate_access(context, ds_pebs);
-       if (error < 0)
-               goto out;
+       *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
 
-       *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
-
-       error = 0;
- out:
-       ds_put_context(context);
-       return error;
+       return 0;
 }
 
-int ds_set_pebs_reset(struct task_struct *task, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
 {
-       struct ds_context *context;
-       int error;
-
-       context = ds_get_context(task);
-       error = ds_validate_access(context, ds_pebs);
-       if (error < 0)
-               goto out;
+       if (!tracer)
+               return -EINVAL;
 
-       *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
+       *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
 
-       error = 0;
- out:
-       ds_put_context(context);
-       return error;
+       return 0;
 }
 
 static const struct ds_configuration ds_cfg_var = {
@@ -840,6 +771,10 @@ static inline void
 ds_configure(const struct ds_configuration *cfg)
 {
        ds_cfg = *cfg;
+
+       printk(KERN_INFO "DS available\n");
+
+       BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds);
 }
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -847,17 +782,16 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
        switch (c->x86) {
        case 0x6:
                switch (c->x86_model) {
+               case 0 ... 0xC:
+                       /* sorry, don't know about them */
+                       break;
                case 0xD:
                case 0xE: /* Pentium M */
                        ds_configure(&ds_cfg_var);
                        break;
-               case 0xF: /* Core2 */
-               case 0x1C: /* Atom */
+               default: /* Core2, Atom, ... */
                        ds_configure(&ds_cfg_64);
                        break;
-               default:
-                       /* sorry, don't know about them */
-                       break;
                }
                break;
        case 0xF:
@@ -884,6 +818,8 @@ void ds_free(struct ds_context *context)
         * is dying. There should not be any user of that context left
         * to disturb us, anymore. */
        unsigned long leftovers = context->count;
-       while (leftovers--)
+       while (leftovers--) {
+               put_tracer(context->task);
                ds_put_context(context);
+       }
 }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644 (file)
index 0000000..6b1f6f6
--- /dev/null
@@ -0,0 +1,351 @@
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/sysfs.h>
+
+#include <asm/stacktrace.h>
+
+#include "dumpstack.h"
+
+int panic_on_unrecovered_nmi;
+unsigned int code_bytes = 64;
+int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
+static int die_counter;
+
+void printk_address(unsigned long address, int reliable)
+{
+       printk(" [<%p>] %s%pS\n", (void *) address,
+                       reliable ? "" : "? ", (void *) address);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+                       const struct stacktrace_ops *ops,
+                       struct thread_info *tinfo, int *graph)
+{
+       struct task_struct *task = tinfo->task;
+       unsigned long ret_addr;
+       int index = task->curr_ret_stack;
+
+       if (addr != (unsigned long)return_to_handler)
+               return;
+
+       if (!task->ret_stack || index < *graph)
+               return;
+
+       index -= *graph;
+       ret_addr = task->ret_stack[index].ret;
+
+       ops->address(data, ret_addr, 1);
+
+       (*graph)++;
+}
+#else
+static inline void
+print_ftrace_graph_addr(unsigned long addr, void *data,
+                       const struct stacktrace_ops *ops,
+                       struct thread_info *tinfo, int *graph)
+{ }
+#endif
+
+/*
+ * x86-64 can have up to three kernel stacks:
+ * process stack
+ * interrupt stack
+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
+ */
+
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+                       void *p, unsigned int size, void *end)
+{
+       void *t = tinfo;
+       if (end) {
+               if (p < end && p >= (end-THREAD_SIZE))
+                       return 1;
+               else
+                       return 0;
+       }
+       return p > t && p < t + THREAD_SIZE - size;
+}
+
+unsigned long
+print_context_stack(struct thread_info *tinfo,
+               unsigned long *stack, unsigned long bp,
+               const struct stacktrace_ops *ops, void *data,
+               unsigned long *end, int *graph)
+{
+       struct stack_frame *frame = (struct stack_frame *)bp;
+
+       while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+               unsigned long addr;
+
+               addr = *stack;
+               if (__kernel_text_address(addr)) {
+                       if ((unsigned long) stack == bp + sizeof(long)) {
+                               ops->address(data, addr, 1);
+                               frame = frame->next_frame;
+                               bp = (unsigned long) frame;
+                       } else {
+                               ops->address(data, addr, bp == 0);
+                       }
+                       print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
+               }
+               stack++;
+       }
+       return bp;
+}
+
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+       printk(data);
+       print_symbol(msg, symbol);
+       printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+       printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+       printk("%s <%s> ", (char *)data, name);
+       return 0;
+}
+
+/*
+ * Print one address/symbol entries per line.
+ */
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+       touch_nmi_watchdog();
+       printk(data);
+       printk_address(addr, reliable);
+}
+
+static const struct stacktrace_ops print_trace_ops = {
+       .warning = print_trace_warning,
+       .warning_symbol = print_trace_warning_symbol,
+       .stack = print_trace_stack,
+       .address = print_trace_address,
+};
+
+void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *stack, unsigned long bp, char *log_lvl)
+{
+       printk("%sCall Trace:\n", log_lvl);
+       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *stack, unsigned long bp)
+{
+       show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+       show_stack_log_lvl(task, NULL, sp, 0, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+       unsigned long bp = 0;
+       unsigned long stack;
+
+#ifdef CONFIG_FRAME_POINTER
+       if (!bp)
+               get_bp(bp);
+#endif
+
+       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+               current->pid, current->comm, print_tainted(),
+               init_utsname()->release,
+               (int)strcspn(init_utsname()->version, " "),
+               init_utsname()->version);
+       show_trace(NULL, NULL, &stack, bp);
+}
+EXPORT_SYMBOL(dump_stack);
+
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+       int cpu;
+       unsigned long flags;
+
+       oops_enter();
+
+       /* racy, but better than risking deadlock. */
+       raw_local_irq_save(flags);
+       cpu = smp_processor_id();
+       if (!__raw_spin_trylock(&die_lock)) {
+               if (cpu == die_owner)
+                       /* nested oops. should stop eventually */;
+               else
+                       __raw_spin_lock(&die_lock);
+       }
+       die_nest_count++;
+       die_owner = cpu;
+       console_verbose();
+       bust_spinlocks(1);
+       return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+       if (regs && kexec_should_crash(current))
+               crash_kexec(regs);
+
+       bust_spinlocks(0);
+       die_owner = -1;
+       add_taint(TAINT_DIE);
+       die_nest_count--;
+       if (!die_nest_count)
+               /* Nest count reaches zero, release the lock. */
+               __raw_spin_unlock(&die_lock);
+       raw_local_irq_restore(flags);
+       oops_exit();
+
+       if (!signr)
+               return;
+       if (in_interrupt())
+               panic("Fatal exception in interrupt");
+       if (panic_on_oops)
+               panic("Fatal exception");
+       do_exit(signr);
+}
+
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+{
+#ifdef CONFIG_X86_32
+       unsigned short ss;
+       unsigned long sp;
+#endif
+       printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+       printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+       printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       printk("DEBUG_PAGEALLOC");
+#endif
+       printk("\n");
+       sysfs_printk_last_file();
+       if (notify_die(DIE_OOPS, str, regs, err,
+                       current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+               return 1;
+
+       show_registers(regs);
+#ifdef CONFIG_X86_32
+       sp = (unsigned long) (&regs->sp);
+       savesegment(ss, ss);
+       if (user_mode(regs)) {
+               sp = regs->sp;
+               ss = regs->ss & 0xffff;
+       }
+       printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+       print_symbol("%s", regs->ip);
+       printk(" SS:ESP %04x:%08lx\n", ss, sp);
+#else
+       /* Executive summary in case the oops scrolled away */
+       printk(KERN_ALERT "RIP ");
+       printk_address(regs->ip, 1);
+       printk(" RSP <%016lx>\n", regs->sp);
+#endif
+       return 0;
+}
+
+/*
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
+ */
+void die(const char *str, struct pt_regs *regs, long err)
+{
+       unsigned long flags = oops_begin();
+       int sig = SIGSEGV;
+
+       if (!user_mode_vm(regs))
+               report_bug(regs->ip, regs);
+
+       if (__die(str, regs, err))
+               sig = 0;
+       oops_end(flags, regs, sig);
+}
+
+void notrace __kprobes
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
+{
+       unsigned long flags;
+
+       if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+               return;
+
+       /*
+        * We are in trouble anyway, lets at least try
+        * to get a message out.
+        */
+       flags = oops_begin();
+       printk(KERN_EMERG "%s", str);
+       printk(" on CPU%d, ip %08lx, registers:\n",
+               smp_processor_id(), regs->ip);
+       show_registers(regs);
+       oops_end(flags, regs, 0);
+       if (do_panic || panic_on_oops)
+               panic("Non maskable interrupt");
+       nmi_exit();
+       local_irq_enable();
+       do_exit(SIGBUS);
+}
+
+static int __init oops_setup(char *s)
+{
+       if (!s)
+               return -EINVAL;
+       if (!strcmp(s, "panic"))
+               panic_on_oops = 1;
+       return 0;
+}
+early_param("oops", oops_setup);
+
+static int __init kstack_setup(char *s)
+{
+       if (!s)
+               return -EINVAL;
+       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+       return 0;
+}
+early_param("kstack", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+       code_bytes = simple_strtoul(s, NULL, 0);
+       if (code_bytes > 8192)
+               code_bytes = 8192;
+
+       return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
new file mode 100644 (file)
index 0000000..da87590
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+
+#ifndef DUMPSTACK_H
+#define DUMPSTACK_H
+
+#ifdef CONFIG_X86_32
+#define STACKSLOTS_PER_LINE 8
+#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
+#else
+#define STACKSLOTS_PER_LINE 4
+#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
+#endif
+
+extern unsigned long
+print_context_stack(struct thread_info *tinfo,
+               unsigned long *stack, unsigned long bp,
+               const struct stacktrace_ops *ops, void *data,
+               unsigned long *end, int *graph);
+
+extern void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *stack, unsigned long bp, char *log_lvl);
+
+extern void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+               unsigned long *sp, unsigned long bp, char *log_lvl);
+
+extern unsigned int code_bytes;
+extern int kstack_depth_to_print;
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+       struct stack_frame *next_frame;
+       unsigned long return_address;
+};
+#endif
index b3614752197b6c6e3c0cf53c7b718dd0167fbdea..d593cd1f58dcca22925f10e2b3669435a4a25557 100644 (file)
 
 #include <asm/stacktrace.h>
 
-#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
-static unsigned int code_bytes = 64;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
-       printk(" [<%p>] %s%pS\n", (void *) address,
-                       reliable ? "" : "? ", (void *) address);
-}
-
-static inline int valid_stack_ptr(struct thread_info *tinfo,
-                       void *p, unsigned int size, void *end)
-{
-       void *t = tinfo;
-       if (end) {
-               if (p < end && p >= (end-THREAD_SIZE))
-                       return 1;
-               else
-                       return 0;
-       }
-       return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
-       struct stack_frame *next_frame;
-       unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
-               unsigned long *stack, unsigned long bp,
-               const struct stacktrace_ops *ops, void *data,
-               unsigned long *end)
-{
-       struct stack_frame *frame = (struct stack_frame *)bp;
-
-       while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
-               unsigned long addr;
-
-               addr = *stack;
-               if (__kernel_text_address(addr)) {
-                       if ((unsigned long) stack == bp + sizeof(long)) {
-                               ops->address(data, addr, 1);
-                               frame = frame->next_frame;
-                               bp = (unsigned long) frame;
-                       } else {
-                               ops->address(data, addr, bp == 0);
-                       }
-               }
-               stack++;
-       }
-       return bp;
-}
+#include "dumpstack.h"
 
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
                unsigned long *stack, unsigned long bp,
                const struct stacktrace_ops *ops, void *data)
 {
+       int graph = 0;
+
        if (!task)
                task = current;
 
@@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 
                context = (struct thread_info *)
                        ((unsigned long)stack & (~(THREAD_SIZE - 1)));
-               bp = print_context_stack(context, stack, bp, ops, data, NULL);
+               bp = print_context_stack(context, stack, bp, ops,
+                                        data, NULL, &graph);
 
                stack = (unsigned long *)context->previous_esp;
                if (!stack)
@@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 }
 EXPORT_SYMBOL(dump_trace);
 
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-       printk(data);
-       print_symbol(msg, symbol);
-       printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
-       printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
-       printk("%s <%s> ", (char *)data, name);
-       return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
-       touch_nmi_watchdog();
-       printk(data);
-       printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
-       .warning = print_trace_warning,
-       .warning_symbol = print_trace_warning_symbol,
-       .stack = print_trace_stack,
-       .address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp, char *log_lvl)
-{
-       printk("%sCall Trace:\n", log_lvl);
-       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp)
-{
-       show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
+void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                unsigned long *sp, unsigned long bp, char *log_lvl)
 {
@@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
        show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
-       show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
-       unsigned long bp = 0;
-       unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
-       if (!bp)
-               get_bp(bp);
-#endif
-
-       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
-               current->pid, current->comm, print_tainted(),
-               init_utsname()->release,
-               (int)strcspn(init_utsname()->version, " "),
-               init_utsname()->version);
-       show_trace(NULL, NULL, &stack, bp);
-}
-
-EXPORT_SYMBOL(dump_stack);
 
 void show_registers(struct pt_regs *regs)
 {
@@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip)
        return ud2 == 0x0b0f;
 }
 
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
-       unsigned long flags;
-
-       oops_enter();
-
-       if (die_owner != raw_smp_processor_id()) {
-               console_verbose();
-               raw_local_irq_save(flags);
-               __raw_spin_lock(&die_lock);
-               die_owner = smp_processor_id();
-               die_nest_count = 0;
-               bust_spinlocks(1);
-       } else {
-               raw_local_irq_save(flags);
-       }
-       die_nest_count++;
-       return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
-       bust_spinlocks(0);
-       die_owner = -1;
-       add_taint(TAINT_DIE);
-       __raw_spin_unlock(&die_lock);
-       raw_local_irq_restore(flags);
-
-       if (!regs)
-               return;
-
-       if (kexec_should_crash(current))
-               crash_kexec(regs);
-       if (in_interrupt())
-               panic("Fatal exception in interrupt");
-       if (panic_on_oops)
-               panic("Fatal exception");
-       oops_exit();
-       do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
-       unsigned short ss;
-       unsigned long sp;
-
-       printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
-       printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
-       printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-       printk("DEBUG_PAGEALLOC");
-#endif
-       printk("\n");
-       sysfs_printk_last_file();
-       if (notify_die(DIE_OOPS, str, regs, err,
-                       current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
-               return 1;
-
-       show_registers(regs);
-       /* Executive summary in case the oops scrolled away */
-       sp = (unsigned long) (&regs->sp);
-       savesegment(ss, ss);
-       if (user_mode(regs)) {
-               sp = regs->sp;
-               ss = regs->ss & 0xffff;
-       }
-       printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
-       print_symbol("%s", regs->ip);
-       printk(" SS:ESP %04x:%08lx\n", ss, sp);
-       return 0;
-}
-
-/*
- * This is gone through when something in the kernel has done something bad
- * and is about to be terminated:
- */
-void die(const char *str, struct pt_regs *regs, long err)
-{
-       unsigned long flags = oops_begin();
-
-       if (die_nest_count < 3) {
-               report_bug(regs->ip, regs);
-
-               if (__die(str, regs, err))
-                       regs = NULL;
-       } else {
-               printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
-       }
-
-       oops_end(flags, regs, SIGSEGV);
-}
-
-static DEFINE_SPINLOCK(nmi_print_lock);
-
-void notrace __kprobes
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
-       if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
-               return;
-
-       spin_lock(&nmi_print_lock);
-       /*
-       * We are in trouble anyway, lets at least try
-       * to get a message out:
-       */
-       bust_spinlocks(1);
-       printk(KERN_EMERG "%s", str);
-       printk(" on CPU%d, ip %08lx, registers:\n",
-               smp_processor_id(), regs->ip);
-       show_registers(regs);
-       if (do_panic)
-               panic("Non maskable interrupt");
-       console_silent();
-       spin_unlock(&nmi_print_lock);
-
-       /*
-        * If we are in kernel we are probably nested up pretty bad
-        * and might aswell get out now while we still can:
-        */
-       if (!user_mode_vm(regs)) {
-               current->thread.trap_no = 2;
-               crash_kexec(regs);
-       }
-
-       bust_spinlocks(0);
-       do_exit(SIGSEGV);
-}
-
-static int __init oops_setup(char *s)
-{
-       if (!s)
-               return -EINVAL;
-       if (!strcmp(s, "panic"))
-               panic_on_oops = 1;
-       return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
-       if (!s)
-               return -EINVAL;
-       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-       return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
-       code_bytes = simple_strtoul(s, NULL, 0);
-       if (code_bytes > 8192)
-               code_bytes = 8192;
-
-       return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
index 96a5db7da8a747e5192f410c60fe9494e8d8e8d1..c302d070704826c44f0f4f8434383594e578cbe8 100644 (file)
 
 #include <asm/stacktrace.h>
 
-#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
-static unsigned int code_bytes = 64;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
-       printk(" [<%p>] %s%pS\n", (void *) address,
-                       reliable ? "" : "? ", (void *) address);
-}
+#include "dumpstack.h"
 
 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
                                        unsigned *usedp, char **idp)
@@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
  * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
  */
 
-static inline int valid_stack_ptr(struct thread_info *tinfo,
-                       void *p, unsigned int size, void *end)
-{
-       void *t = tinfo;
-       if (end) {
-               if (p < end && p >= (end-THREAD_SIZE))
-                       return 1;
-               else
-                       return 0;
-       }
-       return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
-       struct stack_frame *next_frame;
-       unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
-               unsigned long *stack, unsigned long bp,
-               const struct stacktrace_ops *ops, void *data,
-               unsigned long *end)
-{
-       struct stack_frame *frame = (struct stack_frame *)bp;
-
-       while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
-               unsigned long addr;
-
-               addr = *stack;
-               if (__kernel_text_address(addr)) {
-                       if ((unsigned long) stack == bp + sizeof(long)) {
-                               ops->address(data, addr, 1);
-                               frame = frame->next_frame;
-                               bp = (unsigned long) frame;
-                       } else {
-                               ops->address(data, addr, bp == 0);
-                       }
-               }
-               stack++;
-       }
-       return bp;
-}
-
 void dump_trace(struct task_struct *task, struct pt_regs *regs,
                unsigned long *stack, unsigned long bp,
                const struct stacktrace_ops *ops, void *data)
@@ -166,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
        unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
        unsigned used = 0;
        struct thread_info *tinfo;
+       int graph = 0;
 
        if (!task)
                task = current;
@@ -206,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
                                break;
 
                        bp = print_context_stack(tinfo, stack, bp, ops,
-                                                       data, estack_end);
+                                                data, estack_end, &graph);
                        ops->stack(data, "<EOE>");
                        /*
                         * We link to the next stack via the
@@ -225,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
                                if (ops->stack(data, "IRQ") < 0)
                                        break;
                                bp = print_context_stack(tinfo, stack, bp,
-                                               ops, data, irqstack_end);
+                                       ops, data, irqstack_end, &graph);
                                /*
                                 * We link to the next stack (which would be
                                 * the process stack normally) the last
@@ -243,62 +187,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
        /*
         * This handles the process stack:
         */
-       bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
+       bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
        put_cpu();
 }
 EXPORT_SYMBOL(dump_trace);
 
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-       printk(data);
-       print_symbol(msg, symbol);
-       printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
-       printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
-       printk("%s <%s> ", (char *)data, name);
-       return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
-       touch_nmi_watchdog();
-       printk(data);
-       printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
-       .warning = print_trace_warning,
-       .warning_symbol = print_trace_warning_symbol,
-       .stack = print_trace_stack,
-       .address = print_trace_address,
-};
-
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp, char *log_lvl)
-{
-       printk("%sCall Trace:\n", log_lvl);
-       dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-}
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-               unsigned long *stack, unsigned long bp)
-{
-       show_trace_log_lvl(task, regs, stack, bp, "");
-}
-
-static void
+void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                unsigned long *sp, unsigned long bp, char *log_lvl)
 {
@@ -342,33 +236,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
        show_trace_log_lvl(task, regs, sp, bp, log_lvl);
 }
 
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
-       show_stack_log_lvl(task, NULL, sp, 0, "");
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
-       unsigned long bp = 0;
-       unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
-       if (!bp)
-               get_bp(bp);
-#endif
-
-       printk("Pid: %d, comm: %.20s %s %s %.*s\n",
-               current->pid, current->comm, print_tainted(),
-               init_utsname()->release,
-               (int)strcspn(init_utsname()->version, " "),
-               init_utsname()->version);
-       show_trace(NULL, NULL, &stack, bp);
-}
-EXPORT_SYMBOL(dump_stack);
-
 void show_registers(struct pt_regs *regs)
 {
        int i;
@@ -429,147 +296,3 @@ int is_valid_bugaddr(unsigned long ip)
        return ud2 == 0x0b0f;
 }
 
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
-       int cpu;
-       unsigned long flags;
-
-       oops_enter();
-
-       /* racy, but better than risking deadlock. */
-       raw_local_irq_save(flags);
-       cpu = smp_processor_id();
-       if (!__raw_spin_trylock(&die_lock)) {
-               if (cpu == die_owner)
-                       /* nested oops. should stop eventually */;
-               else
-                       __raw_spin_lock(&die_lock);
-       }
-       die_nest_count++;
-       die_owner = cpu;
-       console_verbose();
-       bust_spinlocks(1);
-       return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{
-       die_owner = -1;
-       bust_spinlocks(0);
-       die_nest_count--;
-       if (!die_nest_count)
-               /* Nest count reaches zero, release the lock. */
-               __raw_spin_unlock(&die_lock);
-       raw_local_irq_restore(flags);
-       if (!regs) {
-               oops_exit();
-               return;
-       }
-       if (in_interrupt())
-               panic("Fatal exception in interrupt");
-       if (panic_on_oops)
-               panic("Fatal exception");
-       oops_exit();
-       do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
-       printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
-       printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
-       printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-       printk("DEBUG_PAGEALLOC");
-#endif
-       printk("\n");
-       sysfs_printk_last_file();
-       if (notify_die(DIE_OOPS, str, regs, err,
-                       current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
-               return 1;
-
-       show_registers(regs);
-       add_taint(TAINT_DIE);
-       /* Executive summary in case the oops scrolled away */
-       printk(KERN_ALERT "RIP ");
-       printk_address(regs->ip, 1);
-       printk(" RSP <%016lx>\n", regs->sp);
-       if (kexec_should_crash(current))
-               crash_kexec(regs);
-       return 0;
-}
-
-void die(const char *str, struct pt_regs *regs, long err)
-{
-       unsigned long flags = oops_begin();
-
-       if (!user_mode(regs))
-               report_bug(regs->ip, regs);
-
-       if (__die(str, regs, err))
-               regs = NULL;
-       oops_end(flags, regs, SIGSEGV);
-}
-
-notrace __kprobes void
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
-       unsigned long flags;
-
-       if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
-               return;
-
-       flags = oops_begin();
-       /*
-        * We are in trouble anyway, lets at least try
-        * to get a message out.
-        */
-       printk(KERN_EMERG "%s", str);
-       printk(" on CPU%d, ip %08lx, registers:\n",
-               smp_processor_id(), regs->ip);
-       show_registers(regs);
-       if (kexec_should_crash(current))
-               crash_kexec(regs);
-       if (do_panic || panic_on_oops)
-               panic("Non maskable interrupt");
-       oops_end(flags, NULL, SIGBUS);
-       nmi_exit();
-       local_irq_enable();
-       do_exit(SIGBUS);
-}
-
-static int __init oops_setup(char *s)
-{
-       if (!s)
-               return -EINVAL;
-       if (!strcmp(s, "panic"))
-               panic_on_oops = 1;
-       return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
-       if (!s)
-               return -EINVAL;
-       kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-       return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
-       code_bytes = simple_strtoul(s, NULL, 0);
-       if (code_bytes > 8192)
-               code_bytes = 8192;
-
-       return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
index 28b597ef9ca16b7992c333f10eae252ea695475c..43ceb3f454bfce00386eeaaf377014871272a809 100644 (file)
@@ -1157,6 +1157,9 @@ ENTRY(mcount)
 END(mcount)
 
 ENTRY(ftrace_caller)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+
        pushl %eax
        pushl %ecx
        pushl %edx
@@ -1171,6 +1174,11 @@ ftrace_call:
        popl %edx
        popl %ecx
        popl %eax
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+       jmp ftrace_stub
+#endif
 
 .globl ftrace_stub
 ftrace_stub:
@@ -1180,8 +1188,18 @@ END(ftrace_caller)
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 
 ENTRY(mcount)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+
        cmpl $ftrace_stub, ftrace_trace_function
        jnz trace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       cmpl $ftrace_stub, ftrace_graph_return
+       jnz ftrace_graph_caller
+
+       cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+       jnz ftrace_graph_caller
+#endif
 .globl ftrace_stub
 ftrace_stub:
        ret
@@ -1200,12 +1218,43 @@ trace:
        popl %edx
        popl %ecx
        popl %eax
-
        jmp ftrace_stub
 END(mcount)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+       cmpl $0, function_trace_stop
+       jne ftrace_stub
+
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       movl 0xc(%esp), %edx
+       lea 0x4(%ebp), %eax
+       subl $MCOUNT_INSN_SIZE, %edx
+       call prepare_ftrace_return
+       popl %edx
+       popl %ecx
+       popl %eax
+       ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+       pushl $0
+       pushl %eax
+       pushl %ecx
+       pushl %edx
+       call ftrace_return_to_handler
+       movl %eax, 0xc(%esp)
+       popl %edx
+       popl %ecx
+       popl %eax
+       ret
+#endif
+
 .section .rodata,"a"
 #include "syscall_table_32.S"
 
index b86f332c96a66596f0fe076d7c0eda78ea05dbe5..54e0bbdccb994d9b1b87747158487e4817ccde81 100644 (file)
@@ -68,6 +68,8 @@ ENTRY(mcount)
 END(mcount)
 
 ENTRY(ftrace_caller)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
 
        /* taken from glibc */
        subq $0x38, %rsp
@@ -96,6 +98,12 @@ ftrace_call:
        movq (%rsp), %rax
        addq $0x38, %rsp
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+       jmp ftrace_stub
+#endif
+
 .globl ftrace_stub
 ftrace_stub:
        retq
@@ -103,8 +111,20 @@ END(ftrace_caller)
 
 #else /* ! CONFIG_DYNAMIC_FTRACE */
 ENTRY(mcount)
+       cmpl $0, function_trace_stop
+       jne  ftrace_stub
+
        cmpq $ftrace_stub, ftrace_trace_function
        jnz trace
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       cmpq $ftrace_stub, ftrace_graph_return
+       jnz ftrace_graph_caller
+
+       cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
+       jnz ftrace_graph_caller
+#endif
+
 .globl ftrace_stub
 ftrace_stub:
        retq
@@ -140,6 +160,69 @@ END(mcount)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FUNCTION_TRACER */
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+       cmpl $0, function_trace_stop
+       jne ftrace_stub
+
+       subq $0x38, %rsp
+       movq %rax, (%rsp)
+       movq %rcx, 8(%rsp)
+       movq %rdx, 16(%rsp)
+       movq %rsi, 24(%rsp)
+       movq %rdi, 32(%rsp)
+       movq %r8, 40(%rsp)
+       movq %r9, 48(%rsp)
+
+       leaq 8(%rbp), %rdi
+       movq 0x38(%rsp), %rsi
+       subq $MCOUNT_INSN_SIZE, %rsi
+
+       call    prepare_ftrace_return
+
+       movq 48(%rsp), %r9
+       movq 40(%rsp), %r8
+       movq 32(%rsp), %rdi
+       movq 24(%rsp), %rsi
+       movq 16(%rsp), %rdx
+       movq 8(%rsp), %rcx
+       movq (%rsp), %rax
+       addq $0x38, %rsp
+       retq
+END(ftrace_graph_caller)
+
+
+.globl return_to_handler
+return_to_handler:
+       subq  $80, %rsp
+
+       movq %rax, (%rsp)
+       movq %rcx, 8(%rsp)
+       movq %rdx, 16(%rsp)
+       movq %rsi, 24(%rsp)
+       movq %rdi, 32(%rsp)
+       movq %r8, 40(%rsp)
+       movq %r9, 48(%rsp)
+       movq %r10, 56(%rsp)
+       movq %r11, 64(%rsp)
+
+       call ftrace_return_to_handler
+
+       movq %rax, 72(%rsp)
+       movq 64(%rsp), %r11
+       movq 56(%rsp), %r10
+       movq 48(%rsp), %r9
+       movq 40(%rsp), %r8
+       movq 32(%rsp), %rdi
+       movq 24(%rsp), %rsi
+       movq 16(%rsp), %rdx
+       movq 8(%rsp), %rcx
+       movq (%rsp), %rax
+       addq $72, %rsp
+       retq
+#endif
+
+
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #endif 
index 0aa2c443d600c64dcf5b4ac440c18bb4f3d0e9d9..53699c931ad41ebdaee8594cdb17f4464fcc986f 100644 (file)
 #include <asm/io.h>
 #include <asm/nmi.h>
 #include <asm/smp.h>
+#include <asm/atomic.h>
 #include <asm/apicdef.h>
 #include <mach_mpparse.h>
+#include <asm/genapic.h>
+#include <asm/setup.h>
 
 /*
  * ES7000 chipsets
@@ -161,6 +164,43 @@ es7000_rename_gsi(int ioapic, int gsi)
        return gsi;
 }
 
+static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+{
+       unsigned long vect = 0, psaival = 0;
+
+       if (psai == NULL)
+               return -1;
+
+       vect = ((unsigned long)__pa(eip)/0x1000) << 16;
+       psaival = (0x1000000 | vect | cpu);
+
+       while (*psai & 0x1000000)
+               ;
+
+       *psai = psaival;
+
+       return 0;
+}
+
+static void noop_wait_for_deassert(atomic_t *deassert_not_used)
+{
+}
+
+static int __init es7000_update_genapic(void)
+{
+       genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
+
+       /* MPENTIUMIII */
+       if (boot_cpu_data.x86 == 6 &&
+           (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) {
+               es7000_update_genapic_to_cluster();
+               genapic->wait_for_init_deassert = noop_wait_for_deassert;
+               genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
+       }
+
+       return 0;
+}
+
 void __init
 setup_unisys(void)
 {
@@ -176,6 +216,8 @@ setup_unisys(void)
        else
                es7000_plat = ES7000_CLASSIC;
        ioapic_renumber_irq = es7000_rename_gsi;
+
+       x86_quirks->update_genapic = es7000_update_genapic;
 }
 
 /*
@@ -317,26 +359,6 @@ es7000_mip_write(struct mip_reg *mip_reg)
        return status;
 }
 
-int
-es7000_start_cpu(int cpu, unsigned long eip)
-{
-       unsigned long vect = 0, psaival = 0;
-
-       if (psai == NULL)
-               return -1;
-
-       vect = ((unsigned long)__pa(eip)/0x1000) << 16;
-       psaival = (0x1000000 | vect | cpu);
-
-       while (*psai & 0x1000000)
-                ;
-
-       *psai = psaival;
-
-       return 0;
-
-}
-
 void __init
 es7000_sw_apic(void)
 {
index 50ea0ac8c9bf2c27a53323b93b5d473bd7e1d028..1b43086b097a8489a85adcd41a3336fb0cde4f88 100644 (file)
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
 #include <linux/percpu.h>
+#include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/list.h>
 
 #include <asm/ftrace.h>
+#include <linux/ftrace.h>
 #include <asm/nops.h>
+#include <asm/nmi.h>
 
 
-static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+#ifdef CONFIG_DYNAMIC_FTRACE
 
 union ftrace_code_union {
        char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
        } __attribute__((packed));
 };
 
-
 static int ftrace_calc_offset(long ip, long addr)
 {
        return (int)(addr - ip);
 }
 
-unsigned char *ftrace_nop_replace(void)
-{
-       return ftrace_nop;
-}
-
-unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
+static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 {
        static union ftrace_code_union calc;
 
@@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
        return calc.code;
 }
 
-int
+/*
+ * Modifying code must take extra care. On an SMP machine, if
+ * the code being modified is also being executed on another CPU
+ * that CPU will have undefined results and possibly take a GPF.
+ * We use kstop_machine to stop other CPUS from exectuing code.
+ * But this does not stop NMIs from happening. We still need
+ * to protect against that. We separate out the modification of
+ * the code to take care of this.
+ *
+ * Two buffers are added: An IP buffer and a "code" buffer.
+ *
+ * 1) Put the instruction pointer into the IP buffer
+ *    and the new code into the "code" buffer.
+ * 2) Set a flag that says we are modifying code
+ * 3) Wait for any running NMIs to finish.
+ * 4) Write the code
+ * 5) clear the flag.
+ * 6) Wait for any running NMIs to finish.
+ *
+ * If an NMI is executed, the first thing it does is to call
+ * "ftrace_nmi_enter". This will check if the flag is set to write
+ * and if it is, it will write what is in the IP and "code" buffers.
+ *
+ * The trick is, it does not matter if everyone is writing the same
+ * content to the code location. Also, if a CPU is executing code
+ * it is OK to write to that code location if the contents being written
+ * are the same as what exists.
+ */
+
+static atomic_t in_nmi = ATOMIC_INIT(0);
+static int mod_code_status;            /* holds return value of text write */
+static int mod_code_write;             /* set when NMI should do the write */
+static void *mod_code_ip;              /* holds the IP to write to */
+static void *mod_code_newcode;         /* holds the text to write to the IP */
+
+static unsigned nmi_wait_count;
+static atomic_t nmi_update_count = ATOMIC_INIT(0);
+
+int ftrace_arch_read_dyn_info(char *buf, int size)
+{
+       int r;
+
+       r = snprintf(buf, size, "%u %u",
+                    nmi_wait_count,
+                    atomic_read(&nmi_update_count));
+       return r;
+}
+
+static void ftrace_mod_code(void)
+{
+       /*
+        * Yes, more than one CPU process can be writing to mod_code_status.
+        *    (and the code itself)
+        * But if one were to fail, then they all should, and if one were
+        * to succeed, then they all should.
+        */
+       mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
+                                            MCOUNT_INSN_SIZE);
+}
+
+void ftrace_nmi_enter(void)
+{
+       atomic_inc(&in_nmi);
+       /* Must have in_nmi seen before reading write flag */
+       smp_mb();
+       if (mod_code_write) {
+               ftrace_mod_code();
+               atomic_inc(&nmi_update_count);
+       }
+}
+
+void ftrace_nmi_exit(void)
+{
+       /* Finish all executions before clearing in_nmi */
+       smp_wmb();
+       atomic_dec(&in_nmi);
+}
+
+static void wait_for_nmi(void)
+{
+       int waited = 0;
+
+       while (atomic_read(&in_nmi)) {
+               waited = 1;
+               cpu_relax();
+       }
+
+       if (waited)
+               nmi_wait_count++;
+}
+
+static int
+do_ftrace_mod_code(unsigned long ip, void *new_code)
+{
+       mod_code_ip = (void *)ip;
+       mod_code_newcode = new_code;
+
+       /* The buffers need to be visible before we let NMIs write them */
+       smp_wmb();
+
+       mod_code_write = 1;
+
+       /* Make sure write bit is visible before we wait on NMIs */
+       smp_mb();
+
+       wait_for_nmi();
+
+       /* Make sure all running NMIs have finished before we write the code */
+       smp_mb();
+
+       ftrace_mod_code();
+
+       /* Make sure the write happens before clearing the bit */
+       smp_wmb();
+
+       mod_code_write = 0;
+
+       /* make sure NMIs see the cleared bit */
+       smp_mb();
+
+       wait_for_nmi();
+
+       return mod_code_status;
+}
+
+
+
+
+static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
+
+static unsigned char *ftrace_nop_replace(void)
+{
+       return ftrace_nop;
+}
+
+static int
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
                   unsigned char *new_code)
 {
@@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
                return -EINVAL;
 
        /* replace the text with the new text */
-       if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
+       if (do_ftrace_mod_code(ip, new_code))
                return -EPERM;
 
        sync_core();
@@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
        return 0;
 }
 
+int ftrace_make_nop(struct module *mod,
+                   struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned char *new, *old;
+       unsigned long ip = rec->ip;
+
+       old = ftrace_call_replace(ip, addr);
+       new = ftrace_nop_replace();
+
+       return ftrace_modify_code(rec->ip, old, new);
+}
+
+int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
+{
+       unsigned char *new, *old;
+       unsigned long ip = rec->ip;
+
+       old = ftrace_nop_replace();
+       new = ftrace_call_replace(ip, addr);
+
+       return ftrace_modify_code(rec->ip, old, new);
+}
+
 int ftrace_update_ftrace_func(ftrace_func_t func)
 {
        unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data)
 
        return 0;
 }
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void ftrace_graph_call(void);
+
+static int ftrace_mod_jmp(unsigned long ip,
+                         int old_offset, int new_offset)
+{
+       unsigned char code[MCOUNT_INSN_SIZE];
+
+       if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
+               return -EFAULT;
+
+       if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
+               return -EINVAL;
+
+       *(int *)(&code[1]) = new_offset;
+
+       if (do_ftrace_mod_code(ip, &code))
+               return -EPERM;
+
+       return 0;
+}
+
+int ftrace_enable_ftrace_graph_caller(void)
+{
+       unsigned long ip = (unsigned long)(&ftrace_graph_call);
+       int old_offset, new_offset;
+
+       old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+       new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+
+       return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+int ftrace_disable_ftrace_graph_caller(void)
+{
+       unsigned long ip = (unsigned long)(&ftrace_graph_call);
+       int old_offset, new_offset;
+
+       old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
+       new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
+
+       return ftrace_mod_jmp(ip, old_offset, new_offset);
+}
+
+#else /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * These functions are picked from those used on
+ * this page for dynamic ftrace. They have been
+ * simplified to ignore all traces in NMI context.
+ */
+static atomic_t in_nmi;
+
+void ftrace_nmi_enter(void)
+{
+       atomic_inc(&in_nmi);
+}
+
+void ftrace_nmi_exit(void)
+{
+       atomic_dec(&in_nmi);
+}
+
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
+/* Add a function return address to the trace stack on thread info.*/
+static int push_return_trace(unsigned long ret, unsigned long long time,
+                               unsigned long func, int *depth)
+{
+       int index;
+
+       if (!current->ret_stack)
+               return -EBUSY;
+
+       /* The return trace stack is full */
+       if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+               atomic_inc(&current->trace_overrun);
+               return -EBUSY;
+       }
+
+       index = ++current->curr_ret_stack;
+       barrier();
+       current->ret_stack[index].ret = ret;
+       current->ret_stack[index].func = func;
+       current->ret_stack[index].calltime = time;
+       *depth = index;
+
+       return 0;
+}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
+{
+       int index;
+
+       index = current->curr_ret_stack;
+
+       if (unlikely(index < 0)) {
+               ftrace_graph_stop();
+               WARN_ON(1);
+               /* Might as well panic, otherwise we have no where to go */
+               *ret = (unsigned long)panic;
+               return;
+       }
+
+       *ret = current->ret_stack[index].ret;
+       trace->func = current->ret_stack[index].func;
+       trace->calltime = current->ret_stack[index].calltime;
+       trace->overrun = atomic_read(&current->trace_overrun);
+       trace->depth = index;
+       barrier();
+       current->curr_ret_stack--;
+
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(void)
+{
+       struct ftrace_graph_ret trace;
+       unsigned long ret;
+
+       pop_return_trace(&trace, &ret);
+       trace.rettime = cpu_clock(raw_smp_processor_id());
+       ftrace_graph_return(&trace);
+
+       if (unlikely(!ret)) {
+               ftrace_graph_stop();
+               WARN_ON(1);
+               /* Might as well panic. What else to do? */
+               ret = (unsigned long)panic;
+       }
+
+       return ret;
+}
+
+/*
+ * Hook the return address and push it in the stack of return addrs
+ * in current thread info.
+ */
+void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
+{
+       unsigned long old;
+       unsigned long long calltime;
+       int faulted;
+       struct ftrace_graph_ent trace;
+       unsigned long return_hooker = (unsigned long)
+                               &return_to_handler;
+
+       /* Nmi's are currently unsupported */
+       if (unlikely(atomic_read(&in_nmi)))
+               return;
+
+       if (unlikely(atomic_read(&current->tracing_graph_pause)))
+               return;
+
+       /*
+        * Protect against fault, even if it shouldn't
+        * happen. This tool is too much intrusive to
+        * ignore such a protection.
+        */
+       asm volatile(
+               "1: " _ASM_MOV " (%[parent_old]), %[old]\n"
+               "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n"
+               "   movl $0, %[faulted]\n"
+
+               ".section .fixup, \"ax\"\n"
+               "3: movl $1, %[faulted]\n"
+               ".previous\n"
+
+               _ASM_EXTABLE(1b, 3b)
+               _ASM_EXTABLE(2b, 3b)
+
+               : [parent_replaced] "=r" (parent), [old] "=r" (old),
+                 [faulted] "=r" (faulted)
+               : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
+               : "memory"
+       );
+
+       if (unlikely(faulted)) {
+               ftrace_graph_stop();
+               WARN_ON(1);
+               return;
+       }
+
+       if (unlikely(!__kernel_text_address(old))) {
+               ftrace_graph_stop();
+               *parent = old;
+               WARN_ON(1);
+               return;
+       }
+
+       calltime = cpu_clock(raw_smp_processor_id());
+
+       if (push_return_trace(old, calltime,
+                               self_addr, &trace.depth) == -EBUSY) {
+               *parent = old;
+               return;
+       }
+
+       trace.func = self_addr;
+
+       /* Only trace if the calling function expects to */
+       if (!ftrace_graph_entry(&trace)) {
+               current->curr_ret_stack--;
+               *parent = old;
+       }
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
index 6c9bfc9e1e95cbc5f8aa2273bd06f9eb75c2ca11..2bced78b0b8e85d6317c729c9554919cac2eb67b 100644 (file)
@@ -21,6 +21,7 @@
 #include <asm/smp.h>
 #include <asm/ipi.h>
 #include <asm/genapic.h>
+#include <asm/setup.h>
 
 extern struct genapic apic_flat;
 extern struct genapic apic_physflat;
@@ -53,6 +54,9 @@ void __init setup_apic_routing(void)
                        genapic = &apic_physflat;
                printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
        }
+
+       if (x86_quirks->update_genapic)
+               x86_quirks->update_genapic();
 }
 
 /* Same for both flat and physical. */
index c0262791bda40e5cd10960fb496360ca8298b879..7fa5f49c2ddac015b8aa7bc4f132b60793efba2a 100644 (file)
@@ -30,12 +30,12 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
        return 1;
 }
 
-static cpumask_t flat_target_cpus(void)
+static const struct cpumask *flat_target_cpus(void)
 {
-       return cpu_online_map;
+       return cpu_online_mask;
 }
 
-static cpumask_t flat_vector_allocation_domain(int cpu)
+static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -45,8 +45,8 @@ static cpumask_t flat_vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-       cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
-       return domain;
+       cpumask_clear(retmask);
+       cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
 }
 
 /*
@@ -69,9 +69,8 @@ static void flat_init_apic_ldr(void)
        apic_write(APIC_LDR, val);
 }
 
-static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
+static inline void _flat_send_IPI_mask(unsigned long mask, int vector)
 {
-       unsigned long mask = cpus_addr(cpumask)[0];
        unsigned long flags;
 
        local_irq_save(flags);
@@ -79,20 +78,41 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
        local_irq_restore(flags);
 }
 
+static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
+{
+       unsigned long mask = cpumask_bits(cpumask)[0];
+
+       _flat_send_IPI_mask(mask, vector);
+}
+
+static void flat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
+                                         int vector)
+{
+       unsigned long mask = cpumask_bits(cpumask)[0];
+       int cpu = smp_processor_id();
+
+       if (cpu < BITS_PER_LONG)
+               clear_bit(cpu, &mask);
+       _flat_send_IPI_mask(mask, vector);
+}
+
 static void flat_send_IPI_allbutself(int vector)
 {
+       int cpu = smp_processor_id();
 #ifdef CONFIG_HOTPLUG_CPU
        int hotplug = 1;
 #else
        int hotplug = 0;
 #endif
        if (hotplug || vector == NMI_VECTOR) {
-               cpumask_t allbutme = cpu_online_map;
+               if (!cpumask_equal(cpu_online_mask, cpumask_of(cpu))) {
+                       unsigned long mask = cpumask_bits(cpu_online_mask)[0];
 
-               cpu_clear(smp_processor_id(), allbutme);
+                       if (cpu < BITS_PER_LONG)
+                               clear_bit(cpu, &mask);
 
-               if (!cpus_empty(allbutme))
-                       flat_send_IPI_mask(allbutme, vector);
+                       _flat_send_IPI_mask(mask, vector);
+               }
        } else if (num_online_cpus() > 1) {
                __send_IPI_shortcut(APIC_DEST_ALLBUT, vector,APIC_DEST_LOGICAL);
        }
@@ -101,7 +121,7 @@ static void flat_send_IPI_allbutself(int vector)
 static void flat_send_IPI_all(int vector)
 {
        if (vector == NMI_VECTOR)
-               flat_send_IPI_mask(cpu_online_map, vector);
+               flat_send_IPI_mask(cpu_online_mask, vector);
        else
                __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
 }
@@ -135,9 +155,18 @@ static int flat_apic_id_registered(void)
        return physid_isset(read_xapic_id(), phys_cpu_present_map);
 }
 
-static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int flat_cpu_mask_to_apicid(const struct cpumask *cpumask)
+{
+       return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
+}
+
+static unsigned int flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                               const struct cpumask *andmask)
 {
-       return cpus_addr(cpumask)[0] & APIC_ALL_CPUS;
+       unsigned long mask1 = cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
+       unsigned long mask2 = cpumask_bits(andmask)[0] & APIC_ALL_CPUS;
+
+       return mask1 & mask2;
 }
 
 static unsigned int phys_pkg_id(int index_msb)
@@ -157,8 +186,10 @@ struct genapic apic_flat =  {
        .send_IPI_all = flat_send_IPI_all,
        .send_IPI_allbutself = flat_send_IPI_allbutself,
        .send_IPI_mask = flat_send_IPI_mask,
+       .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
        .send_IPI_self = apic_send_IPI_self,
        .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
+       .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
@@ -188,35 +219,39 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
        return 0;
 }
 
-static cpumask_t physflat_target_cpus(void)
+static const struct cpumask *physflat_target_cpus(void)
 {
-       return cpu_online_map;
+       return cpu_online_mask;
 }
 
-static cpumask_t physflat_vector_allocation_domain(int cpu)
+static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-       return cpumask_of_cpu(cpu);
+       cpumask_clear(retmask);
+       cpumask_set_cpu(cpu, retmask);
 }
 
-static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
+static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
 {
        send_IPI_mask_sequence(cpumask, vector);
 }
 
-static void physflat_send_IPI_allbutself(int vector)
+static void physflat_send_IPI_mask_allbutself(const struct cpumask *cpumask,
+                                             int vector)
 {
-       cpumask_t allbutme = cpu_online_map;
+       send_IPI_mask_allbutself(cpumask, vector);
+}
 
-       cpu_clear(smp_processor_id(), allbutme);
-       physflat_send_IPI_mask(allbutme, vector);
+static void physflat_send_IPI_allbutself(int vector)
+{
+       send_IPI_mask_allbutself(cpu_online_mask, vector);
 }
 
 static void physflat_send_IPI_all(int vector)
 {
-       physflat_send_IPI_mask(cpu_online_map, vector);
+       physflat_send_IPI_mask(cpu_online_mask, vector);
 }
 
-static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
        int cpu;
 
@@ -224,13 +259,29 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
         * We're using fixed IRQ delivery, can only return one phys APIC ID.
         * May as well be the first.
         */
-       cpu = first_cpu(cpumask);
+       cpu = cpumask_first(cpumask);
        if ((unsigned)cpu < nr_cpu_ids)
                return per_cpu(x86_cpu_to_apicid, cpu);
        else
                return BAD_APICID;
 }
 
+static unsigned int
+physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                               const struct cpumask *andmask)
+{
+       int cpu;
+
+       /*
+        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * May as well be the first.
+        */
+       cpu = cpumask_any_and(cpumask, andmask);
+       if (cpu < nr_cpu_ids)
+               return per_cpu(x86_cpu_to_apicid, cpu);
+       return BAD_APICID;
+}
+
 struct genapic apic_physflat =  {
        .name = "physical flat",
        .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
@@ -243,8 +294,10 @@ struct genapic apic_physflat =  {
        .send_IPI_all = physflat_send_IPI_all,
        .send_IPI_allbutself = physflat_send_IPI_allbutself,
        .send_IPI_mask = physflat_send_IPI_mask,
+       .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
        .send_IPI_self = apic_send_IPI_self,
        .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
+       .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
index f6a2c8eb48a6bb1706212636246539d5f1ce64dc..4716a0c9f9369a3c721a147aadb423850d80035e 100644 (file)
@@ -22,19 +22,18 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
 
-static cpumask_t x2apic_target_cpus(void)
+static const struct cpumask *x2apic_target_cpus(void)
 {
-       return cpumask_of_cpu(0);
+       return cpumask_of(0);
 }
 
 /*
  * for now each logical cpu is in its own vector allocation domain.
  */
-static cpumask_t x2apic_vector_allocation_domain(int cpu)
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-       cpumask_t domain = CPU_MASK_NONE;
-       cpu_set(cpu, domain);
-       return domain;
+       cpumask_clear(retmask);
+       cpumask_set_cpu(cpu, retmask);
 }
 
 static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -56,32 +55,53 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
  * at once. We have 16 cpu's in a cluster. This will minimize IPI register
  * writes.
  */
-static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
 {
        unsigned long flags;
        unsigned long query_cpu;
 
        local_irq_save(flags);
-       for_each_cpu_mask(query_cpu, mask) {
-               __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-                                      vector, APIC_DEST_LOGICAL);
-       }
+       for_each_cpu(query_cpu, mask)
+               __x2apic_send_IPI_dest(
+                       per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+                       vector, APIC_DEST_LOGICAL);
        local_irq_restore(flags);
 }
 
-static void x2apic_send_IPI_allbutself(int vector)
+static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
+                                           int vector)
 {
-       cpumask_t mask = cpu_online_map;
+       unsigned long flags;
+       unsigned long query_cpu;
+       unsigned long this_cpu = smp_processor_id();
 
-       cpu_clear(smp_processor_id(), mask);
+       local_irq_save(flags);
+       for_each_cpu(query_cpu, mask)
+               if (query_cpu != this_cpu)
+                       __x2apic_send_IPI_dest(
+                               per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+                               vector, APIC_DEST_LOGICAL);
+       local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+       unsigned long flags;
+       unsigned long query_cpu;
+       unsigned long this_cpu = smp_processor_id();
 
-       if (!cpus_empty(mask))
-               x2apic_send_IPI_mask(mask, vector);
+       local_irq_save(flags);
+       for_each_online_cpu(query_cpu)
+               if (query_cpu != this_cpu)
+                       __x2apic_send_IPI_dest(
+                               per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+                               vector, APIC_DEST_LOGICAL);
+       local_irq_restore(flags);
 }
 
 static void x2apic_send_IPI_all(int vector)
 {
-       x2apic_send_IPI_mask(cpu_online_map, vector);
+       x2apic_send_IPI_mask(cpu_online_mask, vector);
 }
 
 static int x2apic_apic_id_registered(void)
@@ -89,7 +109,7 @@ static int x2apic_apic_id_registered(void)
        return 1;
 }
 
-static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
        int cpu;
 
@@ -97,13 +117,28 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
         * We're using fixed IRQ delivery, can only return one phys APIC ID.
         * May as well be the first.
         */
-       cpu = first_cpu(cpumask);
-       if ((unsigned)cpu < NR_CPUS)
+       cpu = cpumask_first(cpumask);
+       if ((unsigned)cpu < nr_cpu_ids)
                return per_cpu(x86_cpu_to_logical_apicid, cpu);
        else
                return BAD_APICID;
 }
 
+static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                 const struct cpumask *andmask)
+{
+       int cpu;
+
+       /*
+        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * May as well be the first.
+        */
+       cpu = cpumask_any_and(cpumask, andmask);
+       if (cpu < nr_cpu_ids)
+               return per_cpu(x86_cpu_to_apicid, cpu);
+       return BAD_APICID;
+}
+
 static unsigned int get_apic_id(unsigned long x)
 {
        unsigned int id;
@@ -150,8 +185,10 @@ struct genapic apic_x2apic_cluster = {
        .send_IPI_all = x2apic_send_IPI_all,
        .send_IPI_allbutself = x2apic_send_IPI_allbutself,
        .send_IPI_mask = x2apic_send_IPI_mask,
+       .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
        .send_IPI_self = x2apic_send_IPI_self,
        .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+       .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
index d042211768b74243f0c7b7dbfd321be707109f04..b255507884f219487368b067034e4b5fc01d3871 100644 (file)
@@ -29,16 +29,15 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
 
-static cpumask_t x2apic_target_cpus(void)
+static const struct cpumask *x2apic_target_cpus(void)
 {
-       return cpumask_of_cpu(0);
+       return cpumask_of(0);
 }
 
-static cpumask_t x2apic_vector_allocation_domain(int cpu)
+static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-       cpumask_t domain = CPU_MASK_NONE;
-       cpu_set(cpu, domain);
-       return domain;
+       cpumask_clear(retmask);
+       cpumask_set_cpu(cpu, retmask);
 }
 
 static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
@@ -54,32 +53,54 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
        x2apic_icr_write(cfg, apicid);
 }
 
-static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
 {
        unsigned long flags;
        unsigned long query_cpu;
 
        local_irq_save(flags);
-       for_each_cpu_mask(query_cpu, mask) {
+       for_each_cpu(query_cpu, mask) {
                __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
                                       vector, APIC_DEST_PHYSICAL);
        }
        local_irq_restore(flags);
 }
 
-static void x2apic_send_IPI_allbutself(int vector)
+static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask,
+                                           int vector)
 {
-       cpumask_t mask = cpu_online_map;
+       unsigned long flags;
+       unsigned long query_cpu;
+       unsigned long this_cpu = smp_processor_id();
+
+       local_irq_save(flags);
+       for_each_cpu(query_cpu, mask) {
+               if (query_cpu != this_cpu)
+                       __x2apic_send_IPI_dest(
+                               per_cpu(x86_cpu_to_apicid, query_cpu),
+                               vector, APIC_DEST_PHYSICAL);
+       }
+       local_irq_restore(flags);
+}
 
-       cpu_clear(smp_processor_id(), mask);
+static void x2apic_send_IPI_allbutself(int vector)
+{
+       unsigned long flags;
+       unsigned long query_cpu;
+       unsigned long this_cpu = smp_processor_id();
 
-       if (!cpus_empty(mask))
-               x2apic_send_IPI_mask(mask, vector);
+       local_irq_save(flags);
+       for_each_online_cpu(query_cpu)
+               if (query_cpu != this_cpu)
+                       __x2apic_send_IPI_dest(
+                               per_cpu(x86_cpu_to_apicid, query_cpu),
+                               vector, APIC_DEST_PHYSICAL);
+       local_irq_restore(flags);
 }
 
 static void x2apic_send_IPI_all(int vector)
 {
-       x2apic_send_IPI_mask(cpu_online_map, vector);
+       x2apic_send_IPI_mask(cpu_online_mask, vector);
 }
 
 static int x2apic_apic_id_registered(void)
@@ -87,7 +108,7 @@ static int x2apic_apic_id_registered(void)
        return 1;
 }
 
-static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
        int cpu;
 
@@ -95,13 +116,28 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
         * We're using fixed IRQ delivery, can only return one phys APIC ID.
         * May as well be the first.
         */
-       cpu = first_cpu(cpumask);
-       if ((unsigned)cpu < NR_CPUS)
+       cpu = cpumask_first(cpumask);
+       if ((unsigned)cpu < nr_cpu_ids)
                return per_cpu(x86_cpu_to_apicid, cpu);
        else
                return BAD_APICID;
 }
 
+static unsigned int x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                                 const struct cpumask *andmask)
+{
+       int cpu;
+
+       /*
+        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * May as well be the first.
+        */
+       cpu = cpumask_any_and(cpumask, andmask);
+       if (cpu < nr_cpu_ids)
+               return per_cpu(x86_cpu_to_apicid, cpu);
+       return BAD_APICID;
+}
+
 static unsigned int get_apic_id(unsigned long x)
 {
        unsigned int id;
@@ -145,8 +181,10 @@ struct genapic apic_x2apic_phys = {
        .send_IPI_all = x2apic_send_IPI_all,
        .send_IPI_allbutself = x2apic_send_IPI_allbutself,
        .send_IPI_mask = x2apic_send_IPI_mask,
+       .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
        .send_IPI_self = x2apic_send_IPI_self,
        .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+       .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
index 2c7dbdb98278316a6fd7228e9a7b2eaf76f58562..3984682cd84976b06a15c695f199116b130ed33c 100644 (file)
@@ -75,16 +75,15 @@ EXPORT_SYMBOL(sn_rtc_cycles_per_second);
 
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
 
-static cpumask_t uv_target_cpus(void)
+static const struct cpumask *uv_target_cpus(void)
 {
-       return cpumask_of_cpu(0);
+       return cpumask_of(0);
 }
 
-static cpumask_t uv_vector_allocation_domain(int cpu)
+static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
 {
-       cpumask_t domain = CPU_MASK_NONE;
-       cpu_set(cpu, domain);
-       return domain;
+       cpumask_clear(retmask);
+       cpumask_set_cpu(cpu, retmask);
 }
 
 int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
@@ -123,28 +122,37 @@ static void uv_send_IPI_one(int cpu, int vector)
        uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
 }
 
-static void uv_send_IPI_mask(cpumask_t mask, int vector)
+static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
 {
        unsigned int cpu;
 
-       for_each_possible_cpu(cpu)
-               if (cpu_isset(cpu, mask))
+       for_each_cpu(cpu, mask)
+               uv_send_IPI_one(cpu, vector);
+}
+
+static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+       unsigned int cpu;
+       unsigned int this_cpu = smp_processor_id();
+
+       for_each_cpu(cpu, mask)
+               if (cpu != this_cpu)
                        uv_send_IPI_one(cpu, vector);
 }
 
 static void uv_send_IPI_allbutself(int vector)
 {
-       cpumask_t mask = cpu_online_map;
-
-       cpu_clear(smp_processor_id(), mask);
+       unsigned int cpu;
+       unsigned int this_cpu = smp_processor_id();
 
-       if (!cpus_empty(mask))
-               uv_send_IPI_mask(mask, vector);
+       for_each_online_cpu(cpu)
+               if (cpu != this_cpu)
+                       uv_send_IPI_one(cpu, vector);
 }
 
 static void uv_send_IPI_all(int vector)
 {
-       uv_send_IPI_mask(cpu_online_map, vector);
+       uv_send_IPI_mask(cpu_online_mask, vector);
 }
 
 static int uv_apic_id_registered(void)
@@ -156,7 +164,7 @@ static void uv_init_apic_ldr(void)
 {
 }
 
-static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
+static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
 {
        int cpu;
 
@@ -164,13 +172,28 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
         * We're using fixed IRQ delivery, can only return one phys APIC ID.
         * May as well be the first.
         */
-       cpu = first_cpu(cpumask);
+       cpu = cpumask_first(cpumask);
        if ((unsigned)cpu < nr_cpu_ids)
                return per_cpu(x86_cpu_to_apicid, cpu);
        else
                return BAD_APICID;
 }
 
+static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+                                             const struct cpumask *andmask)
+{
+       int cpu;
+
+       /*
+        * We're using fixed IRQ delivery, can only return one phys APIC ID.
+        * May as well be the first.
+        */
+       cpu = cpumask_any_and(cpumask, andmask);
+       if (cpu < nr_cpu_ids)
+               return per_cpu(x86_cpu_to_apicid, cpu);
+       return BAD_APICID;
+}
+
 static unsigned int get_apic_id(unsigned long x)
 {
        unsigned int id;
@@ -218,8 +241,10 @@ struct genapic apic_x2apic_uv_x = {
        .send_IPI_all = uv_send_IPI_all,
        .send_IPI_allbutself = uv_send_IPI_allbutself,
        .send_IPI_mask = uv_send_IPI_mask,
+       .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
        .send_IPI_self = uv_send_IPI_self,
        .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
+       .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
        .phys_pkg_id = phys_pkg_id,
        .get_apic_id = get_apic_id,
        .set_apic_id = set_apic_id,
index 067d8de913f612d45018090a1fae4e5cc920462f..e76d7e272974ffebd37a5010b2d2f4b3635c1bb7 100644 (file)
@@ -246,7 +246,7 @@ static void hpet_legacy_clockevent_register(void)
         * Start hpet with the boot cpu mask and make it
         * global after the IO_APIC has been initialized.
         */
-       hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+       hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
        clockevents_register_device(&hpet_clockevent);
        global_clock_event = &hpet_clockevent;
        printk(KERN_DEBUG "hpet clockevent registered\n");
@@ -301,7 +301,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
                        struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
                        hpet_setup_msi_irq(hdev->irq);
                        disable_irq(hdev->irq);
-                       irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
+                       irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
                        enable_irq(hdev->irq);
                }
                break;
@@ -449,7 +449,7 @@ static int hpet_setup_irq(struct hpet_dev *dev)
                return -1;
 
        disable_irq(dev->irq);
-       irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
+       irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
        enable_irq(dev->irq);
 
        printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
@@ -500,7 +500,7 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
        /* 5 usec minimum reprogramming delta. */
        evt->min_delta_ns = 5000;
 
-       evt->cpumask = cpumask_of_cpu(hdev->cpu);
+       evt->cpumask = cpumask_of(hdev->cpu);
        clockevents_register_device(evt);
 }
 
index c1b5e3ece1f241f9ef8716abd02d98ad6721ffa7..10f92fb532f34e1c0e0c6a93059fa845eed7049c 100644 (file)
@@ -114,7 +114,7 @@ void __init setup_pit_timer(void)
         * Start pit with the boot cpu mask and make it global after the
         * IO_APIC has been initialized.
         */
-       pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+       pit_clockevent.cpumask = cpumask_of(smp_processor_id());
        pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
                                     pit_clockevent.shift);
        pit_clockevent.max_delta_ns =
index 9043251210fba4dc32ea7db009a1db6196216453..58938cc4b7d38e0908572c8fb147dcf1f027d149 100644 (file)
@@ -108,94 +108,273 @@ static int __init parse_noapic(char *str)
 early_param("noapic", parse_noapic);
 
 struct irq_pin_list;
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+struct irq_pin_list {
+       int apic, pin;
+       struct irq_pin_list *next;
+};
+
+static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+{
+       struct irq_pin_list *pin;
+       int node;
+
+       node = cpu_to_node(cpu);
+
+       pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
+       printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node);
+
+       return pin;
+}
+
 struct irq_cfg {
-       unsigned int irq;
        struct irq_pin_list *irq_2_pin;
-       cpumask_t domain;
-       cpumask_t old_domain;
+       cpumask_var_t domain;
+       cpumask_var_t old_domain;
        unsigned move_cleanup_count;
        u8 vector;
        u8 move_in_progress : 1;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+       u8 move_desc_pending : 1;
+#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_cfg irq_cfgx[] = {
+#else
 static struct irq_cfg irq_cfgx[NR_IRQS] = {
-       [0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-       [1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-       [2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-       [3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-       [4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-       [5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-       [6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-       [7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-       [8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-       [9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-       [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-       [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-       [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-       [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-       [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-       [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+#endif
+       [0]  = { .vector = IRQ0_VECTOR,  },
+       [1]  = { .vector = IRQ1_VECTOR,  },
+       [2]  = { .vector = IRQ2_VECTOR,  },
+       [3]  = { .vector = IRQ3_VECTOR,  },
+       [4]  = { .vector = IRQ4_VECTOR,  },
+       [5]  = { .vector = IRQ5_VECTOR,  },
+       [6]  = { .vector = IRQ6_VECTOR,  },
+       [7]  = { .vector = IRQ7_VECTOR,  },
+       [8]  = { .vector = IRQ8_VECTOR,  },
+       [9]  = { .vector = IRQ9_VECTOR,  },
+       [10] = { .vector = IRQ10_VECTOR, },
+       [11] = { .vector = IRQ11_VECTOR, },
+       [12] = { .vector = IRQ12_VECTOR, },
+       [13] = { .vector = IRQ13_VECTOR, },
+       [14] = { .vector = IRQ14_VECTOR, },
+       [15] = { .vector = IRQ15_VECTOR, },
 };
 
-#define for_each_irq_cfg(irq, cfg)             \
-       for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+void __init arch_early_irq_init(void)
+{
+       struct irq_cfg *cfg;
+       struct irq_desc *desc;
+       int count;
+       int i;
+
+       cfg = irq_cfgx;
+       count = ARRAY_SIZE(irq_cfgx);
+
+       for (i = 0; i < count; i++) {
+               desc = irq_to_desc(i);
+               desc->chip_data = &cfg[i];
+               alloc_bootmem_cpumask_var(&cfg[i].domain);
+               alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+               if (i < NR_IRQS_LEGACY)
+                       cpumask_setall(cfg[i].domain);
+       }
+}
 
+#ifdef CONFIG_SPARSE_IRQ
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
-       return irq < nr_irqs ? irq_cfgx + irq : NULL;
+       struct irq_cfg *cfg = NULL;
+       struct irq_desc *desc;
+
+       desc = irq_to_desc(irq);
+       if (desc)
+               cfg = desc->chip_data;
+
+       return cfg;
+}
+
+static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+{
+       struct irq_cfg *cfg;
+       int node;
+
+       node = cpu_to_node(cpu);
+
+       cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+       if (cfg) {
+               /* FIXME: needs alloc_cpumask_var_node() */
+               if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) {
+                       kfree(cfg);
+                       cfg = NULL;
+               } else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) {
+                       free_cpumask_var(cfg->domain);
+                       kfree(cfg);
+                       cfg = NULL;
+               } else {
+                       cpumask_clear(cfg->domain);
+                       cpumask_clear(cfg->old_domain);
+               }
+       }
+       printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node);
+
+       return cfg;
 }
 
-static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+void arch_init_chip_data(struct irq_desc *desc, int cpu)
 {
-       return irq_cfg(irq);
+       struct irq_cfg *cfg;
+
+       cfg = desc->chip_data;
+       if (!cfg) {
+               desc->chip_data = get_one_free_irq_cfg(cpu);
+               if (!desc->chip_data) {
+                       printk(KERN_ERR "can not alloc irq_cfg\n");
+                       BUG_ON(1);
+               }
+       }
 }
 
-/*
- * Rough estimation of how many shared IRQs there are, can be changed
- * anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
 
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
+static void
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+{
+       struct irq_pin_list *old_entry, *head, *tail, *entry;
 
-struct irq_pin_list {
-       int apic, pin;
-       struct irq_pin_list *next;
-};
+       cfg->irq_2_pin = NULL;
+       old_entry = old_cfg->irq_2_pin;
+       if (!old_entry)
+               return;
+
+       entry = get_one_free_irq_2_pin(cpu);
+       if (!entry)
+               return;
+
+       entry->apic     = old_entry->apic;
+       entry->pin      = old_entry->pin;
+       head            = entry;
+       tail            = entry;
+       old_entry       = old_entry->next;
+       while (old_entry) {
+               entry = get_one_free_irq_2_pin(cpu);
+               if (!entry) {
+                       entry = head;
+                       while (entry) {
+                               head = entry->next;
+                               kfree(entry);
+                               entry = head;
+                       }
+                       /* still use the old one */
+                       return;
+               }
+               entry->apic     = old_entry->apic;
+               entry->pin      = old_entry->pin;
+               tail->next      = entry;
+               tail            = entry;
+               old_entry       = old_entry->next;
+       }
 
-static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
-static struct irq_pin_list *irq_2_pin_ptr;
+       tail->next = NULL;
+       cfg->irq_2_pin = head;
+}
 
-static void __init irq_2_pin_init(void)
+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
 {
-       struct irq_pin_list *pin = irq_2_pin_head;
-       int i;
+       struct irq_pin_list *entry, *next;
+
+       if (old_cfg->irq_2_pin == cfg->irq_2_pin)
+               return;
 
-       for (i = 1; i < PIN_MAP_SIZE; i++)
-               pin[i-1].next = &pin[i];
+       entry = old_cfg->irq_2_pin;
 
-       irq_2_pin_ptr = &pin[0];
+       while (entry) {
+               next = entry->next;
+               kfree(entry);
+               entry = next;
+       }
+       old_cfg->irq_2_pin = NULL;
 }
 
-static struct irq_pin_list *get_one_free_irq_2_pin(void)
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+                                struct irq_desc *desc, int cpu)
 {
-       struct irq_pin_list *pin = irq_2_pin_ptr;
+       struct irq_cfg *cfg;
+       struct irq_cfg *old_cfg;
 
-       if (!pin)
-               panic("can not get more irq_2_pin\n");
+       cfg = get_one_free_irq_cfg(cpu);
 
-       irq_2_pin_ptr = pin->next;
-       pin->next = NULL;
-       return pin;
+       if (!cfg)
+               return;
+
+       desc->chip_data = cfg;
+
+       old_cfg = old_desc->chip_data;
+
+       memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+       init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *old_cfg)
+{
+       kfree(old_cfg);
 }
 
+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+       struct irq_cfg *old_cfg, *cfg;
+
+       old_cfg = old_desc->chip_data;
+       cfg = desc->chip_data;
+
+       if (old_cfg == cfg)
+               return;
+
+       if (old_cfg) {
+               free_irq_2_pin(old_cfg, cfg);
+               free_irq_cfg(old_cfg);
+               old_desc->chip_data = NULL;
+       }
+}
+
+static void
+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+       struct irq_cfg *cfg = desc->chip_data;
+
+       if (!cfg->move_in_progress) {
+               /* it means that domain is not changed */
+               if (!cpumask_intersects(&desc->affinity, mask))
+                       cfg->move_desc_pending = 1;
+       }
+}
+#endif
+
+#else
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+       return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+#endif
+
+#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
+static inline void
+set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+}
+#endif
+
 struct io_apic {
        unsigned int index;
        unsigned int unused[3];
@@ -237,11 +416,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
        writel(value, &io_apic->data);
 }
 
-static bool io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
 {
        struct irq_pin_list *entry;
        unsigned long flags;
-       struct irq_cfg *cfg = irq_cfg(irq);
 
        spin_lock_irqsave(&ioapic_lock, flags);
        entry = cfg->irq_2_pin;
@@ -323,13 +501,32 @@ static void ioapic_mask_entry(int apic, int pin)
 }
 
 #ifdef CONFIG_SMP
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+       cpumask_var_t cleanup_mask;
+
+       if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+               unsigned int i;
+               cfg->move_cleanup_count = 0;
+               for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+                       cfg->move_cleanup_count++;
+               for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+                       send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+       } else {
+               cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+               cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+               send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+               free_cpumask_var(cleanup_mask);
+       }
+       cfg->move_in_progress = 0;
+}
+
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
 {
        int apic, pin;
-       struct irq_cfg *cfg;
        struct irq_pin_list *entry;
+       u8 vector = cfg->vector;
 
-       cfg = irq_cfg(irq);
        entry = cfg->irq_2_pin;
        for (;;) {
                unsigned int reg;
@@ -359,36 +556,61 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
        }
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask);
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
 
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+/*
+ * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
+ * of that, or returns BAD_APICID and leaves desc->affinity untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+{
+       struct irq_cfg *cfg;
+       unsigned int irq;
+
+       if (!cpumask_intersects(mask, cpu_online_mask))
+               return BAD_APICID;
+
+       irq = desc->irq;
+       cfg = desc->chip_data;
+       if (assign_irq_vector(irq, cfg, mask))
+               return BAD_APICID;
+
+       cpumask_and(&desc->affinity, cfg->domain, mask);
+       set_extra_move_desc(desc, mask);
+       return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+}
+
+static void
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
        struct irq_cfg *cfg;
        unsigned long flags;
        unsigned int dest;
-       cpumask_t tmp;
-       struct irq_desc *desc;
+       unsigned int irq;
 
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
-               return;
+       irq = desc->irq;
+       cfg = desc->chip_data;
 
-       cfg = irq_cfg(irq);
-       if (assign_irq_vector(irq, mask))
-               return;
+       spin_lock_irqsave(&ioapic_lock, flags);
+       dest = set_desc_affinity(desc, mask);
+       if (dest != BAD_APICID) {
+               /* Only the high 8 bits are valid. */
+               dest = SET_APIC_LOGICAL_ID(dest);
+               __target_IO_APIC_irq(irq, dest, cfg);
+       }
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
 
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
-       /*
-        * Only the high 8 bits are valid.
-        */
-       dest = SET_APIC_LOGICAL_ID(dest);
+static void
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
+{
+       struct irq_desc *desc;
 
        desc = irq_to_desc(irq);
-       spin_lock_irqsave(&ioapic_lock, flags);
-       __target_IO_APIC_irq(irq, dest, cfg->vector);
-       desc->affinity = mask;
-       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       set_ioapic_affinity_irq_desc(desc, mask);
 }
 #endif /* CONFIG_SMP */
 
@@ -397,16 +619,18 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 {
-       struct irq_cfg *cfg;
        struct irq_pin_list *entry;
 
-       /* first time to refer irq_cfg, so with new */
-       cfg = irq_cfg_alloc(irq);
        entry = cfg->irq_2_pin;
        if (!entry) {
-               entry = get_one_free_irq_2_pin();
+               entry = get_one_free_irq_2_pin(cpu);
+               if (!entry) {
+                       printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
+                                       apic, pin);
+                       return;
+               }
                cfg->irq_2_pin = entry;
                entry->apic = apic;
                entry->pin = pin;
@@ -421,7 +645,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
                entry = entry->next;
        }
 
-       entry->next = get_one_free_irq_2_pin();
+       entry->next = get_one_free_irq_2_pin(cpu);
        entry = entry->next;
        entry->apic = apic;
        entry->pin = pin;
@@ -430,11 +654,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq(unsigned int irq,
+static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
                                      int oldapic, int oldpin,
                                      int newapic, int newpin)
 {
-       struct irq_cfg *cfg = irq_cfg(irq);
        struct irq_pin_list *entry = cfg->irq_2_pin;
        int replaced = 0;
 
@@ -451,18 +674,16 @@ static void __init replace_pin_at_irq(unsigned int irq,
 
        /* why? call replace before add? */
        if (!replaced)
-               add_pin_to_irq(irq, newapic, newpin);
+               add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
 }
 
-static inline void io_apic_modify_irq(unsigned int irq,
+static inline void io_apic_modify_irq(struct irq_cfg *cfg,
                                int mask_and, int mask_or,
                                void (*final)(struct irq_pin_list *entry))
 {
        int pin;
-       struct irq_cfg *cfg;
        struct irq_pin_list *entry;
 
-       cfg = irq_cfg(irq);
        for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
                unsigned int reg;
                pin = entry->pin;
@@ -475,9 +696,9 @@ static inline void io_apic_modify_irq(unsigned int irq,
        }
 }
 
-static void __unmask_IO_APIC_irq(unsigned int irq)
+static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-       io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+       io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 
 #ifdef CONFIG_X86_64
@@ -492,47 +713,64 @@ void io_apic_sync(struct irq_pin_list *entry)
        readl(&io_apic->data);
 }
 
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-       io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+       io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 }
 #else /* CONFIG_X86_32 */
-static void __mask_IO_APIC_irq(unsigned int irq)
+static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
 {
-       io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+       io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
 {
-       io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+       io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
                        IO_APIC_REDIR_MASKED, NULL);
 }
 
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
 {
-       io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+       io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
                        IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
 }
 #endif /* CONFIG_X86_32 */
 
-static void mask_IO_APIC_irq (unsigned int irq)
+static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
 {
+       struct irq_cfg *cfg = desc->chip_data;
        unsigned long flags;
 
+       BUG_ON(!cfg);
+
        spin_lock_irqsave(&ioapic_lock, flags);
-       __mask_IO_APIC_irq(irq);
+       __mask_IO_APIC_irq(cfg);
        spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void unmask_IO_APIC_irq (unsigned int irq)
+static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
 {
+       struct irq_cfg *cfg = desc->chip_data;
        unsigned long flags;
 
        spin_lock_irqsave(&ioapic_lock, flags);
-       __unmask_IO_APIC_irq(irq);
+       __unmask_IO_APIC_irq(cfg);
        spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
+static void mask_IO_APIC_irq(unsigned int irq)
+{
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       mask_IO_APIC_irq_desc(desc);
+}
+static void unmask_IO_APIC_irq(unsigned int irq)
+{
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       unmask_IO_APIC_irq_desc(desc);
+}
+
 static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
 {
        struct IO_APIC_route_entry entry;
@@ -809,7 +1047,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
  */
 static int EISA_ELCR(unsigned int irq)
 {
-       if (irq < 16) {
+       if (irq < NR_IRQS_LEGACY) {
                unsigned int port = 0x4d0 + (irq >> 3);
                return (inb(port) >> (irq & 7)) & 1;
        }
@@ -1034,7 +1272,8 @@ void unlock_vector_lock(void)
        spin_unlock(&vector_lock);
 }
 
-static int __assign_irq_vector(int irq, cpumask_t mask)
+static int
+__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 {
        /*
         * NOTE! The local APIC isn't very good at handling
@@ -1049,39 +1288,39 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
         */
        static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
        unsigned int old_vector;
-       int cpu;
-       struct irq_cfg *cfg;
-
-       cfg = irq_cfg(irq);
-
-       /* Only try and allocate irqs on cpus that are present */
-       cpus_and(mask, mask, cpu_online_map);
+       int cpu, err;
+       cpumask_var_t tmp_mask;
 
        if ((cfg->move_in_progress) || cfg->move_cleanup_count)
                return -EBUSY;
 
+       if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+               return -ENOMEM;
+
        old_vector = cfg->vector;
        if (old_vector) {
-               cpumask_t tmp;
-               cpus_and(tmp, cfg->domain, mask);
-               if (!cpus_empty(tmp))
+               cpumask_and(tmp_mask, mask, cpu_online_mask);
+               cpumask_and(tmp_mask, cfg->domain, tmp_mask);
+               if (!cpumask_empty(tmp_mask)) {
+                       free_cpumask_var(tmp_mask);
                        return 0;
+               }
        }
 
-       for_each_cpu_mask_nr(cpu, mask) {
-               cpumask_t domain, new_mask;
+       /* Only try and allocate irqs on cpus that are present */
+       err = -ENOSPC;
+       for_each_cpu_and(cpu, mask, cpu_online_mask) {
                int new_cpu;
                int vector, offset;
 
-               domain = vector_allocation_domain(cpu);
-               cpus_and(new_mask, domain, cpu_online_map);
+               vector_allocation_domain(cpu, tmp_mask);
 
                vector = current_vector;
                offset = current_offset;
 next:
                vector += 8;
                if (vector >= first_system_vector) {
-                       /* If we run out of vectors on large boxen, must share them. */
+                       /* If out of vectors on large boxen, must share them. */
                        offset = (offset + 1) % 8;
                        vector = FIRST_DEVICE_VECTOR + offset;
                }
@@ -1094,7 +1333,7 @@ next:
                if (vector == SYSCALL_VECTOR)
                        goto next;
 #endif
-               for_each_cpu_mask_nr(new_cpu, new_mask)
+               for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
                        if (per_cpu(vector_irq, new_cpu)[vector] != -1)
                                goto next;
                /* Found one! */
@@ -1102,49 +1341,47 @@ next:
                current_offset = offset;
                if (old_vector) {
                        cfg->move_in_progress = 1;
-                       cfg->old_domain = cfg->domain;
+                       cpumask_copy(cfg->old_domain, cfg->domain);
                }
-               for_each_cpu_mask_nr(new_cpu, new_mask)
+               for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
                        per_cpu(vector_irq, new_cpu)[vector] = irq;
                cfg->vector = vector;
-               cfg->domain = domain;
-               return 0;
+               cpumask_copy(cfg->domain, tmp_mask);
+               err = 0;
+               break;
        }
-       return -ENOSPC;
+       free_cpumask_var(tmp_mask);
+       return err;
 }
 
-static int assign_irq_vector(int irq, cpumask_t mask)
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 {
        int err;
        unsigned long flags;
 
        spin_lock_irqsave(&vector_lock, flags);
-       err = __assign_irq_vector(irq, mask);
+       err = __assign_irq_vector(irq, cfg, mask);
        spin_unlock_irqrestore(&vector_lock, flags);
        return err;
 }
 
-static void __clear_irq_vector(int irq)
+static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 {
-       struct irq_cfg *cfg;
-       cpumask_t mask;
        int cpu, vector;
 
-       cfg = irq_cfg(irq);
        BUG_ON(!cfg->vector);
 
        vector = cfg->vector;
-       cpus_and(mask, cfg->domain, cpu_online_map);
-       for_each_cpu_mask_nr(cpu, mask)
+       for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
                per_cpu(vector_irq, cpu)[vector] = -1;
 
        cfg->vector = 0;
-       cpus_clear(cfg->domain);
+       cpumask_clear(cfg->domain);
 
        if (likely(!cfg->move_in_progress))
                return;
-       cpus_and(mask, cfg->old_domain, cpu_online_map);
-       for_each_cpu_mask_nr(cpu, mask) {
+       for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
                for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
                                                                vector++) {
                        if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -1162,10 +1399,14 @@ void __setup_vector_irq(int cpu)
        /* This function must be called with vector_lock held */
        int irq, vector;
        struct irq_cfg *cfg;
+       struct irq_desc *desc;
 
        /* Mark the inuse vectors */
-       for_each_irq_cfg(irq, cfg) {
-               if (!cpu_isset(cpu, cfg->domain))
+       for_each_irq_desc(irq, desc) {
+               if (!desc)
+                       continue;
+               cfg = desc->chip_data;
+               if (!cpumask_test_cpu(cpu, cfg->domain))
                        continue;
                vector = cfg->vector;
                per_cpu(vector_irq, cpu)[vector] = irq;
@@ -1177,7 +1418,7 @@ void __setup_vector_irq(int cpu)
                        continue;
 
                cfg = irq_cfg(irq);
-               if (!cpu_isset(cpu, cfg->domain))
+               if (!cpumask_test_cpu(cpu, cfg->domain))
                        per_cpu(vector_irq, cpu)[vector] = -1;
        }
 }
@@ -1215,11 +1456,8 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 #endif
 
-static void ioapic_register_intr(int irq, unsigned long trigger)
+static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
 {
-       struct irq_desc *desc;
-
-       desc = irq_to_desc(irq);
 
        if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
            trigger == IOAPIC_LEVEL)
@@ -1311,23 +1549,22 @@ static int setup_ioapic_entry(int apic, int irq,
        return 0;
 }
 
-static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
+static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
                              int trigger, int polarity)
 {
        struct irq_cfg *cfg;
        struct IO_APIC_route_entry entry;
-       cpumask_t mask;
+       unsigned int dest;
 
        if (!IO_APIC_IRQ(irq))
                return;
 
-       cfg = irq_cfg(irq);
+       cfg = desc->chip_data;
 
-       mask = TARGET_CPUS;
-       if (assign_irq_vector(irq, mask))
+       if (assign_irq_vector(irq, cfg, TARGET_CPUS))
                return;
 
-       cpus_and(mask, cfg->domain, mask);
+       dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
 
        apic_printk(APIC_VERBOSE,KERN_DEBUG
                    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
@@ -1337,16 +1574,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
 
 
        if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
-                              cpu_mask_to_apicid(mask), trigger, polarity,
-                              cfg->vector)) {
+                              dest, trigger, polarity, cfg->vector)) {
                printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
                       mp_ioapics[apic].mp_apicid, pin);
-               __clear_irq_vector(irq);
+               __clear_irq_vector(irq, cfg);
                return;
        }
 
-       ioapic_register_intr(irq, trigger);
-       if (irq < 16)
+       ioapic_register_intr(irq, desc, trigger);
+       if (irq < NR_IRQS_LEGACY)
                disable_8259A_irq(irq);
 
        ioapic_write_entry(apic, pin, entry);
@@ -1356,6 +1592,9 @@ static void __init setup_IO_APIC_irqs(void)
 {
        int apic, pin, idx, irq;
        int notcon = 0;
+       struct irq_desc *desc;
+       struct irq_cfg *cfg;
+       int cpu = boot_cpu_id;
 
        apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
@@ -1387,9 +1626,15 @@ static void __init setup_IO_APIC_irqs(void)
                        if (multi_timer_check(apic, irq))
                                continue;
 #endif
-                       add_pin_to_irq(irq, apic, pin);
+                       desc = irq_to_desc_alloc_cpu(irq, cpu);
+                       if (!desc) {
+                               printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+                               continue;
+                       }
+                       cfg = desc->chip_data;
+                       add_pin_to_irq_cpu(cfg, cpu, apic, pin);
 
-                       setup_IO_APIC_irq(apic, pin, irq,
+                       setup_IO_APIC_irq(apic, pin, irq, desc,
                                        irq_trigger(idx), irq_polarity(idx));
                }
        }
@@ -1448,6 +1693,7 @@ __apicdebuginit(void) print_IO_APIC(void)
        union IO_APIC_reg_03 reg_03;
        unsigned long flags;
        struct irq_cfg *cfg;
+       struct irq_desc *desc;
        unsigned int irq;
 
        if (apic_verbosity == APIC_QUIET)
@@ -1537,8 +1783,13 @@ __apicdebuginit(void) print_IO_APIC(void)
        }
        }
        printk(KERN_DEBUG "IRQ to pin mappings:\n");
-       for_each_irq_cfg(irq, cfg) {
-               struct irq_pin_list *entry = cfg->irq_2_pin;
+       for_each_irq_desc(irq, desc) {
+               struct irq_pin_list *entry;
+
+               if (!desc)
+                       continue;
+               cfg = desc->chip_data;
+               entry = cfg->irq_2_pin;
                if (!entry)
                        continue;
                printk(KERN_DEBUG "IRQ%d ", irq);
@@ -2022,14 +2273,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
 {
        int was_pending = 0;
        unsigned long flags;
+       struct irq_cfg *cfg;
 
        spin_lock_irqsave(&ioapic_lock, flags);
-       if (irq < 16) {
+       if (irq < NR_IRQS_LEGACY) {
                disable_8259A_irq(irq);
                if (i8259A_irq_pending(irq))
                        was_pending = 1;
        }
-       __unmask_IO_APIC_irq(irq);
+       cfg = irq_cfg(irq);
+       __unmask_IO_APIC_irq(cfg);
        spin_unlock_irqrestore(&ioapic_lock, flags);
 
        return was_pending;
@@ -2043,7 +2296,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
        unsigned long flags;
 
        spin_lock_irqsave(&vector_lock, flags);
-       send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
+       send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
        spin_unlock_irqrestore(&vector_lock, flags);
 
        return 1;
@@ -2092,35 +2345,35 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
  * as simple as edge triggered migration and we can do the irq migration
  * with a simple atomic update to IO-APIC RTE.
  */
-static void migrate_ioapic_irq(int irq, cpumask_t mask)
+static void
+migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
        struct irq_cfg *cfg;
-       struct irq_desc *desc;
-       cpumask_t tmp, cleanup_mask;
        struct irte irte;
        int modify_ioapic_rte;
        unsigned int dest;
        unsigned long flags;
+       unsigned int irq;
 
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
+       if (!cpumask_intersects(mask, cpu_online_mask))
                return;
 
+       irq = desc->irq;
        if (get_irte(irq, &irte))
                return;
 
-       if (assign_irq_vector(irq, mask))
+       cfg = desc->chip_data;
+       if (assign_irq_vector(irq, cfg, mask))
                return;
 
-       cfg = irq_cfg(irq);
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
+       set_extra_move_desc(desc, mask);
+
+       dest = cpu_mask_to_apicid_and(cfg->domain, mask);
 
-       desc = irq_to_desc(irq);
        modify_ioapic_rte = desc->status & IRQ_LEVEL;
        if (modify_ioapic_rte) {
                spin_lock_irqsave(&ioapic_lock, flags);
-               __target_IO_APIC_irq(irq, dest, cfg->vector);
+               __target_IO_APIC_irq(irq, dest, cfg);
                spin_unlock_irqrestore(&ioapic_lock, flags);
        }
 
@@ -2132,24 +2385,20 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
         */
        modify_irte(irq, &irte);
 
-       if (cfg->move_in_progress) {
-               cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
-               cfg->move_cleanup_count = cpus_weight(cleanup_mask);
-               send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-               cfg->move_in_progress = 0;
-       }
+       if (cfg->move_in_progress)
+               send_cleanup_vector(cfg);
 
-       desc->affinity = mask;
+       cpumask_copy(&desc->affinity, mask);
 }
 
-static int migrate_irq_remapped_level(int irq)
+static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
 {
        int ret = -1;
-       struct irq_desc *desc = irq_to_desc(irq);
+       struct irq_cfg *cfg = desc->chip_data;
 
-       mask_IO_APIC_irq(irq);
+       mask_IO_APIC_irq_desc(desc);
 
-       if (io_apic_level_ack_pending(irq)) {
+       if (io_apic_level_ack_pending(cfg)) {
                /*
                 * Interrupt in progress. Migrating irq now will change the
                 * vector information in the IO-APIC RTE and that will confuse
@@ -2161,14 +2410,15 @@ static int migrate_irq_remapped_level(int irq)
        }
 
        /* everthing is clear. we have right of way */
-       migrate_ioapic_irq(irq, desc->pending_mask);
+       migrate_ioapic_irq_desc(desc, &desc->pending_mask);
 
        ret = 0;
        desc->status &= ~IRQ_MOVE_PENDING;
-       cpus_clear(desc->pending_mask);
+       cpumask_clear(&desc->pending_mask);
 
 unmask:
-       unmask_IO_APIC_irq(irq);
+       unmask_IO_APIC_irq_desc(desc);
+
        return ret;
 }
 
@@ -2178,6 +2428,9 @@ static void ir_irq_migration(struct work_struct *work)
        struct irq_desc *desc;
 
        for_each_irq_desc(irq, desc) {
+               if (!desc)
+                       continue;
+
                if (desc->status & IRQ_MOVE_PENDING) {
                        unsigned long flags;
 
@@ -2189,7 +2442,7 @@ static void ir_irq_migration(struct work_struct *work)
                                continue;
                        }
 
-                       desc->chip->set_affinity(irq, desc->pending_mask);
+                       desc->chip->set_affinity(irq, &desc->pending_mask);
                        spin_unlock_irqrestore(&desc->lock, flags);
                }
        }
@@ -2198,18 +2451,24 @@ static void ir_irq_migration(struct work_struct *work)
 /*
  * Migrates the IRQ destination in the process context.
  */
-static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+                                           const struct cpumask *mask)
 {
-       struct irq_desc *desc = irq_to_desc(irq);
-
        if (desc->status & IRQ_LEVEL) {
                desc->status |= IRQ_MOVE_PENDING;
-               desc->pending_mask = mask;
-               migrate_irq_remapped_level(irq);
+               cpumask_copy(&desc->pending_mask, mask);
+               migrate_irq_remapped_level_desc(desc);
                return;
        }
 
-       migrate_ioapic_irq(irq, mask);
+       migrate_ioapic_irq_desc(desc, mask);
+}
+static void set_ir_ioapic_affinity_irq(unsigned int irq,
+                                      const struct cpumask *mask)
+{
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       set_ir_ioapic_affinity_irq_desc(desc, mask);
 }
 #endif
 
@@ -2229,6 +2488,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
                struct irq_cfg *cfg;
                irq = __get_cpu_var(vector_irq)[vector];
 
+               if (irq == -1)
+                       continue;
+
                desc = irq_to_desc(irq);
                if (!desc)
                        continue;
@@ -2238,7 +2500,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
                if (!cfg->move_cleanup_count)
                        goto unlock;
 
-               if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
+               if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
                        goto unlock;
 
                __get_cpu_var(vector_irq)[vector] = -1;
@@ -2250,28 +2512,44 @@ unlock:
        irq_exit();
 }
 
-static void irq_complete_move(unsigned int irq)
+static void irq_complete_move(struct irq_desc **descp)
 {
-       struct irq_cfg *cfg = irq_cfg(irq);
+       struct irq_desc *desc = *descp;
+       struct irq_cfg *cfg = desc->chip_data;
        unsigned vector, me;
 
-       if (likely(!cfg->move_in_progress))
+       if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+               if (likely(!cfg->move_desc_pending))
+                       return;
+
+               /* domain is not change, but affinity is changed */
+               me = smp_processor_id();
+               if (cpu_isset(me, desc->affinity)) {
+                       *descp = desc = move_irq_desc(desc, me);
+                       /* get the new one */
+                       cfg = desc->chip_data;
+                       cfg->move_desc_pending = 0;
+               }
+#endif
                return;
+       }
 
        vector = ~get_irq_regs()->orig_ax;
        me = smp_processor_id();
-       if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
-               cpumask_t cleanup_mask;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+               *descp = desc = move_irq_desc(desc, me);
+               /* get the new one */
+               cfg = desc->chip_data;
+#endif
 
-               cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
-               cfg->move_cleanup_count = cpus_weight(cleanup_mask);
-               send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-               cfg->move_in_progress = 0;
-       }
+       if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+               send_cleanup_vector(cfg);
 }
 #else
-static inline void irq_complete_move(unsigned int irq) {}
+static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
+
 #ifdef CONFIG_INTR_REMAP
 static void ack_x2apic_level(unsigned int irq)
 {
@@ -2282,11 +2560,14 @@ static void ack_x2apic_edge(unsigned int irq)
 {
        ack_x2APIC_irq();
 }
+
 #endif
 
 static void ack_apic_edge(unsigned int irq)
 {
-       irq_complete_move(irq);
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       irq_complete_move(&desc);
        move_native_irq(irq);
        ack_APIC_irq();
 }
@@ -2295,18 +2576,21 @@ atomic_t irq_mis_count;
 
 static void ack_apic_level(unsigned int irq)
 {
+       struct irq_desc *desc = irq_to_desc(irq);
+
 #ifdef CONFIG_X86_32
        unsigned long v;
        int i;
 #endif
+       struct irq_cfg *cfg;
        int do_unmask_irq = 0;
 
-       irq_complete_move(irq);
+       irq_complete_move(&desc);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
        /* If we are moving the irq we need to mask it */
-       if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
+       if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
                do_unmask_irq = 1;
-               mask_IO_APIC_irq(irq);
+               mask_IO_APIC_irq_desc(desc);
        }
 #endif
 
@@ -2330,7 +2614,8 @@ static void ack_apic_level(unsigned int irq)
        * operation to prevent an edge-triggered interrupt escaping meanwhile.
        * The idea is from Manfred Spraul.  --macro
        */
-       i = irq_cfg(irq)->vector;
+       cfg = desc->chip_data;
+       i = cfg->vector;
 
        v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 #endif
@@ -2369,17 +2654,18 @@ static void ack_apic_level(unsigned int irq)
                 * accurate and is causing problems then it is a hardware bug
                 * and you can go talk to the chipset vendor about it.
                 */
-               if (!io_apic_level_ack_pending(irq))
+               cfg = desc->chip_data;
+               if (!io_apic_level_ack_pending(cfg))
                        move_masked_irq(irq);
-               unmask_IO_APIC_irq(irq);
+               unmask_IO_APIC_irq_desc(desc);
        }
 
 #ifdef CONFIG_X86_32
        if (!(v & (1 << (i & 0x1f)))) {
                atomic_inc(&irq_mis_count);
                spin_lock(&ioapic_lock);
-               __mask_and_edge_IO_APIC_irq(irq);
-               __unmask_and_level_IO_APIC_irq(irq);
+               __mask_and_edge_IO_APIC_irq(cfg);
+               __unmask_and_level_IO_APIC_irq(cfg);
                spin_unlock(&ioapic_lock);
        }
 #endif
@@ -2430,20 +2716,22 @@ static inline void init_IO_APIC_traps(void)
         * Also, we've got to be careful not to trash gate
         * 0x80, because int 0x80 is hm, kind of importantish. ;)
         */
-       for_each_irq_cfg(irq, cfg) {
-               if (IO_APIC_IRQ(irq) && !cfg->vector) {
+       for_each_irq_desc(irq, desc) {
+               if (!desc)
+                       continue;
+
+               cfg = desc->chip_data;
+               if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
                        /*
                         * Hmm.. We don't have an entry for this,
                         * so default to an old-fashioned 8259
                         * interrupt if we can..
                         */
-                       if (irq < 16)
+                       if (irq < NR_IRQS_LEGACY)
                                make_8259A_irq(irq);
-                       else {
-                               desc = irq_to_desc(irq);
+                       else
                                /* Strange. Oh, well.. */
                                desc->chip = &no_irq_chip;
-                       }
                }
        }
 }
@@ -2468,7 +2756,7 @@ static void unmask_lapic_irq(unsigned int irq)
        apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
-static void ack_lapic_irq (unsigned int irq)
+static void ack_lapic_irq(unsigned int irq)
 {
        ack_APIC_irq();
 }
@@ -2480,11 +2768,8 @@ static struct irq_chip lapic_chip __read_mostly = {
        .ack            = ack_lapic_irq,
 };
 
-static void lapic_register_intr(int irq)
+static void lapic_register_intr(int irq, struct irq_desc *desc)
 {
-       struct irq_desc *desc;
-
-       desc = irq_to_desc(irq);
        desc->status &= ~IRQ_LEVEL;
        set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
                                      "edge");
@@ -2588,7 +2873,9 @@ int timer_through_8259 __initdata;
  */
 static inline void __init check_timer(void)
 {
-       struct irq_cfg *cfg = irq_cfg(0);
+       struct irq_desc *desc = irq_to_desc(0);
+       struct irq_cfg *cfg = desc->chip_data;
+       int cpu = boot_cpu_id;
        int apic1, pin1, apic2, pin2;
        unsigned long flags;
        unsigned int ver;
@@ -2603,7 +2890,7 @@ static inline void __init check_timer(void)
         * get/set the timer IRQ vector:
         */
        disable_8259A_irq(0);
-       assign_irq_vector(0, TARGET_CPUS);
+       assign_irq_vector(0, cfg, TARGET_CPUS);
 
        /*
         * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2654,10 +2941,10 @@ static inline void __init check_timer(void)
                 * Ok, does IRQ0 through the IOAPIC work?
                 */
                if (no_pin1) {
-                       add_pin_to_irq(0, apic1, pin1);
+                       add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
                        setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
                }
-               unmask_IO_APIC_irq(0);
+               unmask_IO_APIC_irq_desc(desc);
                if (timer_irq_works()) {
                        if (nmi_watchdog == NMI_IO_APIC) {
                                setup_nmi();
@@ -2683,9 +2970,9 @@ static inline void __init check_timer(void)
                /*
                 * legacy devices should be connected to IO APIC #0
                 */
-               replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
+               replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
                setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
-               unmask_IO_APIC_irq(0);
+               unmask_IO_APIC_irq_desc(desc);
                enable_8259A_irq(0);
                if (timer_irq_works()) {
                        apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2717,7 +3004,7 @@ static inline void __init check_timer(void)
        apic_printk(APIC_QUIET, KERN_INFO
                    "...trying to set up timer as Virtual Wire IRQ...\n");
 
-       lapic_register_intr(0);
+       lapic_register_intr(0, desc);
        apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);     /* Fixed mode */
        enable_8259A_irq(0);
 
@@ -2902,22 +3189,26 @@ unsigned int create_irq_nr(unsigned int irq_want)
        unsigned int irq;
        unsigned int new;
        unsigned long flags;
-       struct irq_cfg *cfg_new;
-
-       irq_want = nr_irqs - 1;
+       struct irq_cfg *cfg_new = NULL;
+       int cpu = boot_cpu_id;
+       struct irq_desc *desc_new = NULL;
 
        irq = 0;
        spin_lock_irqsave(&vector_lock, flags);
-       for (new = irq_want; new > 0; new--) {
+       for (new = irq_want; new < NR_IRQS; new++) {
                if (platform_legacy_irq(new))
                        continue;
-               cfg_new = irq_cfg(new);
-               if (cfg_new && cfg_new->vector != 0)
+
+               desc_new = irq_to_desc_alloc_cpu(new, cpu);
+               if (!desc_new) {
+                       printk(KERN_INFO "can not get irq_desc for %d\n", new);
                        continue;
-               /* check if need to create one */
-               if (!cfg_new)
-                       cfg_new = irq_cfg_alloc(new);
-               if (__assign_irq_vector(new, TARGET_CPUS) == 0)
+               }
+               cfg_new = desc_new->chip_data;
+
+               if (cfg_new->vector != 0)
+                       continue;
+               if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
                        irq = new;
                break;
        }
@@ -2925,15 +3216,21 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
        if (irq > 0) {
                dynamic_irq_init(irq);
+               /* restore it, in case dynamic_irq_init clear it */
+               if (desc_new)
+                       desc_new->chip_data = cfg_new;
        }
        return irq;
 }
 
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
 int create_irq(void)
 {
+       unsigned int irq_want;
        int irq;
 
-       irq = create_irq_nr(nr_irqs - 1);
+       irq_want = nr_irqs_gsi;
+       irq = create_irq_nr(irq_want);
 
        if (irq == 0)
                irq = -1;
@@ -2944,14 +3241,22 @@ int create_irq(void)
 void destroy_irq(unsigned int irq)
 {
        unsigned long flags;
+       struct irq_cfg *cfg;
+       struct irq_desc *desc;
 
+       /* store it, in case dynamic_irq_cleanup clear it */
+       desc = irq_to_desc(irq);
+       cfg = desc->chip_data;
        dynamic_irq_cleanup(irq);
+       /* connect back irq_cfg */
+       if (desc)
+               desc->chip_data = cfg;
 
 #ifdef CONFIG_INTR_REMAP
        free_irte(irq);
 #endif
        spin_lock_irqsave(&vector_lock, flags);
-       __clear_irq_vector(irq);
+       __clear_irq_vector(irq, cfg);
        spin_unlock_irqrestore(&vector_lock, flags);
 }
 
@@ -2964,16 +3269,13 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
        struct irq_cfg *cfg;
        int err;
        unsigned dest;
-       cpumask_t tmp;
 
-       tmp = TARGET_CPUS;
-       err = assign_irq_vector(irq, tmp);
+       cfg = irq_cfg(irq);
+       err = assign_irq_vector(irq, cfg, TARGET_CPUS);
        if (err)
                return err;
 
-       cfg = irq_cfg(irq);
-       cpus_and(tmp, cfg->domain, tmp);
-       dest = cpu_mask_to_apicid(tmp);
+       dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
 
 #ifdef CONFIG_INTR_REMAP
        if (irq_remapped(irq)) {
@@ -3027,64 +3329,48 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 }
 
 #ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        struct msi_msg msg;
        unsigned int dest;
-       cpumask_t tmp;
-       struct irq_desc *desc;
 
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
+       dest = set_desc_affinity(desc, mask);
+       if (dest == BAD_APICID)
                return;
 
-       if (assign_irq_vector(irq, mask))
-               return;
+       cfg = desc->chip_data;
 
-       cfg = irq_cfg(irq);
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
-
-       read_msi_msg(irq, &msg);
+       read_msi_msg_desc(desc, &msg);
 
        msg.data &= ~MSI_DATA_VECTOR_MASK;
        msg.data |= MSI_DATA_VECTOR(cfg->vector);
        msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
-       write_msi_msg(irq, &msg);
-       desc = irq_to_desc(irq);
-       desc->affinity = mask;
+       write_msi_msg_desc(desc, &msg);
 }
-
 #ifdef CONFIG_INTR_REMAP
 /*
  * Migrate the MSI irq to another cpumask. This migration is
  * done in the process context using interrupt-remapping hardware.
  */
-static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+static void
+ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        unsigned int dest;
-       cpumask_t tmp, cleanup_mask;
        struct irte irte;
-       struct irq_desc *desc;
-
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
-               return;
 
        if (get_irte(irq, &irte))
                return;
 
-       if (assign_irq_vector(irq, mask))
+       dest = set_desc_affinity(desc, mask);
+       if (dest == BAD_APICID)
                return;
 
-       cfg = irq_cfg(irq);
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
-
        irte.vector = cfg->vector;
        irte.dest_id = IRTE_DEST(dest);
 
@@ -3098,16 +3384,10 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
         * at the new destination. So, time to cleanup the previous
         * vector allocation.
         */
-       if (cfg->move_in_progress) {
-               cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
-               cfg->move_cleanup_count = cpus_weight(cleanup_mask);
-               send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-               cfg->move_in_progress = 0;
-       }
-
-       desc = irq_to_desc(irq);
-       desc->affinity = mask;
+       if (cfg->move_in_progress)
+               send_cleanup_vector(cfg);
 }
+
 #endif
 #endif /* CONFIG_SMP */
 
@@ -3166,7 +3446,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
 }
 #endif
 
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
 {
        int ret;
        struct msi_msg msg;
@@ -3175,7 +3455,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
        if (ret < 0)
                return ret;
 
-       set_irq_msi(irq, desc);
+       set_irq_msi(irq, msidesc);
        write_msi_msg(irq, &msg);
 
 #ifdef CONFIG_INTR_REMAP
@@ -3195,26 +3475,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
        return 0;
 }
 
-static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
-{
-       unsigned int irq;
-
-       irq = dev->bus->number;
-       irq <<= 8;
-       irq |= dev->devfn;
-       irq <<= 12;
-
-       return irq;
-}
-
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
 {
        unsigned int irq;
        int ret;
        unsigned int irq_want;
 
-       irq_want = build_irq_for_pci_dev(dev) + 0x100;
-
+       irq_want = nr_irqs_gsi;
        irq = create_irq_nr(irq_want);
        if (irq == 0)
                return -1;
@@ -3228,7 +3495,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
                goto error;
 no_ir:
 #endif
-       ret = setup_msi_irq(dev, desc, irq);
+       ret = setup_msi_irq(dev, msidesc, irq);
        if (ret < 0) {
                destroy_irq(irq);
                return ret;
@@ -3246,7 +3513,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
        unsigned int irq;
        int ret, sub_handle;
-       struct msi_desc *desc;
+       struct msi_desc *msidesc;
        unsigned int irq_want;
 
 #ifdef CONFIG_INTR_REMAP
@@ -3254,10 +3521,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
        int index = 0;
 #endif
 
-       irq_want = build_irq_for_pci_dev(dev) + 0x100;
+       irq_want = nr_irqs_gsi;
        sub_handle = 0;
-       list_for_each_entry(desc, &dev->msi_list, list) {
-               irq = create_irq_nr(irq_want--);
+       list_for_each_entry(msidesc, &dev->msi_list, list) {
+               irq = create_irq_nr(irq_want);
+               irq_want++;
                if (irq == 0)
                        return -1;
 #ifdef CONFIG_INTR_REMAP
@@ -3289,7 +3557,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
                }
 no_ir:
 #endif
-               ret = setup_msi_irq(dev, desc, irq);
+               ret = setup_msi_irq(dev, msidesc, irq);
                if (ret < 0)
                        goto error;
                sub_handle++;
@@ -3308,24 +3576,18 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 #ifdef CONFIG_DMAR
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        struct msi_msg msg;
        unsigned int dest;
-       cpumask_t tmp;
-       struct irq_desc *desc;
-
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
-               return;
 
-       if (assign_irq_vector(irq, mask))
+       dest = set_desc_affinity(desc, mask);
+       if (dest == BAD_APICID)
                return;
 
-       cfg = irq_cfg(irq);
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
+       cfg = desc->chip_data;
 
        dmar_msi_read(irq, &msg);
 
@@ -3335,9 +3597,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
        dmar_msi_write(irq, &msg);
-       desc = irq_to_desc(irq);
-       desc->affinity = mask;
 }
+
 #endif /* CONFIG_SMP */
 
 struct irq_chip dmar_msi_type = {
@@ -3369,24 +3630,18 @@ int arch_setup_dmar_msi(unsigned int irq)
 #ifdef CONFIG_HPET_TIMER
 
 #ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
-       struct irq_desc *desc;
        struct msi_msg msg;
        unsigned int dest;
-       cpumask_t tmp;
 
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
+       dest = set_desc_affinity(desc, mask);
+       if (dest == BAD_APICID)
                return;
 
-       if (assign_irq_vector(irq, mask))
-               return;
-
-       cfg = irq_cfg(irq);
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
+       cfg = desc->chip_data;
 
        hpet_msi_read(irq, &msg);
 
@@ -3396,9 +3651,8 @@ static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
        hpet_msi_write(irq, &msg);
-       desc = irq_to_desc(irq);
-       desc->affinity = mask;
 }
+
 #endif /* CONFIG_SMP */
 
 struct irq_chip hpet_msi_type = {
@@ -3451,28 +3705,21 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
        write_ht_irq_msg(irq, &msg);
 }
 
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
+       struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
        unsigned int dest;
-       cpumask_t tmp;
-       struct irq_desc *desc;
-
-       cpus_and(tmp, mask, cpu_online_map);
-       if (cpus_empty(tmp))
-               return;
 
-       if (assign_irq_vector(irq, mask))
+       dest = set_desc_affinity(desc, mask);
+       if (dest == BAD_APICID)
                return;
 
-       cfg = irq_cfg(irq);
-       cpus_and(tmp, cfg->domain, mask);
-       dest = cpu_mask_to_apicid(tmp);
+       cfg = desc->chip_data;
 
        target_ht_irq(irq, dest, cfg->vector);
-       desc = irq_to_desc(irq);
-       desc->affinity = mask;
 }
+
 #endif
 
 static struct irq_chip ht_irq_chip = {
@@ -3490,17 +3737,14 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 {
        struct irq_cfg *cfg;
        int err;
-       cpumask_t tmp;
 
-       tmp = TARGET_CPUS;
-       err = assign_irq_vector(irq, tmp);
+       cfg = irq_cfg(irq);
+       err = assign_irq_vector(irq, cfg, TARGET_CPUS);
        if (!err) {
                struct ht_irq_msg msg;
                unsigned dest;
 
-               cfg = irq_cfg(irq);
-               cpus_and(tmp, cfg->domain, tmp);
-               dest = cpu_mask_to_apicid(tmp);
+               dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
 
                msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
 
@@ -3536,7 +3780,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
                       unsigned long mmr_offset)
 {
-       const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
+       const struct cpumask *eligible_cpu = cpumask_of(cpu);
        struct irq_cfg *cfg;
        int mmr_pnode;
        unsigned long mmr_value;
@@ -3544,7 +3788,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
        unsigned long flags;
        int err;
 
-       err = assign_irq_vector(irq, *eligible_cpu);
+       cfg = irq_cfg(irq);
+
+       err = assign_irq_vector(irq, cfg, eligible_cpu);
        if (err != 0)
                return err;
 
@@ -3553,8 +3799,6 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
                                      irq_name);
        spin_unlock_irqrestore(&vector_lock, flags);
 
-       cfg = irq_cfg(irq);
-
        mmr_value = 0;
        entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
        BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3565,7 +3809,7 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
        entry->polarity = 0;
        entry->trigger = 0;
        entry->mask = 0;
-       entry->dest = cpu_mask_to_apicid(*eligible_cpu);
+       entry->dest = cpu_mask_to_apicid(eligible_cpu);
 
        mmr_pnode = uv_blade_to_pnode(mmr_blade);
        uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3606,9 +3850,16 @@ int __init io_apic_get_redir_entries (int ioapic)
        return reg_01.bits.entries;
 }
 
-int __init probe_nr_irqs(void)
+void __init probe_nr_irqs_gsi(void)
 {
-       return NR_IRQS;
+       int idx;
+       int nr = 0;
+
+       for (idx = 0; idx < nr_ioapics; idx++)
+               nr += io_apic_get_redir_entries(idx) + 1;
+
+       if (nr > nr_irqs_gsi)
+               nr_irqs_gsi = nr;
 }
 
 /* --------------------------------------------------------------------------
@@ -3707,19 +3958,31 @@ int __init io_apic_get_version(int ioapic)
 
 int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
 {
+       struct irq_desc *desc;
+       struct irq_cfg *cfg;
+       int cpu = boot_cpu_id;
+
        if (!IO_APIC_IRQ(irq)) {
                apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
                        ioapic);
                return -EINVAL;
        }
 
+       desc = irq_to_desc_alloc_cpu(irq, cpu);
+       if (!desc) {
+               printk(KERN_INFO "can not get irq_desc %d\n", irq);
+               return 0;
+       }
+
        /*
         * IRQs < 16 are already in the irq_2_pin[] map
         */
-       if (irq >= 16)
-               add_pin_to_irq(irq, ioapic, pin);
+       if (irq >= NR_IRQS_LEGACY) {
+               cfg = desc->chip_data;
+               add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+       }
 
-       setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
+       setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
 
        return 0;
 }
@@ -3757,7 +4020,7 @@ void __init setup_ioapic_dest(void)
        int pin, ioapic, irq, irq_entry;
        struct irq_desc *desc;
        struct irq_cfg *cfg;
-       cpumask_t mask;
+       const struct cpumask *mask;
 
        if (skip_ioapic_setup == 1)
                return;
@@ -3773,9 +4036,10 @@ void __init setup_ioapic_dest(void)
                         * when you have too many devices, because at that time only boot
                         * cpu is online.
                         */
-                       cfg = irq_cfg(irq);
+                       desc = irq_to_desc(irq);
+                       cfg = desc->chip_data;
                        if (!cfg->vector) {
-                               setup_IO_APIC_irq(ioapic, pin, irq,
+                               setup_IO_APIC_irq(ioapic, pin, irq, desc,
                                                  irq_trigger(irq_entry),
                                                  irq_polarity(irq_entry));
                                continue;
@@ -3785,19 +4049,18 @@ void __init setup_ioapic_dest(void)
                        /*
                         * Honour affinities which have been set in early boot
                         */
-                       desc = irq_to_desc(irq);
                        if (desc->status &
                            (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-                               mask = desc->affinity;
+                               mask = &desc->affinity;
                        else
                                mask = TARGET_CPUS;
 
 #ifdef CONFIG_INTR_REMAP
                        if (intr_remapping_enabled)
-                               set_ir_ioapic_affinity_irq(irq, mask);
+                               set_ir_ioapic_affinity_irq_desc(desc, mask);
                        else
 #endif
-                               set_ioapic_affinity_irq(irq, mask);
+                               set_ioapic_affinity_irq_desc(desc, mask);
                }
 
        }
@@ -3846,7 +4109,6 @@ void __init ioapic_init_mappings(void)
        struct resource *ioapic_res;
        int i;
 
-       irq_2_pin_init();
        ioapic_res = ioapic_setup_resources();
        for (i = 0; i < nr_ioapics; i++) {
                if (smp_found_config) {
index f1c688e46f35a8443e031b0e8dd572800a86c097..285bbf8831faf3cd826a4dd7cbf295ea0684ec21 100644 (file)
@@ -116,18 +116,18 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
 /*
  * This is only used on smaller machines.
  */
-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+void send_IPI_mask_bitmask(const struct cpumask *cpumask, int vector)
 {
-       unsigned long mask = cpus_addr(cpumask)[0];
+       unsigned long mask = cpumask_bits(cpumask)[0];
        unsigned long flags;
 
        local_irq_save(flags);
-       WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
+       WARN_ON(mask & ~cpumask_bits(cpu_online_mask)[0]);
        __send_IPI_dest_field(mask, vector);
        local_irq_restore(flags);
 }
 
-void send_IPI_mask_sequence(cpumask_t mask, int vector)
+void send_IPI_mask_sequence(const struct cpumask *mask, int vector)
 {
        unsigned long flags;
        unsigned int query_cpu;
@@ -139,12 +139,24 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
         */
 
        local_irq_save(flags);
-       for_each_possible_cpu(query_cpu) {
-               if (cpu_isset(query_cpu, mask)) {
+       for_each_cpu(query_cpu, mask)
+               __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector);
+       local_irq_restore(flags);
+}
+
+void send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+{
+       unsigned long flags;
+       unsigned int query_cpu;
+       unsigned int this_cpu = smp_processor_id();
+
+       /* See Hack comment above */
+
+       local_irq_save(flags);
+       for_each_cpu(query_cpu, mask)
+               if (query_cpu != this_cpu)
                        __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
                                              vector);
-               }
-       }
        local_irq_restore(flags);
 }
 
index d1d4dc52f649cdd90f360d042b1055d7694f4e59..3f1d9d18df679c858bda94daae4d578c0d8cbf93 100644 (file)
@@ -118,6 +118,9 @@ int show_interrupts(struct seq_file *p, void *v)
        }
 
        desc = irq_to_desc(i);
+       if (!desc)
+               return 0;
+
        spin_lock_irqsave(&desc->lock, flags);
 #ifndef CONFIG_SMP
        any_count = kstat_irqs(i);
index a51382672de0c5e5fb291bd6ff6bce13396a8f73..9dc5588f336a21bfa5fb19eb83a32e022fb8db96 100644 (file)
@@ -233,25 +233,28 @@ unsigned int do_IRQ(struct pt_regs *regs)
 #ifdef CONFIG_HOTPLUG_CPU
 #include <mach_apic.h>
 
-void fixup_irqs(cpumask_t map)
+/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
+void fixup_irqs(void)
 {
        unsigned int irq;
        static int warned;
        struct irq_desc *desc;
 
        for_each_irq_desc(irq, desc) {
-               cpumask_t mask;
+               const struct cpumask *affinity;
 
+               if (!desc)
+                       continue;
                if (irq == 2)
                        continue;
 
-               cpus_and(mask, desc->affinity, map);
-               if (any_online_cpu(mask) == NR_CPUS) {
+               affinity = &desc->affinity;
+               if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
                        printk("Breaking affinity for irq %i\n", irq);
-                       mask = map;
+                       affinity = cpu_all_mask;
                }
                if (desc->chip->set_affinity)
-                       desc->chip->set_affinity(irq, mask);
+                       desc->chip->set_affinity(irq, affinity);
                else if (desc->action && !(warned++))
                        printk("Cannot set affinity for irq %i\n", irq);
        }
index 60eb84eb77a0a34232be8aafbe8a7d1e040f454f..fca2991443f5773c9c9113dda3e450c12201d042 100644 (file)
@@ -83,40 +83,43 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-void fixup_irqs(cpumask_t map)
+/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
+void fixup_irqs(void)
 {
        unsigned int irq;
        static int warned;
        struct irq_desc *desc;
 
        for_each_irq_desc(irq, desc) {
-               cpumask_t mask;
                int break_affinity = 0;
                int set_affinity = 1;
+               const struct cpumask *affinity;
 
+               if (!desc)
+                       continue;
                if (irq == 2)
                        continue;
 
                /* interrupt's are disabled at this point */
                spin_lock(&desc->lock);
 
+               affinity = &desc->affinity;
                if (!irq_has_action(irq) ||
-                   cpus_equal(desc->affinity, map)) {
+                   cpumask_equal(affinity, cpu_online_mask)) {
                        spin_unlock(&desc->lock);
                        continue;
                }
 
-               cpus_and(mask, desc->affinity, map);
-               if (cpus_empty(mask)) {
+               if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
                        break_affinity = 1;
-                       mask = map;
+                       affinity = cpu_all_mask;
                }
 
                if (desc->chip->mask)
                        desc->chip->mask(irq);
 
                if (desc->chip->set_affinity)
-                       desc->chip->set_affinity(irq, mask);
+                       desc->chip->set_affinity(irq, affinity);
                else if (!(warned++))
                        set_affinity = 0;
 
index 845aa9803e804edd7cdc6ef5bb61e2db72f1a911..6a92f47c52e7aa7a29335b30459f82b83cfd7c82 100644 (file)
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
        /*
         * 16 old-style INTA-cycle interrupts:
         */
-       for (i = 0; i < 16; i++) {
-               /* first time call this irq_desc */
+       for (i = 0; i < NR_IRQS_LEGACY; i++) {
                struct irq_desc *desc = irq_to_desc(i);
 
                desc->status = IRQ_DISABLED;
index ff0235391285a6dc388e1d175f04a6425972d90f..40c1e62ec7857dd054930ae1f125df79e2aa524e 100644 (file)
@@ -142,8 +142,7 @@ void __init init_ISA_irqs(void)
        init_bsp_APIC();
        init_8259A(0);
 
-       for (i = 0; i < 16; i++) {
-               /* first time call this irq_desc */
+       for (i = 0; i < NR_IRQS_LEGACY; i++) {
                struct irq_desc *desc = irq_to_desc(i);
 
                desc->status = IRQ_DISABLED;
index 7a385746509a2a625991447e8554a20b4d992c54..37f420018a41c806bf57048d50163d0d28c640aa 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/numa.h>
 #include <linux/ftrace.h>
 #include <linux/suspend.h>
+#include <linux/gfp.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/system.h>
 #include <asm/cacheflush.h>
 
-#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
-static u32 kexec_pgd[1024] PAGE_ALIGNED;
-#ifdef CONFIG_X86_PAE
-static u32 kexec_pmd0[1024] PAGE_ALIGNED;
-static u32 kexec_pmd1[1024] PAGE_ALIGNED;
-#endif
-static u32 kexec_pte0[1024] PAGE_ALIGNED;
-static u32 kexec_pte1[1024] PAGE_ALIGNED;
-
 static void set_idt(void *newidt, __u16 limit)
 {
        struct desc_ptr curidt;
@@ -76,6 +68,76 @@ static void load_segments(void)
 #undef __STR
 }
 
+static void machine_kexec_free_page_tables(struct kimage *image)
+{
+       free_page((unsigned long)image->arch.pgd);
+#ifdef CONFIG_X86_PAE
+       free_page((unsigned long)image->arch.pmd0);
+       free_page((unsigned long)image->arch.pmd1);
+#endif
+       free_page((unsigned long)image->arch.pte0);
+       free_page((unsigned long)image->arch.pte1);
+}
+
+static int machine_kexec_alloc_page_tables(struct kimage *image)
+{
+       image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL);
+#ifdef CONFIG_X86_PAE
+       image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+       image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL);
+#endif
+       image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+       image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL);
+       if (!image->arch.pgd ||
+#ifdef CONFIG_X86_PAE
+           !image->arch.pmd0 || !image->arch.pmd1 ||
+#endif
+           !image->arch.pte0 || !image->arch.pte1) {
+               machine_kexec_free_page_tables(image);
+               return -ENOMEM;
+       }
+       return 0;
+}
+
+static void machine_kexec_page_table_set_one(
+       pgd_t *pgd, pmd_t *pmd, pte_t *pte,
+       unsigned long vaddr, unsigned long paddr)
+{
+       pud_t *pud;
+
+       pgd += pgd_index(vaddr);
+#ifdef CONFIG_X86_PAE
+       if (!(pgd_val(*pgd) & _PAGE_PRESENT))
+               set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
+#endif
+       pud = pud_offset(pgd, vaddr);
+       pmd = pmd_offset(pud, vaddr);
+       if (!(pmd_val(*pmd) & _PAGE_PRESENT))
+               set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+       pte = pte_offset_kernel(pmd, vaddr);
+       set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
+}
+
+static void machine_kexec_prepare_page_tables(struct kimage *image)
+{
+       void *control_page;
+       pmd_t *pmd = 0;
+
+       control_page = page_address(image->control_code_page);
+#ifdef CONFIG_X86_PAE
+       pmd = image->arch.pmd0;
+#endif
+       machine_kexec_page_table_set_one(
+               image->arch.pgd, pmd, image->arch.pte0,
+               (unsigned long)control_page, __pa(control_page));
+#ifdef CONFIG_X86_PAE
+       pmd = image->arch.pmd1;
+#endif
+       machine_kexec_page_table_set_one(
+               image->arch.pgd, pmd, image->arch.pte1,
+               __pa(control_page), __pa(control_page));
+}
+
 /*
  * A architecture hook called to validate the
  * proposed image and prepare the control pages
@@ -87,12 +149,20 @@ static void load_segments(void)
  * reboot code buffer to allow us to avoid allocations
  * later.
  *
- * Make control page executable.
+ * - Make control page executable.
+ * - Allocate page tables
+ * - Setup page tables
  */
 int machine_kexec_prepare(struct kimage *image)
 {
+       int error;
+
        if (nx_enabled)
                set_pages_x(image->control_code_page, 1);
+       error = machine_kexec_alloc_page_tables(image);
+       if (error)
+               return error;
+       machine_kexec_prepare_page_tables(image);
        return 0;
 }
 
@@ -104,6 +174,7 @@ void machine_kexec_cleanup(struct kimage *image)
 {
        if (nx_enabled)
                set_pages_nx(image->control_code_page, 1);
+       machine_kexec_free_page_tables(image);
 }
 
 /*
@@ -150,18 +221,7 @@ void machine_kexec(struct kimage *image)
        relocate_kernel_ptr = control_page;
        page_list[PA_CONTROL_PAGE] = __pa(control_page);
        page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
-       page_list[PA_PGD] = __pa(kexec_pgd);
-       page_list[VA_PGD] = (unsigned long)kexec_pgd;
-#ifdef CONFIG_X86_PAE
-       page_list[PA_PMD_0] = __pa(kexec_pmd0);
-       page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
-       page_list[PA_PMD_1] = __pa(kexec_pmd1);
-       page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
-#endif
-       page_list[PA_PTE_0] = __pa(kexec_pte0);
-       page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
-       page_list[PA_PTE_1] = __pa(kexec_pte1);
-       page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+       page_list[PA_PGD] = __pa(image->arch.pgd);
 
        if (image->type == KEXEC_TYPE_DEFAULT)
                page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
index 3b599518c322a143c9623759061a44ca92647ecf..c12314c9e86fd512367a588646770ea0c4149a5c 100644 (file)
@@ -287,7 +287,7 @@ static struct clock_event_device mfgpt_clockevent = {
        .set_mode = mfgpt_set_mode,
        .set_next_event = mfgpt_next_event,
        .rating = 250,
-       .cpumask = CPU_MASK_ALL,
+       .cpumask = cpu_all_mask,
        .shift = 32
 };
 
index 0f4c1fd5a1f434259afd90480b5af527b326b855..45e3b69808ba6722bcc9617cdd0c883cab1a8ee2 100644 (file)
@@ -586,26 +586,23 @@ static void __init __get_smp_config(unsigned int early)
 {
        struct intel_mp_floating *mpf = mpf_found;
 
-       if (x86_quirks->mach_get_smp_config) {
-               if (x86_quirks->mach_get_smp_config(early))
-                       return;
-       }
+       if (!mpf)
+               return;
+
        if (acpi_lapic && early)
                return;
+
        /*
-        * ACPI supports both logical (e.g. Hyper-Threading) and physical
-        * processors, where MPS only supports physical.
+        * MPS doesn't support hyperthreading, aka only have
+        * thread 0 apic id in MPS table
         */
-       if (acpi_lapic && acpi_ioapic) {
-               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
-                      "information\n");
+       if (acpi_lapic && acpi_ioapic)
                return;
-       } else if (acpi_lapic)
-               printk(KERN_INFO "Using ACPI for processor (LAPIC) "
-                      "configuration information\n");
 
-       if (!mpf)
-               return;
+       if (x86_quirks->mach_get_smp_config) {
+               if (x86_quirks->mach_get_smp_config(early))
+                       return;
+       }
 
        printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
               mpf->mpf_specification);
index 4caff39078e0d386a0f1d2783fbc7e081582d9a9..0deea37a53cf5613477a17d304a03f8bf254325f 100644 (file)
@@ -31,7 +31,7 @@
 #include <asm/numaq.h>
 #include <asm/topology.h>
 #include <asm/processor.h>
-#include <asm/mpspec.h>
+#include <asm/genapic.h>
 #include <asm/e820.h>
 #include <asm/setup.h>
 
@@ -235,6 +235,13 @@ static int __init numaq_setup_ioapic_ids(void)
        return 1;
 }
 
+static int __init numaq_update_genapic(void)
+{
+       genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
+
+       return 0;
+}
+
 static struct x86_quirks numaq_x86_quirks __initdata = {
        .arch_pre_time_init     = numaq_pre_time_init,
        .arch_time_init         = NULL,
@@ -250,6 +257,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
        .mpc_oem_pci_bus        = mpc_oem_pci_bus,
        .smp_read_mpc_oem       = smp_read_mpc_oem,
        .setup_ioapic_ids       = numaq_setup_ioapic_ids,
+       .update_genapic         = numaq_update_genapic,
 };
 
 void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
index c622772744d86fcb0dcd3e6c46a6de5323508c40..95d811a9594fda74e35b926fa8ff4777a750e37c 100644 (file)
@@ -7,7 +7,9 @@
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
+#include <linux/ftrace.h>
 #include <asm/system.h>
+#include <asm/apic.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -100,6 +102,9 @@ static inline int hlt_use_halt(void)
 void default_idle(void)
 {
        if (hlt_use_halt()) {
+               struct power_trace it;
+
+               trace_power_start(&it, POWER_CSTATE, 1);
                current_thread_info()->status &= ~TS_POLLING;
                /*
                 * TS_POLLING-cleared state must be visible before we
@@ -112,6 +117,7 @@ void default_idle(void)
                else
                        local_irq_enable();
                current_thread_info()->status |= TS_POLLING;
+               trace_power_end(&it);
        } else {
                local_irq_enable();
                /* loop is done by the caller */
@@ -122,6 +128,21 @@ void default_idle(void)
 EXPORT_SYMBOL(default_idle);
 #endif
 
+void stop_this_cpu(void *dummy)
+{
+       local_irq_disable();
+       /*
+        * Remove this CPU:
+        */
+       cpu_clear(smp_processor_id(), cpu_online_map);
+       disable_local_APIC();
+
+       for (;;) {
+               if (hlt_works(smp_processor_id()))
+                       halt();
+       }
+}
+
 static void do_nothing(void *unused)
 {
 }
@@ -154,24 +175,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
+       struct power_trace it;
+
+       trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
        if (!need_resched()) {
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __mwait(ax, cx);
        }
+       trace_power_end(&it);
 }
 
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
+       struct power_trace it;
        if (!need_resched()) {
+               trace_power_start(&it, POWER_CSTATE, 1);
                __monitor((void *)&current_thread_info()->flags, 0, 0);
                smp_mb();
                if (!need_resched())
                        __sti_mwait(0, 0);
                else
                        local_irq_enable();
+               trace_power_end(&it);
        } else
                local_irq_enable();
 }
@@ -183,9 +211,13 @@ static void mwait_idle(void)
  */
 static void poll_idle(void)
 {
+       struct power_trace it;
+
+       trace_power_start(&it, POWER_CSTATE, 0);
        local_irq_enable();
        while (!need_resched())
                cpu_relax();
+       trace_power_end(&it);
 }
 
 /*
index 0a1302fe6d45307ffa880ed9e355050a194518af..24c2276aa453dfbfe4ffabec3e05cd30d93db1e0 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/percpu.h>
 #include <linux/prctl.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -548,7 +549,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
  * the task-switch, and shows up in ret_from_fork in entry.S,
  * for example.
  */
-struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+__notrace_funcgraph struct task_struct *
+__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
        struct thread_struct *prev = &prev_p->thread,
                                 *next = &next_p->thread;
index c958120fb1b6f549f853a79a26864abeb38ceb24..fbb321d53d3495990c946ecbfa51f2ddfdc2ad19 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/prctl.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
+#include <linux/ftrace.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -551,8 +552,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
  * - could test fs/gs bitsliced
  *
  * Kprobes not supported here. Set the probe on schedule instead.
+ * Function graph tracer not supported too.
  */
-struct task_struct *
+__notrace_funcgraph struct task_struct *
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
        struct thread_struct *prev = &prev_p->thread;
index 0a6d8c12e10dc7742c2e33dd29475874cb9f2b50..2c8ec1ba75e663e63b4ce2d1740b6db1fb2954d3 100644 (file)
@@ -668,14 +668,14 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
        size_t bts_index, bts_end;
        int error;
 
-       error = ds_get_bts_end(child, &bts_end);
+       error = ds_get_bts_end(child->bts, &bts_end);
        if (error < 0)
                return error;
 
        if (bts_end <= index)
                return -EINVAL;
 
-       error = ds_get_bts_index(child, &bts_index);
+       error = ds_get_bts_index(child->bts, &bts_index);
        if (error < 0)
                return error;
 
@@ -684,7 +684,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
        if (bts_end <= bts_index)
                bts_index -= bts_end;
 
-       error = ds_access_bts(child, bts_index, &bts_record);
+       error = ds_access_bts(child->bts, bts_index, &bts_record);
        if (error < 0)
                return error;
 
@@ -705,14 +705,14 @@ static int ptrace_bts_drain(struct task_struct *child,
        size_t end, i;
        int error;
 
-       error = ds_get_bts_index(child, &end);
+       error = ds_get_bts_index(child->bts, &end);
        if (error < 0)
                return error;
 
        if (size < (end * sizeof(struct bts_struct)))
                return -EIO;
 
-       error = ds_access_bts(child, 0, (const void **)&raw);
+       error = ds_access_bts(child->bts, 0, (const void **)&raw);
        if (error < 0)
                return error;
 
@@ -723,18 +723,13 @@ static int ptrace_bts_drain(struct task_struct *child,
                        return -EFAULT;
        }
 
-       error = ds_clear_bts(child);
+       error = ds_clear_bts(child->bts);
        if (error < 0)
                return error;
 
        return end;
 }
 
-static void ptrace_bts_ovfl(struct task_struct *child)
-{
-       send_sig(child->thread.bts_ovfl_signal, child, 0);
-}
-
 static int ptrace_bts_config(struct task_struct *child,
                             long cfg_size,
                             const struct ptrace_bts_config __user *ucfg)
@@ -760,23 +755,45 @@ static int ptrace_bts_config(struct task_struct *child,
                goto errout;
 
        if (cfg.flags & PTRACE_BTS_O_ALLOC) {
-               ds_ovfl_callback_t ovfl = NULL;
+               bts_ovfl_callback_t ovfl = NULL;
                unsigned int sig = 0;
 
-               /* we ignore the error in case we were not tracing child */
-               (void)ds_release_bts(child);
+               error = -EINVAL;
+               if (cfg.size < (10 * bts_cfg.sizeof_bts))
+                       goto errout;
 
                if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
                        if (!cfg.signal)
                                goto errout;
 
+                       error = -EOPNOTSUPP;
+                       goto errout;
+
                        sig  = cfg.signal;
-                       ovfl = ptrace_bts_ovfl;
                }
 
-               error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
-               if (error < 0)
+               if (child->bts) {
+                       (void)ds_release_bts(child->bts);
+                       kfree(child->bts_buffer);
+
+                       child->bts = NULL;
+                       child->bts_buffer = NULL;
+               }
+
+               error = -ENOMEM;
+               child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL);
+               if (!child->bts_buffer)
+                       goto errout;
+
+               child->bts = ds_request_bts(child, child->bts_buffer, cfg.size,
+                                           ovfl, /* th = */ (size_t)-1);
+               if (IS_ERR(child->bts)) {
+                       error = PTR_ERR(child->bts);
+                       kfree(child->bts_buffer);
+                       child->bts = NULL;
+                       child->bts_buffer = NULL;
                        goto errout;
+               }
 
                child->thread.bts_ovfl_signal = sig;
        }
@@ -823,15 +840,15 @@ static int ptrace_bts_status(struct task_struct *child,
        if (cfg_size < sizeof(cfg))
                return -EIO;
 
-       error = ds_get_bts_end(child, &end);
+       error = ds_get_bts_end(child->bts, &end);
        if (error < 0)
                return error;
 
-       error = ds_access_bts(child, /* index = */ 0, &base);
+       error = ds_access_bts(child->bts, /* index = */ 0, &base);
        if (error < 0)
                return error;
 
-       error = ds_access_bts(child, /* index = */ end, &max);
+       error = ds_access_bts(child->bts, /* index = */ end, &max);
        if (error < 0)
                return error;
 
@@ -884,10 +901,7 @@ static int ptrace_bts_write_record(struct task_struct *child,
                return -EINVAL;
        }
 
-       /* The writing task will be the switched-to task on a context
-        * switch. It needs to write into the switched-from task's BTS
-        * buffer. */
-       return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
+       return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts);
 }
 
 void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -929,17 +943,16 @@ void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
        switch (c->x86) {
        case 0x6:
                switch (c->x86_model) {
+               case 0 ... 0xC:
+                       /* sorry, don't know about them */
+                       break;
                case 0xD:
                case 0xE: /* Pentium M */
                        bts_configure(&bts_cfg_pentium_m);
                        break;
-               case 0xF: /* Core2 */
-        case 0x1C: /* Atom */
+               default: /* Core2, Atom, ... */
                        bts_configure(&bts_cfg_core2);
                        break;
-               default:
-                       /* sorry, don't know about them */
-                       break;
                }
                break;
        case 0xF:
@@ -973,13 +986,17 @@ void ptrace_disable(struct task_struct *child)
        clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
 #ifdef CONFIG_X86_PTRACE_BTS
-       (void)ds_release_bts(child);
+       if (child->bts) {
+               (void)ds_release_bts(child->bts);
+               kfree(child->bts_buffer);
+               child->bts_buffer = NULL;
 
-       child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
-       if (!child->thread.debugctlmsr)
-               clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+               child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+               if (!child->thread.debugctlmsr)
+                       clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
 
-       clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+               clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+       }
 #endif /* CONFIG_X86_PTRACE_BTS */
 }
 
@@ -1111,9 +1128,16 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
                        (child, data, (struct ptrace_bts_config __user *)addr);
                break;
 
-       case PTRACE_BTS_SIZE:
-               ret = ds_get_bts_index(child, /* pos = */ NULL);
+       case PTRACE_BTS_SIZE: {
+               size_t size;
+
+               ret = ds_get_bts_index(child->bts, &size);
+               if (ret == 0) {
+                       BUG_ON(size != (int) size);
+                       ret = (int) size;
+               }
                break;
+       }
 
        case PTRACE_BTS_GET:
                ret = ptrace_bts_read_record
@@ -1121,7 +1145,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
                break;
 
        case PTRACE_BTS_CLEAR:
-               ret = ds_clear_bts(child);
+               ret = ds_clear_bts(child->bts);
                break;
 
        case PTRACE_BTS_DRAIN:
index cc5a2545dd41c0ce42b96eafffe85230773162e4..ba7b9a0e606358a27789b13eb25fc0ca9778dd66 100644 (file)
@@ -21,6 +21,9 @@
 # include <asm/iommu.h>
 #endif
 
+#include <mach_ipi.h>
+
+
 /*
  * Power off function, if any
  */
@@ -36,7 +39,10 @@ int reboot_force;
 static int reboot_cpu = -1;
 #endif
 
-/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old]
+/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
+bool port_cf9_safe = false;
+
+/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
    warm   Don't set the cold reboot flag
    cold   Set the cold reboot flag
    bios   Reboot by jumping through the BIOS (only for X86_32)
@@ -45,6 +51,7 @@ static int reboot_cpu = -1;
    kbd    Use the keyboard controller. cold reset (default)
    acpi   Use the RESET_REG in the FADT
    efi    Use efi reset_system runtime service
+   pci    Use the so-called "PCI reset register", CF9
    force  Avoid anything that could hang.
  */
 static int __init reboot_setup(char *str)
@@ -79,6 +86,7 @@ static int __init reboot_setup(char *str)
                case 'k':
                case 't':
                case 'e':
+               case 'p':
                        reboot_type = *str;
                        break;
 
@@ -404,12 +412,27 @@ static void native_machine_emergency_restart(void)
                        reboot_type = BOOT_KBD;
                        break;
 
-
                case BOOT_EFI:
                        if (efi_enabled)
-                               efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD,
+                               efi.reset_system(reboot_mode ?
+                                                EFI_RESET_WARM :
+                                                EFI_RESET_COLD,
                                                 EFI_SUCCESS, 0, NULL);
+                       reboot_type = BOOT_KBD;
+                       break;
 
+               case BOOT_CF9:
+                       port_cf9_safe = true;
+                       /* fall through */
+
+               case BOOT_CF9_COND:
+                       if (port_cf9_safe) {
+                               u8 cf9 = inb(0xcf9) & ~6;
+                               outb(cf9|2, 0xcf9); /* Request hard reset */
+                               udelay(50);
+                               outb(cf9|6, 0xcf9); /* Actually do the reset */
+                               udelay(50);
+                       }
                        reboot_type = BOOT_KBD;
                        break;
                }
@@ -470,6 +493,11 @@ static void native_machine_restart(char *__unused)
 
 static void native_machine_halt(void)
 {
+       /* stop other cpus and apics */
+       machine_shutdown();
+
+       /* stop this cpu */
+       stop_this_cpu(NULL);
 }
 
 static void native_machine_power_off(void)
@@ -523,3 +551,92 @@ void machine_crash_shutdown(struct pt_regs *regs)
        machine_ops.crash_shutdown(regs);
 }
 #endif
+
+
+#if defined(CONFIG_SMP)
+
+/* This keeps a track of which one is crashing cpu. */
+static int crashing_cpu;
+static nmi_shootdown_cb shootdown_callback;
+
+static atomic_t waiting_for_crash_ipi;
+
+static int crash_nmi_callback(struct notifier_block *self,
+                       unsigned long val, void *data)
+{
+       int cpu;
+
+       if (val != DIE_NMI_IPI)
+               return NOTIFY_OK;
+
+       cpu = raw_smp_processor_id();
+
+       /* Don't do anything if this handler is invoked on crashing cpu.
+        * Otherwise, system will completely hang. Crashing cpu can get
+        * an NMI if system was initially booted with nmi_watchdog parameter.
+        */
+       if (cpu == crashing_cpu)
+               return NOTIFY_STOP;
+       local_irq_disable();
+
+       shootdown_callback(cpu, (struct die_args *)data);
+
+       atomic_dec(&waiting_for_crash_ipi);
+       /* Assume hlt works */
+       halt();
+       for (;;)
+               cpu_relax();
+
+       return 1;
+}
+
+static void smp_send_nmi_allbutself(void)
+{
+       send_IPI_allbutself(NMI_VECTOR);
+}
+
+static struct notifier_block crash_nmi_nb = {
+       .notifier_call = crash_nmi_callback,
+};
+
+/* Halt all other CPUs, calling the specified function on each of them
+ *
+ * This function can be used to halt all other CPUs on crash
+ * or emergency reboot time. The function passed as parameter
+ * will be called inside a NMI handler on all CPUs.
+ */
+void nmi_shootdown_cpus(nmi_shootdown_cb callback)
+{
+       unsigned long msecs;
+       local_irq_disable();
+
+       /* Make a note of crashing cpu. Will be used in NMI callback.*/
+       crashing_cpu = safe_smp_processor_id();
+
+       shootdown_callback = callback;
+
+       atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
+       /* Would it be better to replace the trap vector here? */
+       if (register_die_notifier(&crash_nmi_nb))
+               return;         /* return what? */
+       /* Ensure the new callback function is set before sending
+        * out the NMI
+        */
+       wmb();
+
+       smp_send_nmi_allbutself();
+
+       msecs = 1000; /* Wait at most a second for the other cpus to stop */
+       while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
+               mdelay(1);
+               msecs--;
+       }
+
+       /* Leave the nmi callback set */
+}
+#else /* !CONFIG_SMP */
+void nmi_shootdown_cpus(nmi_shootdown_cb callback)
+{
+       /* No other CPUs to shoot down */
+}
+#endif
index 6f50664b2ba50564dfbf10c017f52bcc247a9af8..a160f31197256680c32b835b7c32d62e9dbd184b 100644 (file)
 #include <asm/page.h>
 #include <asm/kexec.h>
 #include <asm/processor-flags.h>
-#include <asm/pgtable.h>
 
 /*
  * Must be relocatable PIC code callable as a C function
  */
 
 #define PTR(x) (x << 2)
-#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define PAE_PGD_ATTR (_PAGE_PRESENT)
 
 /* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
  * ~ control_page + PAGE_SIZE are used as data storage and stack for
@@ -39,7 +36,6 @@
 #define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
 
        .text
-       .align PAGE_SIZE
        .globl relocate_kernel
 relocate_kernel:
        /* Save the CPU context, used for jumping back */
@@ -60,117 +56,6 @@ relocate_kernel:
        movl    %cr4, %eax
        movl    %eax, CR4(%edi)
 
-#ifdef CONFIG_X86_PAE
-       /* map the control page at its virtual address */
-
-       movl    PTR(VA_PGD)(%ebp), %edi
-       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0xc0000000, %eax
-       shrl    $27, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_PMD_0)(%ebp), %edx
-       orl     $PAE_PGD_ATTR, %edx
-       movl    %edx, (%eax)
-
-       movl    PTR(VA_PMD_0)(%ebp), %edi
-       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0x3fe00000, %eax
-       shrl    $18, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_PTE_0)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-
-       movl    PTR(VA_PTE_0)(%ebp), %edi
-       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0x001ff000, %eax
-       shrl    $9, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-
-       /* identity map the control page at its physical address */
-
-       movl    PTR(VA_PGD)(%ebp), %edi
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0xc0000000, %eax
-       shrl    $27, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_PMD_1)(%ebp), %edx
-       orl     $PAE_PGD_ATTR, %edx
-       movl    %edx, (%eax)
-
-       movl    PTR(VA_PMD_1)(%ebp), %edi
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0x3fe00000, %eax
-       shrl    $18, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_PTE_1)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-
-       movl    PTR(VA_PTE_1)(%ebp), %edi
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0x001ff000, %eax
-       shrl    $9, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-#else
-       /* map the control page at its virtual address */
-
-       movl    PTR(VA_PGD)(%ebp), %edi
-       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0xffc00000, %eax
-       shrl    $20, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_PTE_0)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-
-       movl    PTR(VA_PTE_0)(%ebp), %edi
-       movl    PTR(VA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0x003ff000, %eax
-       shrl    $10, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-
-       /* identity map the control page at its physical address */
-
-       movl    PTR(VA_PGD)(%ebp), %edi
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0xffc00000, %eax
-       shrl    $20, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_PTE_1)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-
-       movl    PTR(VA_PTE_1)(%ebp), %edi
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %eax
-       andl    $0x003ff000, %eax
-       shrl    $10, %eax
-       addl    %edi, %eax
-
-       movl    PTR(PA_CONTROL_PAGE)(%ebp), %edx
-       orl     $PAGE_ATTR, %edx
-       movl    %edx, (%eax)
-#endif
-
-relocate_new_kernel:
        /* read the arguments and say goodbye to the stack */
        movl  20+4(%esp), %ebx /* page_list */
        movl  20+8(%esp), %ebp /* list of pages */
index bdec76e5559433d77366b85535f0563451614501..32fe42f26e795daa1ec8af9919be1c3f56b108fe 100644 (file)
@@ -583,7 +583,20 @@ static int __init setup_elfcorehdr(char *arg)
 early_param("elfcorehdr", setup_elfcorehdr);
 #endif
 
-static struct x86_quirks default_x86_quirks __initdata;
+static int __init default_update_genapic(void)
+{
+#ifdef CONFIG_X86_SMP
+# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
+       genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
+# endif
+#endif
+
+       return 0;
+}
+
+static struct x86_quirks default_x86_quirks __initdata = {
+       .update_genapic         = default_update_genapic,
+};
 
 struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
 
@@ -1080,7 +1093,7 @@ void __init setup_arch(char **cmdline_p)
        ioapic_init_mappings();
 
        /* need to wait for io_apic is mapped */
-       nr_irqs = probe_nr_irqs();
+       probe_nr_irqs_gsi();
 
        kvm_guest_init();
 
index ae0c0d3bb7704605440467d26ec48eaf84faad83..0b63b08e75304a1da386a1686b7ed0d647541081 100644 (file)
@@ -152,6 +152,11 @@ void __init setup_per_cpu_areas(void)
        old_size = PERCPU_ENOUGH_ROOM;
        align = max_t(unsigned long, PAGE_SIZE, align);
        size = roundup(old_size, align);
+
+       printk(KERN_INFO
+               "NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
+               NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
+
        printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
                          size);
 
@@ -168,24 +173,24 @@ void __init setup_per_cpu_areas(void)
                               "cpu %d has no node %d or node-local memory\n",
                                cpu, node);
                        if (ptr)
-                               printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
+                               printk(KERN_DEBUG
+                                       "per cpu data for cpu%d at %016lx\n",
                                         cpu, __pa(ptr));
                }
                else {
                        ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
                                                        __pa(MAX_DMA_ADDRESS));
                        if (ptr)
-                               printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
-                                        cpu, node, __pa(ptr));
+                               printk(KERN_DEBUG
+                                       "per cpu data for cpu%d on node%d "
+                                       "at %016lx\n",
+                                       cpu, node, __pa(ptr));
                }
 #endif
                per_cpu_offset(cpu) = ptr - __per_cpu_start;
                memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
        }
 
-       printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
-               NR_CPUS, nr_cpu_ids, nr_node_ids);
-
        /* Setup percpu data maps */
        setup_per_cpu_maps();
 
@@ -282,7 +287,7 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
        else
                cpu_clear(cpu, *mask);
 
-       cpulist_scnprintf(buf, sizeof(buf), *mask);
+       cpulist_scnprintf(buf, sizeof(buf), mask);
        printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
                enable? "numa_add_cpu":"numa_remove_cpu", cpu, node, buf);
  }
index 18f9b19f5f8f5d46582b64501d83d11cb4376eb6..49ed667b06f3684de2b9a72ff89c8e49c8bf1e3c 100644 (file)
@@ -118,41 +118,28 @@ static void native_smp_send_reschedule(int cpu)
                WARN_ON(1);
                return;
        }
-       send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+       send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
 }
 
 void native_send_call_func_single_ipi(int cpu)
 {
-       send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR);
+       send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
 }
 
-void native_send_call_func_ipi(cpumask_t mask)
+void native_send_call_func_ipi(const struct cpumask *mask)
 {
        cpumask_t allbutself;
 
        allbutself = cpu_online_map;
        cpu_clear(smp_processor_id(), allbutself);
 
-       if (cpus_equal(mask, allbutself) &&
+       if (cpus_equal(*mask, allbutself) &&
            cpus_equal(cpu_online_map, cpu_callout_map))
                send_IPI_allbutself(CALL_FUNCTION_VECTOR);
        else
                send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
 }
 
-static void stop_this_cpu(void *dummy)
-{
-       local_irq_disable();
-       /*
-        * Remove this CPU:
-        */
-       cpu_clear(smp_processor_id(), cpu_online_map);
-       disable_local_APIC();
-       if (hlt_works(smp_processor_id()))
-               for (;;) halt();
-       for (;;);
-}
-
 /*
  * this function calls the 'stop' function on all other CPUs in the system.
  */
index f71f96fc9e62ea8e4f027a78cb3aafcd96fdbf99..be9466788043c1bb00e435cb9bb11eb0544436fb 100644 (file)
@@ -62,6 +62,7 @@
 #include <asm/mtrr.h>
 #include <asm/vmi.h>
 #include <asm/genapic.h>
+#include <asm/setup.h>
 #include <linux/mc146818rtc.h>
 
 #include <mach_apic.h>
@@ -101,14 +102,8 @@ EXPORT_SYMBOL(smp_num_siblings);
 /* Last level cache ID of each logical CPU */
 DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
 
-/* bitmap of online cpus */
-cpumask_t cpu_online_map __read_mostly;
-EXPORT_SYMBOL(cpu_online_map);
-
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
 
 /* representing HT siblings of each logical CPU */
 DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
@@ -534,7 +529,7 @@ static void impress_friends(void)
        pr_debug("Before bogocount - setting activated=1.\n");
 }
 
-static inline void __inquire_remote_apic(int apicid)
+void __inquire_remote_apic(int apicid)
 {
        unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
        char *names[] = { "ID", "VERSION", "SPIV" };
@@ -573,14 +568,13 @@ static inline void __inquire_remote_apic(int apicid)
        }
 }
 
-#ifdef WAKE_SECONDARY_VIA_NMI
 /*
  * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
  * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
  * won't ... remember to clear down the APIC, etc later.
  */
-static int __devinit
-wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+int __devinit
+wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
 {
        unsigned long send_status, accept_status = 0;
        int maxlvt;
@@ -597,7 +591,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
         * Give the other CPU some time to accept the IPI.
         */
        udelay(200);
-       if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+       if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
                maxlvt = lapic_get_maxlvt();
                if (maxlvt > 3)                 /* Due to the Pentium erratum 3AP.  */
                        apic_write(APIC_ESR, 0);
@@ -612,11 +606,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
 
        return (send_status | accept_status);
 }
-#endif /* WAKE_SECONDARY_VIA_NMI */
 
-#ifdef WAKE_SECONDARY_VIA_INIT
-static int __devinit
-wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
+int __devinit
+wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 {
        unsigned long send_status, accept_status = 0;
        int maxlvt, num_starts, j;
@@ -735,7 +727,6 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 
        return (send_status | accept_status);
 }
-#endif /* WAKE_SECONDARY_VIA_INIT */
 
 struct create_idle {
        struct work_struct work;
@@ -1353,7 +1344,7 @@ void cpu_disable_common(void)
        lock_vector_lock();
        remove_cpu_from_maps(cpu);
        unlock_vector_lock();
-       fixup_irqs(cpu_online_map);
+       fixup_irqs();
 }
 
 int native_cpu_disable(void)
index a03e7f6d90c35af5f4638f6394e4a39f6e2e4020..10786af95545ea37654bcf7125d7a880edc03540 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/sched.h>
 #include <linux/stacktrace.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 #include <asm/stacktrace.h>
 
 static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
                trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
+
+/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
+
+struct stack_frame {
+       const void __user       *next_fp;
+       unsigned long           ret_addr;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+       int ret;
+
+       if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+               return 0;
+
+       ret = 1;
+       pagefault_disable();
+       if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+               ret = 0;
+       pagefault_enable();
+
+       return ret;
+}
+
+static inline void __save_stack_trace_user(struct stack_trace *trace)
+{
+       const struct pt_regs *regs = task_pt_regs(current);
+       const void __user *fp = (const void __user *)regs->bp;
+
+       if (trace->nr_entries < trace->max_entries)
+               trace->entries[trace->nr_entries++] = regs->ip;
+
+       while (trace->nr_entries < trace->max_entries) {
+               struct stack_frame frame;
+
+               frame.next_fp = NULL;
+               frame.ret_addr = 0;
+               if (!copy_stack_frame(fp, &frame))
+                       break;
+               if ((unsigned long)fp < regs->sp)
+                       break;
+               if (frame.ret_addr) {
+                       trace->entries[trace->nr_entries++] =
+                               frame.ret_addr;
+               }
+               if (fp == frame.next_fp)
+                       break;
+               fp = frame.next_fp;
+       }
+}
+
+void save_stack_trace_user(struct stack_trace *trace)
+{
+       /*
+        * Trace user stack if we are not a kernel thread
+        */
+       if (current->mm) {
+               __save_stack_trace_user(trace);
+       }
+       if (trace->nr_entries < trace->max_entries)
+               trace->entries[trace->nr_entries++] = ULONG_MAX;
+}
+
index f4049f3513b6b5e01e71523a5490fbedd5db9992..174ea90d1cbd5cd281b0c0eac6bf2ebb3e82bcfe 100644 (file)
@@ -164,7 +164,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
         * We have to send the IPI only to
         * CPUs affected.
         */
-       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+       send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
 
        while (!cpus_empty(flush_cpumask))
                /* nothing. lockup detection does not belong here */
index 8f919ca69494d62541453202b27f4ec13903d2f8..de6f1bda0c50e522fd05f8d40b4ce27d445f4ea0 100644 (file)
@@ -191,7 +191,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
         * We have to send the IPI only to
         * CPUs affected.
         */
-       send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
+       send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
 
        while (!cpus_empty(f->flush_cpumask))
                cpu_relax();
index 254ee07f8635007262591bbdf595f819caa1358d..c4c1f9e094027089373189f7e57cb7ede8149be9 100644 (file)
@@ -226,7 +226,7 @@ static void __devinit vmi_time_init_clockevent(void)
        /* Upper bound is clockevent's use of ulong for cycle deltas. */
        evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
        evt->min_delta_ns = clockevent_delta2ns(1, evt);
-       evt->cpumask = cpumask_of_cpu(cpu);
+       evt->cpumask = cpumask_of(cpu);
 
        printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n",
               evt->name, evt->mult, evt->shift);
index 0b8b6690a86d184959703177b9cb6011bfd605ca..6f3d3d4cd97338162e6889f4eaae86b744f2a2ab 100644 (file)
@@ -17,6 +17,9 @@
  *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
  */
 
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
index a5d8e1ace1cf700922d1b7268401e4a4feed52c2..104c8220a383f0c736abf536ee56fae735ec3b2d 100644 (file)
@@ -737,7 +737,7 @@ static void lguest_time_init(void)
 
        /* We can't set cpumask in the initializer: damn C limitations!  Set it
         * here and register our timer device. */
-       lguest_clockevent.cpumask = cpumask_of_cpu(0);
+       lguest_clockevent.cpumask = cpumask_of(0);
        clockevents_register_device(&lguest_clockevent);
 
        /* Finally, we unblock the timer interrupt. */
index 9e68075544f6dbb5e9a6fbc002e3a2df3091dc6c..4a20b2f9a381a360b46246c2c21c941258c1367a 100644 (file)
@@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
 #define __do_strncpy_from_user(dst, src, count, res)                      \
 do {                                                                      \
        int __d0, __d1, __d2;                                              \
-       might_sleep();                                                     \
+       might_fault();                                                     \
        __asm__ __volatile__(                                              \
                "       testl %1,%1\n"                                     \
                "       jz 2f\n"                                           \
@@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user);
 #define __do_clear_user(addr,size)                                     \
 do {                                                                   \
        int __d0;                                                       \
-       might_sleep();                                                  \
+       might_fault();                                                  \
        __asm__ __volatile__(                                           \
                "0:     rep; stosl\n"                                   \
                "       movl %2,%0\n"                                   \
@@ -155,7 +155,7 @@ do {                                                                        \
 unsigned long
 clear_user(void __user *to, unsigned long n)
 {
-       might_sleep();
+       might_fault();
        if (access_ok(VERIFY_WRITE, to, n))
                __do_clear_user(to, n);
        return n;
@@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n)
        unsigned long mask = -__addr_ok(s);
        unsigned long res, tmp;
 
-       might_sleep();
+       might_fault();
 
        __asm__ __volatile__(
                "       testl %0, %0\n"
index f4df6e7c718be506a59ef157bda9591dfb1790a6..64d6c84e6353e9d50ce3d7f7cda6ee7178ca3494 100644 (file)
@@ -15,7 +15,7 @@
 #define __do_strncpy_from_user(dst,src,count,res)                         \
 do {                                                                      \
        long __d0, __d1, __d2;                                             \
-       might_sleep();                                                     \
+       might_fault();                                                     \
        __asm__ __volatile__(                                              \
                "       testq %1,%1\n"                                     \
                "       jz 2f\n"                                           \
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user);
 unsigned long __clear_user(void __user *addr, unsigned long size)
 {
        long __d0;
-       might_sleep();
+       might_fault();
        /* no memory constraint because it doesn't change any memory gcc knows
           about */
        asm volatile(
index 3c3b471ea496225e5a53029b92cfd5eeee658e43..bc4c7840b2a8daad937ccf34751f3e2718e9781c 100644 (file)
@@ -17,6 +17,7 @@
 #include <asm/bigsmp/apic.h>
 #include <asm/bigsmp/ipi.h>
 #include <asm/mach-default/mach_mpparse.h>
+#include <asm/mach-default/mach_wakecpu.h>
 
 static int dmi_bigsmp; /* can be set by dmi scanners */
 
@@ -41,9 +42,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
         { }
 };
 
-static cpumask_t vector_allocation_domain(int cpu)
+static void vector_allocation_domain(int cpu, cpumask_t *retmask)
 {
-        return cpumask_of_cpu(cpu);
+       cpus_clear(*retmask);
+       cpu_set(cpu, *retmask);
 }
 
 static int probe_bigsmp(void)
index 9e835a11a13a7cba9c16f2de4e5db0e54d9ecbc5..e63a4a76d8cdd2a966aec1299d941b5e5fb7196b 100644 (file)
@@ -16,6 +16,7 @@
 #include <asm/mach-default/mach_apic.h>
 #include <asm/mach-default/mach_ipi.h>
 #include <asm/mach-default/mach_mpparse.h>
+#include <asm/mach-default/mach_wakecpu.h>
 
 /* should be called last. */
 static int probe_default(void)
index 28459cab3ddb5fdae0d560e5f766b141412e00bb..4ba5ccaa15842f358c5598b63536c06b51e637c7 100644 (file)
 #include <asm/es7000/apic.h>
 #include <asm/es7000/ipi.h>
 #include <asm/es7000/mpparse.h>
-#include <asm/es7000/wakecpu.h>
+#include <asm/mach-default/mach_wakecpu.h>
+
+void __init es7000_update_genapic_to_cluster(void)
+{
+       genapic->target_cpus = target_cpus_cluster;
+       genapic->int_delivery_mode = INT_DELIVERY_MODE_CLUSTER;
+       genapic->int_dest_mode = INT_DEST_MODE_CLUSTER;
+       genapic->no_balance_irq = NO_BALANCE_IRQ_CLUSTER;
+
+       genapic->init_apic_ldr = init_apic_ldr_cluster;
+
+       genapic->cpu_mask_to_apicid = cpu_mask_to_apicid_cluster;
+}
 
 static int probe_es7000(void)
 {
@@ -75,7 +87,7 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 }
 #endif
 
-static cpumask_t vector_allocation_domain(int cpu)
+static void vector_allocation_domain(int cpu, cpumask_t *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -85,8 +97,7 @@ static cpumask_t vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-       cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
-       return domain;
+       *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
 }
 
 struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
index 71a309b122e672ef948fbe125f3637588ecf1149..511d7941364f8e268e0e00505ef63201c2247749 100644 (file)
@@ -38,7 +38,7 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
        return 0;
 }
 
-static cpumask_t vector_allocation_domain(int cpu)
+static void vector_allocation_domain(int cpu, cpumask_t *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -48,8 +48,7 @@ static cpumask_t vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-       cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
-       return domain;
+       *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
 }
 
 struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
index 5a7e4619e1c47589fc6745d5843c7ef743170318..c346d9d0226f313129917b091e4a4e94bcde6105 100644 (file)
@@ -15,6 +15,7 @@
 #include <asm/mpspec.h>
 #include <asm/apicdef.h>
 #include <asm/genapic.h>
+#include <asm/setup.h>
 
 extern struct genapic apic_numaq;
 extern struct genapic apic_summit;
@@ -57,6 +58,9 @@ static int __init parse_apic(char *arg)
                }
        }
 
+       if (x86_quirks->update_genapic)
+               x86_quirks->update_genapic();
+
        /* Parsed again by __setup for debug/verbose */
        return 0;
 }
@@ -72,12 +76,15 @@ void __init generic_bigsmp_probe(void)
         * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
         */
 
-       if (!cmdline_apic && genapic == &apic_default)
+       if (!cmdline_apic && genapic == &apic_default) {
                if (apic_bigsmp.probe()) {
                        genapic = &apic_bigsmp;
+                       if (x86_quirks->update_genapic)
+                               x86_quirks->update_genapic();
                        printk(KERN_INFO "Overriding APIC driver with %s\n",
                               genapic->name);
                }
+       }
 #endif
 }
 
@@ -94,6 +101,9 @@ void __init generic_apic_probe(void)
                /* Not visible without early console */
                if (!apic_probe[i])
                        panic("Didn't find an APIC driver");
+
+               if (x86_quirks->update_genapic)
+                       x86_quirks->update_genapic();
        }
        printk(KERN_INFO "Using APIC driver %s\n", genapic->name);
 }
@@ -108,6 +118,8 @@ int __init mps_oem_check(struct mp_config_table *mpc, char *oem,
                if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) {
                        if (!cmdline_apic) {
                                genapic = apic_probe[i];
+                               if (x86_quirks->update_genapic)
+                                       x86_quirks->update_genapic();
                                printk(KERN_INFO "Switched to APIC driver `%s'.\n",
                                       genapic->name);
                        }
@@ -124,6 +136,8 @@ int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
                if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
                        if (!cmdline_apic) {
                                genapic = apic_probe[i];
+                               if (x86_quirks->update_genapic)
+                                       x86_quirks->update_genapic();
                                printk(KERN_INFO "Switched to APIC driver `%s'.\n",
                                       genapic->name);
                        }
index 6272b5e69da62b28f660105d9a8788e4944432b3..2821ffc188b57d9c1cb5a54d69a63fade0c6c198 100644 (file)
@@ -16,6 +16,7 @@
 #include <asm/summit/apic.h>
 #include <asm/summit/ipi.h>
 #include <asm/summit/mpparse.h>
+#include <asm/mach-default/mach_wakecpu.h>
 
 static int probe_summit(void)
 {
@@ -23,7 +24,7 @@ static int probe_summit(void)
        return 0;
 }
 
-static cpumask_t vector_allocation_domain(int cpu)
+static void vector_allocation_domain(int cpu, cpumask_t *retmask)
 {
        /* Careful. Some cpus do not strictly honor the set of cpus
         * specified in the interrupt destination when using lowest
@@ -33,8 +34,7 @@ static cpumask_t vector_allocation_domain(int cpu)
         * deliver interrupts to the wrong hyperthread when only one
         * hyperthread was specified in the interrupt desitination.
         */
-       cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
-       return domain;
+       *retmask = (cpumask_t){ { [0] = APIC_ALL_CPUS, } };
 }
 
 struct genapic apic_summit = APIC_INIT("summit", probe_summit);
index 52145007bd7efb8e99a9516ddeb09af6ea1e445b..a5bc05492b1ee17c3cdfc04894cfe706047ef4f7 100644 (file)
@@ -63,11 +63,6 @@ static int voyager_extended_cpus = 1;
 /* Used for the invalidate map that's also checked in the spinlock */
 static volatile unsigned long smp_invalidate_needed;
 
-/* Bitmask of currently online CPUs - used by setup.c for
-   /proc/cpuinfo, visible externally but still physical */
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_online_map);
-
 /* Bitmask of CPUs present in the system - exported by i386_syms.c, used
  * by scheduler but indexed physically */
 cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
@@ -218,8 +213,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
 /* This is for the new dynamic CPU boot code */
 cpumask_t cpu_callin_map = CPU_MASK_NONE;
 cpumask_t cpu_callout_map = CPU_MASK_NONE;
-cpumask_t cpu_possible_map = CPU_MASK_NONE;
-EXPORT_SYMBOL(cpu_possible_map);
 
 /* The per processor IRQ masks (these are usually kept in sync) */
 static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
@@ -679,7 +672,7 @@ void __init smp_boot_cpus(void)
 
        /* loop over all the extended VIC CPUs and boot them.  The
         * Quad CPUs must be bootstrapped by their extended VIC cpu */
-       for (i = 0; i < NR_CPUS; i++) {
+       for (i = 0; i < nr_cpu_ids; i++) {
                if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map))
                        continue;
                do_boot_cpu(i);
index fea4565ff576b9f52f76d9286ece00ed52a4f141..d8cc96a2738f739a9f5dba76f77ee8c9a0885e6a 100644 (file)
@@ -8,9 +8,8 @@ obj-$(CONFIG_X86_PTDUMP)        += dump_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)          += highmem_32.o
 
-obj-$(CONFIG_MMIOTRACE_HOOKS)  += kmmio.o
 obj-$(CONFIG_MMIOTRACE)                += mmiotrace.o
-mmiotrace-y                    := pf_in.o mmio-mod.o
+mmiotrace-y                    := kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)   += testmmiotrace.o
 
 obj-$(CONFIG_NUMA)             += numa_$(BITS).o
index 31e8730fa2463214f36c2f6b3df9d0f75f6be346..21e996a70d68f9a85cc5abc88bc36f2f0d43b1a1 100644 (file)
@@ -53,7 +53,7 @@
 
 static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
 {
-#ifdef CONFIG_MMIOTRACE_HOOKS
+#ifdef CONFIG_MMIOTRACE
        if (unlikely(is_kmmio_active()))
                if (kmmio_handler(regs, addr) == 1)
                        return -1;
@@ -413,6 +413,7 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
                                 unsigned long error_code)
 {
        unsigned long flags = oops_begin();
+       int sig = SIGKILL;
        struct task_struct *tsk;
 
        printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
@@ -423,8 +424,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
        tsk->thread.trap_no = 14;
        tsk->thread.error_code = error_code;
        if (__die("Bad pagetable", regs, error_code))
-               regs = NULL;
-       oops_end(flags, regs, SIGKILL);
+               sig = 0;
+       oops_end(flags, regs, sig);
 }
 #endif
 
@@ -590,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
        int fault;
 #ifdef CONFIG_X86_64
        unsigned long flags;
+       int sig;
 #endif
 
        tsk = current;
@@ -849,11 +851,12 @@ no_context:
        bust_spinlocks(0);
        do_exit(SIGKILL);
 #else
+       sig = SIGKILL;
        if (__die("Oops", regs, error_code))
-               regs = NULL;
+               sig = 0;
        /* Executive summary in case the body of the oops scrolled away */
        printk(KERN_EMERG "CR2: %016lx\n", address);
-       oops_end(flags, regs, SIGKILL);
+       oops_end(flags, regs, sig);
 #endif
 
 /*
index cebcbf152d46b06b725b525debc573f3d91da71e..71a14f89f89e7367c72f2de85da6708acc83c069 100644 (file)
@@ -278,7 +278,7 @@ void __init numa_init_array(void)
        int rr, i;
 
        rr = first_node(node_online_map);
-       for (i = 0; i < NR_CPUS; i++) {
+       for (i = 0; i < nr_cpu_ids; i++) {
                if (early_cpu_to_node(i) != NUMA_NO_NODE)
                        continue;
                numa_set_node(i, rr);
@@ -549,7 +549,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
        memnodemap[0] = 0;
        node_set_online(0);
        node_set(0, node_possible_map);
-       for (i = 0; i < NR_CPUS; i++)
+       for (i = 0; i < nr_cpu_ids; i++)
                numa_set_node(i, 0);
        e820_register_active_regions(0, start_pfn, last_pfn);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
index 51c0a2fc14fe05521def4af57c0e2d1d29c8359d..09737c8af07479020e7183fef611c376e4fc1dad 100644 (file)
@@ -382,7 +382,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
                if (!node_online(i))
                        setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 
-       for (i = 0; i < NR_CPUS; i++) {
+       for (i = 0; i < nr_cpu_ids; i++) {
                int node = early_cpu_to_node(i);
 
                if (node == NUMA_NO_NODE)
index 9915293500fb69ebdae0b8d80b69155f180f1274..9a5af6c8fbe9fc445fdca3358b90666c9ecae7bb 100644 (file)
@@ -173,7 +173,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
 
 #undef PCI_CONF2_ADDRESS
 
-static struct pci_raw_ops pci_direct_conf2 = {
+struct pci_raw_ops pci_direct_conf2 = {
        .read =         pci_conf2_read,
        .write =        pci_conf2_write,
 };
@@ -289,6 +289,7 @@ int __init pci_direct_probe(void)
 
        if (pci_check_type1()) {
                raw_pci_ops = &pci_direct_conf1;
+               port_cf9_safe = true;
                return 1;
        }
        release_resource(region);
@@ -305,6 +306,7 @@ int __init pci_direct_probe(void)
 
        if (pci_check_type2()) {
                raw_pci_ops = &pci_direct_conf2;
+               port_cf9_safe = true;
                return 2;
        }
 
index 15b9cf6be729c0c7cddee3ff54d1ce9809f32d58..1959018aac025ea3a38048973e6cc7a107396684 100644 (file)
@@ -96,6 +96,7 @@ extern struct pci_raw_ops *raw_pci_ops;
 extern struct pci_raw_ops *raw_pci_ext_ops;
 
 extern struct pci_raw_ops pci_direct_conf1;
+extern bool port_cf9_safe;
 
 /* arch_initcall level */
 extern int pci_direct_probe(void);
index 1ef0f90813d626ed6be436b93d3d5b6550dbb392..d9d35824c56f30e56266445cdf9ed5877f94bf52 100644 (file)
@@ -9,6 +9,9 @@
  * Also alternative() doesn't work.
  */
 
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/kernel.h>
 #include <linux/posix-timers.h>
 #include <linux/time.h>
index 636ef4caa52d3dee2a24fc3b1913fe23b8ab34db..e59e53b11e2b3d725a6920d2b606cfc05805f705 100644 (file)
@@ -1079,7 +1079,7 @@ static void drop_other_mm_ref(void *info)
 
 static void xen_drop_mm_ref(struct mm_struct *mm)
 {
-       cpumask_t mask;
+       cpumask_var_t mask;
        unsigned cpu;
 
        if (current->active_mm == mm) {
@@ -1091,7 +1091,16 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
        }
 
        /* Get the "official" set of cpus referring to our pagetable. */
-       mask = mm->cpu_vm_mask;
+       if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
+               for_each_online_cpu(cpu) {
+                       if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
+                           && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+                               continue;
+                       smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
+               }
+               return;
+       }
+       cpumask_copy(mask, &mm->cpu_vm_mask);
 
        /* It's possible that a vcpu may have a stale reference to our
           cr3, because its in lazy mode, and it hasn't yet flushed
@@ -1100,11 +1109,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
           if needed. */
        for_each_online_cpu(cpu) {
                if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
-                       cpu_set(cpu, mask);
+                       cpumask_set_cpu(cpu, mask);
        }
 
-       if (!cpus_empty(mask))
-               smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
+       if (!cpumask_empty(mask))
+               smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
+       free_cpumask_var(mask);
 }
 #else
 static void xen_drop_mm_ref(struct mm_struct *mm)
index acd9b6705e024f0833614f209caf2ab0da62807a..c44e2069c7c78078ab7ee895acf817689de48443 100644 (file)
@@ -33,7 +33,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
 
-cpumask_t xen_cpu_initialized_map;
+cpumask_var_t xen_cpu_initialized_map;
 
 static DEFINE_PER_CPU(int, resched_irq);
 static DEFINE_PER_CPU(int, callfunc_irq);
@@ -158,7 +158,7 @@ static void __init xen_fill_possible_map(void)
 {
        int i, rc;
 
-       for (i = 0; i < NR_CPUS; i++) {
+       for (i = 0; i < nr_cpu_ids; i++) {
                rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
                if (rc >= 0) {
                        num_processors++;
@@ -192,11 +192,14 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
        if (xen_smp_intr_init(0))
                BUG();
 
-       xen_cpu_initialized_map = cpumask_of_cpu(0);
+       if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
+               panic("could not allocate xen_cpu_initialized_map\n");
+
+       cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
 
        /* Restrict the possible_map according to max_cpus. */
        while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
-               for (cpu = NR_CPUS - 1; !cpu_possible(cpu); cpu--)
+               for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
                        continue;
                cpu_clear(cpu, cpu_possible_map);
        }
@@ -221,7 +224,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
        struct vcpu_guest_context *ctxt;
        struct desc_struct *gdt;
 
-       if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
+       if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
                return 0;
 
        ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
@@ -408,24 +411,23 @@ static void xen_smp_send_reschedule(int cpu)
        xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
 }
 
-static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
+static void xen_send_IPI_mask(const struct cpumask *mask,
+                             enum ipi_vector vector)
 {
        unsigned cpu;
 
-       cpus_and(mask, mask, cpu_online_map);
-
-       for_each_cpu_mask_nr(cpu, mask)
+       for_each_cpu_and(cpu, mask, cpu_online_mask)
                xen_send_IPI_one(cpu, vector);
 }
 
-static void xen_smp_send_call_function_ipi(cpumask_t mask)
+static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
 {
        int cpu;
 
        xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
 
        /* Make sure other vcpus get a chance to run if they need to. */
-       for_each_cpu_mask_nr(cpu, mask) {
+       for_each_cpu(cpu, mask) {
                if (xen_vcpu_stolen(cpu)) {
                        HYPERVISOR_sched_op(SCHEDOP_yield, 0);
                        break;
@@ -435,7 +437,8 @@ static void xen_smp_send_call_function_ipi(cpumask_t mask)
 
 static void xen_smp_send_call_function_single_ipi(int cpu)
 {
-       xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
+       xen_send_IPI_mask(cpumask_of(cpu),
+                         XEN_CALL_FUNCTION_SINGLE_VECTOR);
 }
 
 static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
index 2a234db5949beb39e21e5fa09089c3740d802940..212ffe012b76656231834f2a9026db28d289e080 100644 (file)
@@ -35,7 +35,8 @@ void xen_post_suspend(int suspend_cancelled)
                        pfn_to_mfn(xen_start_info->console.domU.mfn);
        } else {
 #ifdef CONFIG_SMP
-               xen_cpu_initialized_map = cpu_online_map;
+               BUG_ON(xen_cpu_initialized_map == NULL);
+               cpumask_copy(xen_cpu_initialized_map, cpu_online_mask);
 #endif
                xen_vcpu_restore();
        }
index c9f7cda48ed78ecbe1b421c540bbe2b777681400..65d75a6be0ba586eeaf22faffd466b1ec109eb5d 100644 (file)
@@ -437,7 +437,7 @@ void xen_setup_timer(int cpu)
        evt = &per_cpu(xen_clock_events, cpu);
        memcpy(evt, xen_clockevent, sizeof(*evt));
 
-       evt->cpumask = cpumask_of_cpu(cpu);
+       evt->cpumask = cpumask_of(cpu);
        evt->irq = irq;
 
        setup_runstate_info(cpu);
index 9e1afae8461f1eabc04e625c06d9ef329a73b8f2..c1f8faf0a2c5a49d0676e24aaad161323bec6135 100644 (file)
@@ -58,7 +58,7 @@ void __init xen_init_spinlocks(void);
 __cpuinit void xen_init_lock_cpu(int cpu);
 void xen_uninit_lock_cpu(int cpu);
 
-extern cpumask_t xen_cpu_initialized_map;
+extern cpumask_var_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
 #endif
index 1ab7c15c8d7a58bc2df639a759a0b719bb2c25da..290b219fad9c5a1b825d495c8ea427bda7b04c5c 100644 (file)
@@ -47,6 +47,7 @@ config BLK_DEV_IO_TRACE
        depends on SYSFS
        select RELAY
        select DEBUG_FS
+       select TRACEPOINTS
        help
          Say Y here if you want to be able to trace the block layer actions
          on a given queue. Tracing allows you to see any traffic happening
index c36aa98fafa3e4ed94adcf3e462945d59b0c1fcd..561e8a1b43a4a526e405905c69cf8f0e15a4df27 100644 (file)
 #include <linux/task_io_accounting_ops.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
+#include <trace/block.h>
 
 #include "blk.h"
 
+DEFINE_TRACE(block_plug);
+DEFINE_TRACE(block_unplug_io);
+DEFINE_TRACE(block_unplug_timer);
+DEFINE_TRACE(block_getrq);
+DEFINE_TRACE(block_sleeprq);
+DEFINE_TRACE(block_rq_requeue);
+DEFINE_TRACE(block_bio_backmerge);
+DEFINE_TRACE(block_bio_frontmerge);
+DEFINE_TRACE(block_bio_queue);
+DEFINE_TRACE(block_rq_complete);
+DEFINE_TRACE(block_remap);     /* Also used in drivers/md/dm.c */
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+
 static int __make_request(struct request_queue *q, struct bio *bio);
 
 /*
@@ -205,7 +219,7 @@ void blk_plug_device(struct request_queue *q)
 
        if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
                mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-               blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+               trace_block_plug(q);
        }
 }
 EXPORT_SYMBOL(blk_plug_device);
@@ -292,9 +306,7 @@ void blk_unplug_work(struct work_struct *work)
        struct request_queue *q =
                container_of(work, struct request_queue, unplug_work);
 
-       blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
-                               q->rq.count[READ] + q->rq.count[WRITE]);
-
+       trace_block_unplug_io(q);
        q->unplug_fn(q);
 }
 
@@ -302,9 +314,7 @@ void blk_unplug_timeout(unsigned long data)
 {
        struct request_queue *q = (struct request_queue *)data;
 
-       blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
-                               q->rq.count[READ] + q->rq.count[WRITE]);
-
+       trace_block_unplug_timer(q);
        kblockd_schedule_work(q, &q->unplug_work);
 }
 
@@ -314,9 +324,7 @@ void blk_unplug(struct request_queue *q)
         * devices don't necessarily have an ->unplug_fn defined
         */
        if (q->unplug_fn) {
-               blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
-                                       q->rq.count[READ] + q->rq.count[WRITE]);
-
+               trace_block_unplug_io(q);
                q->unplug_fn(q);
        }
 }
@@ -822,7 +830,7 @@ rq_starved:
        if (ioc_batching(q, ioc))
                ioc->nr_batch_requests--;
 
-       blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+       trace_block_getrq(q, bio, rw);
 out:
        return rq;
 }
@@ -848,7 +856,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
                prepare_to_wait_exclusive(&rl->wait[rw], &wait,
                                TASK_UNINTERRUPTIBLE);
 
-               blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+               trace_block_sleeprq(q, bio, rw);
 
                __generic_unplug_device(q);
                spin_unlock_irq(q->queue_lock);
@@ -928,7 +936,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
        blk_delete_timer(rq);
        blk_clear_rq_complete(rq);
-       blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+       trace_block_rq_requeue(q, rq);
 
        if (blk_rq_tagged(rq))
                blk_queue_end_tag(q, rq);
@@ -1167,7 +1175,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
                if (!ll_back_merge_fn(q, req, bio))
                        break;
 
-               blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+               trace_block_bio_backmerge(q, bio);
 
                req->biotail->bi_next = bio;
                req->biotail = bio;
@@ -1186,7 +1194,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
                if (!ll_front_merge_fn(q, req, bio))
                        break;
 
-               blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+               trace_block_bio_frontmerge(q, bio);
 
                bio->bi_next = req->bio;
                req->bio = bio;
@@ -1269,7 +1277,7 @@ static inline void blk_partition_remap(struct bio *bio)
                bio->bi_sector += p->start_sect;
                bio->bi_bdev = bdev->bd_contains;
 
-               blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
+               trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
                                    bdev->bd_dev, bio->bi_sector,
                                    bio->bi_sector - p->start_sect);
        }
@@ -1441,10 +1449,10 @@ end_io:
                        goto end_io;
 
                if (old_sector != -1)
-                       blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
+                       trace_block_remap(q, bio, old_dev, bio->bi_sector,
                                            old_sector);
 
-               blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+               trace_block_bio_queue(q, bio);
 
                old_sector = bio->bi_sector;
                old_dev = bio->bi_bdev->bd_dev;
@@ -1678,7 +1686,7 @@ static int __end_that_request_first(struct request *req, int error,
        int total_bytes, bio_nbytes, next_idx = 0;
        struct bio *bio;
 
-       blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+       trace_block_rq_complete(req->q, req);
 
        /*
         * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
index 85049a7e7a179a97c283eb4ebe6c1fe7285f80cf..b0a2cae886dbdc4dde9a2efa99345ea2462a54b0 100644 (file)
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
+#include <trace/block.h>
 #include <asm/uaccess.h>
 
 static unsigned int blktrace_seq __read_mostly = 1;
 
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+
+static int blk_register_tracepoints(void);
+static void blk_unregister_tracepoints(void);
+
 /*
  * Send out a notify message.
  */
@@ -119,7 +127,7 @@ static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK
  * The worker for the various blk_add_trace*() types. Fills out a
  * blk_io_trace structure and places it in a per-cpu subbuffer.
  */
-void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
                     int rw, u32 what, int error, int pdu_len, void *pdu_data)
 {
        struct task_struct *tsk = current;
@@ -177,8 +185,6 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
        local_irq_restore(flags);
 }
 
-EXPORT_SYMBOL_GPL(__blk_add_trace);
-
 static struct dentry *blk_tree_root;
 static DEFINE_MUTEX(blk_tree_mutex);
 static unsigned int root_users;
@@ -237,6 +243,10 @@ static void blk_trace_cleanup(struct blk_trace *bt)
        free_percpu(bt->sequence);
        free_percpu(bt->msg_data);
        kfree(bt);
+       mutex_lock(&blk_probe_mutex);
+       if (atomic_dec_and_test(&blk_probes_ref))
+               blk_unregister_tracepoints();
+       mutex_unlock(&blk_probe_mutex);
 }
 
 int blk_trace_remove(struct request_queue *q)
@@ -428,6 +438,14 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
        bt->pid = buts->pid;
        bt->trace_state = Blktrace_setup;
 
+       mutex_lock(&blk_probe_mutex);
+       if (atomic_add_return(1, &blk_probes_ref) == 1) {
+               ret = blk_register_tracepoints();
+               if (ret)
+                       goto probe_err;
+       }
+       mutex_unlock(&blk_probe_mutex);
+
        ret = -EBUSY;
        old_bt = xchg(&q->blk_trace, bt);
        if (old_bt) {
@@ -436,6 +454,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
        }
 
        return 0;
+probe_err:
+       atomic_dec(&blk_probes_ref);
+       mutex_unlock(&blk_probe_mutex);
 err:
        if (dir)
                blk_remove_tree(dir);
@@ -562,3 +583,308 @@ void blk_trace_shutdown(struct request_queue *q)
                blk_trace_remove(q);
        }
 }
+
+/*
+ * blktrace probes
+ */
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q:         queue the io is for
+ * @rq:                the source request
+ * @what:      the action
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+                                   u32 what)
+{
+       struct blk_trace *bt = q->blk_trace;
+       int rw = rq->cmd_flags & 0x03;
+
+       if (likely(!bt))
+               return;
+
+       if (blk_discard_rq(rq))
+               rw |= (1 << BIO_RW_DISCARD);
+
+       if (blk_pc_request(rq)) {
+               what |= BLK_TC_ACT(BLK_TC_PC);
+               __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
+                               sizeof(rq->cmd), rq->cmd);
+       } else  {
+               what |= BLK_TC_ACT(BLK_TC_FS);
+               __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+                               rw, what, rq->errors, 0, NULL);
+       }
+}
+
+static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+{
+       blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+}
+
+static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+{
+       blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+}
+
+static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+{
+       blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+}
+
+static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq)
+{
+       blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+}
+
+static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq)
+{
+       blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q:         queue the io is for
+ * @bio:       the source bio
+ * @what:      the action
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+                                    u32 what)
+{
+       struct blk_trace *bt = q->blk_trace;
+
+       if (likely(!bt))
+               return;
+
+       __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+                       !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+{
+       blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+}
+
+static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+{
+       blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+}
+
+static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio)
+{
+       blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+}
+
+static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio)
+{
+       blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+}
+
+static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+{
+       blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+}
+
+static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw)
+{
+       if (bio)
+               blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+       else {
+               struct blk_trace *bt = q->blk_trace;
+
+               if (bt)
+                       __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+       }
+}
+
+
+static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw)
+{
+       if (bio)
+               blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+       else {
+               struct blk_trace *bt = q->blk_trace;
+
+               if (bt)
+                       __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL);
+       }
+}
+
+static void blk_add_trace_plug(struct request_queue *q)
+{
+       struct blk_trace *bt = q->blk_trace;
+
+       if (bt)
+               __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+}
+
+static void blk_add_trace_unplug_io(struct request_queue *q)
+{
+       struct blk_trace *bt = q->blk_trace;
+
+       if (bt) {
+               unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+               __be64 rpdu = cpu_to_be64(pdu);
+
+               __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
+                               sizeof(rpdu), &rpdu);
+       }
+}
+
+static void blk_add_trace_unplug_timer(struct request_queue *q)
+{
+       struct blk_trace *bt = q->blk_trace;
+
+       if (bt) {
+               unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+               __be64 rpdu = cpu_to_be64(pdu);
+
+               __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
+                               sizeof(rpdu), &rpdu);
+       }
+}
+
+static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+                               unsigned int pdu)
+{
+       struct blk_trace *bt = q->blk_trace;
+
+       if (bt) {
+               __be64 rpdu = cpu_to_be64(pdu);
+
+               __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+                               BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
+                               sizeof(rpdu), &rpdu);
+       }
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * @q:         queue the io is for
+ * @bio:       the source bio
+ * @dev:       target device
+ * @from:      source sector
+ * @to:                target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+                                      dev_t dev, sector_t from, sector_t to)
+{
+       struct blk_trace *bt = q->blk_trace;
+       struct blk_io_trace_remap r;
+
+       if (likely(!bt))
+               return;
+
+       r.device = cpu_to_be32(dev);
+       r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
+       r.sector = cpu_to_be64(to);
+
+       __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
+                       !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+/**
+ * blk_add_driver_data - Add binary message with driver-specific data
+ * @q:         queue the io is for
+ * @rq:                io request
+ * @data:      driver-specific data
+ * @len:       length of driver-specific data
+ *
+ * Description:
+ *     Some drivers might want to write driver-specific data per request.
+ *
+ **/
+void blk_add_driver_data(struct request_queue *q,
+                        struct request *rq,
+                        void *data, size_t len)
+{
+       struct blk_trace *bt = q->blk_trace;
+
+       if (likely(!bt))
+               return;
+
+       if (blk_pc_request(rq))
+               __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
+                               rq->errors, len, data);
+       else
+               __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+                               0, BLK_TA_DRV_DATA, rq->errors, len, data);
+}
+EXPORT_SYMBOL_GPL(blk_add_driver_data);
+
+static int blk_register_tracepoints(void)
+{
+       int ret;
+
+       ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+       WARN_ON(ret);
+       ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+       WARN_ON(ret);
+       ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+       WARN_ON(ret);
+       ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+       WARN_ON(ret);
+       ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+       WARN_ON(ret);
+       ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+       WARN_ON(ret);
+       ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+       WARN_ON(ret);
+       ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+       WARN_ON(ret);
+       ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+       WARN_ON(ret);
+       ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+       WARN_ON(ret);
+       ret = register_trace_block_getrq(blk_add_trace_getrq);
+       WARN_ON(ret);
+       ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+       WARN_ON(ret);
+       ret = register_trace_block_plug(blk_add_trace_plug);
+       WARN_ON(ret);
+       ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+       WARN_ON(ret);
+       ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+       WARN_ON(ret);
+       ret = register_trace_block_split(blk_add_trace_split);
+       WARN_ON(ret);
+       ret = register_trace_block_remap(blk_add_trace_remap);
+       WARN_ON(ret);
+       return 0;
+}
+
+static void blk_unregister_tracepoints(void)
+{
+       unregister_trace_block_remap(blk_add_trace_remap);
+       unregister_trace_block_split(blk_add_trace_split);
+       unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
+       unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+       unregister_trace_block_plug(blk_add_trace_plug);
+       unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
+       unregister_trace_block_getrq(blk_add_trace_getrq);
+       unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
+       unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+       unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+       unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
+       unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+       unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
+       unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+       unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
+       unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
+       unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+
+       tracepoint_synchronize_unregister();
+}
index a6951f76ba0c3fa3e9a0c9f8ff58a108e63dc85c..86836dd179c05476078719869a0481777e999bc1 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 
@@ -41,6 +42,8 @@
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
 
+DEFINE_TRACE(block_rq_abort);
+
 /*
  * Merge hash stuff.
  */
@@ -52,6 +55,9 @@ static const int elv_hash_shift = 6;
 #define rq_hash_key(rq)                ((rq)->sector + (rq)->nr_sectors)
 #define ELV_ON_HASH(rq)                (!hlist_unhashed(&(rq)->hash))
 
+DEFINE_TRACE(block_rq_insert);
+DEFINE_TRACE(block_rq_issue);
+
 /*
  * Query io scheduler to see if the current process issuing bio may be
  * merged with rq.
@@ -586,7 +592,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
        unsigned ordseq;
        int unplug_it = 1;
 
-       blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+       trace_block_rq_insert(q, rq);
 
        rq->q = q;
 
@@ -772,7 +778,7 @@ struct request *elv_next_request(struct request_queue *q)
                         * not be passed by new incoming requests
                         */
                        rq->cmd_flags |= REQ_STARTED;
-                       blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+                       trace_block_rq_issue(q, rq);
                }
 
                if (!q->boundary_rq || q->boundary_rq == rq) {
@@ -914,7 +920,7 @@ void elv_abort_queue(struct request_queue *q)
        while (!list_empty(&q->queue_head)) {
                rq = list_entry_rq(q->queue_head.next);
                rq->cmd_flags |= REQ_QUIET;
-               blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+               trace_block_rq_abort(q, rq);
                __blk_end_request(rq, -EIO, blk_rq_bytes(rq));
        }
 }
index 64f5d54f7edcee095ae8ead0a2f29d032ce546cd..4259072f5bd0960e113bb97d0f8aaa1818a8ab58 100644 (file)
@@ -109,7 +109,7 @@ static SYSDEV_ATTR(crash_notes, 0400, show_crash_notes, NULL);
  */
 static ssize_t print_cpus_map(char *buf, cpumask_t *map)
 {
-       int n = cpulist_scnprintf(buf, PAGE_SIZE-2, *map);
+       int n = cpulist_scnprintf(buf, PAGE_SIZE-2, map);
 
        buf[n++] = '\n';
        buf[n] = '\0';
index f5207090885a8fc80f28b670b95a6fa13ca87418..91636cd8b6c9ad2e12c2bb0a4925397b6b0c454b 100644 (file)
@@ -30,8 +30,8 @@ static ssize_t node_read_cpumap(struct sys_device *dev, int type, char *buf)
        BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1));
 
        len = type?
-               cpulist_scnprintf(buf, PAGE_SIZE-2, *mask):
-               cpumask_scnprintf(buf, PAGE_SIZE-2, *mask);
+               cpulist_scnprintf(buf, PAGE_SIZE-2, mask) :
+               cpumask_scnprintf(buf, PAGE_SIZE-2, mask);
        buf[len++] = '\n';
        buf[len] = '\0';
        return len;
index 199cd97e32e64b2c56338b2d74eb68ab5ce01de3..a8bc1cbcfa7cb2bb9c5e00fe68b17f0644749026 100644 (file)
@@ -49,8 +49,8 @@ static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
 
        if (len > 1) {
                n = type?
-                       cpulist_scnprintf(buf, len-2, *mask):
-                       cpumask_scnprintf(buf, len-2, *mask);
+                       cpulist_scnprintf(buf, len-2, mask) :
+                       cpumask_scnprintf(buf, len-2, mask);
                buf[n++] = '\n';
                buf[n] = '\0';
        }
index 675076f5fca881d0ab3752b44c8356766dac1c3a..d26891bfcd4154effba426421322ed053acb9d9b 100644 (file)
@@ -558,23 +558,9 @@ struct timer_rand_state {
        unsigned dont_count_entropy:1;
 };
 
-static struct timer_rand_state *irq_timer_state[NR_IRQS];
-
-static struct timer_rand_state *get_timer_rand_state(unsigned int irq)
-{
-       if (irq >= nr_irqs)
-               return NULL;
-
-       return irq_timer_state[irq];
-}
-
-static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
-{
-       if (irq >= nr_irqs)
-               return;
-
-       irq_timer_state[irq] = state;
-}
+#ifndef CONFIG_SPARSE_IRQ
+struct timer_rand_state *irq_timer_state[NR_IRQS];
+#endif
 
 static struct timer_rand_state input_timer_state;
 
@@ -933,8 +919,10 @@ void rand_initialize_irq(int irq)
 {
        struct timer_rand_state *state;
 
+#ifndef CONFIG_SPARSE_IRQ
        if (irq >= nr_irqs)
                return;
+#endif
 
        state = get_timer_rand_state(irq);
 
index ce0d9da52a8ab808a24e8d30bff27d631b474713..94966edfb44dedc340d4d81f230c43a0aec9b7bf 100644 (file)
@@ -274,6 +274,22 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = {
        .enable_mask    = SYSRQ_ENABLE_DUMP,
 };
 
+#ifdef CONFIG_TRACING
+#include <linux/ftrace.h>
+
+static void sysrq_ftrace_dump(int key, struct tty_struct *tty)
+{
+       ftrace_dump();
+}
+static struct sysrq_key_op sysrq_ftrace_dump_op = {
+       .handler        = sysrq_ftrace_dump,
+       .help_msg       = "dumpZ-ftrace-buffer",
+       .action_msg     = "Dump ftrace buffer",
+       .enable_mask    = SYSRQ_ENABLE_DUMP,
+};
+#else
+#define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)0)
+#endif
 
 static void sysrq_handle_showmem(int key, struct tty_struct *tty)
 {
@@ -406,7 +422,7 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
        NULL,                           /* x */
        /* y: May be registered on sparc64 for global register dump */
        NULL,                           /* y */
-       NULL                            /* z */
+       &sysrq_ftrace_dump_op,          /* z */
 };
 
 /* key2index calculation, -1 on invalid index */
index f450588e5858884ea16964f407b825de5c673cc7..254f1064d97354993c44b6e1185d23d4bd286aab 100644 (file)
@@ -154,7 +154,6 @@ static struct tc_clkevt_device clkevt = {
                .shift          = 32,
                /* Should be lower than at91rm9200's system timer */
                .rating         = 125,
-               .cpumask        = CPU_MASK_CPU0,
                .set_next_event = tc_next_event,
                .set_mode       = tc_mode,
        },
@@ -195,6 +194,7 @@ static void __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx)
        clkevt.clkevt.max_delta_ns
                = clockevent_delta2ns(0xffff, &clkevt.clkevt);
        clkevt.clkevt.min_delta_ns = clockevent_delta2ns(1, &clkevt.clkevt) + 1;
+       clkevt.clkevt.cpumask = cpumask_of(0);
 
        setup_irq(irq, &tc_irqaction);
 
index c99e4728ff4162ed16c4a7ec99fdc0c27c8cb35f..343094c3feeb834c55f8aa1ecd3baa57890eca1d 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 
 #define DM_MSG_PREFIX "core"
 
@@ -51,6 +52,8 @@ struct dm_target_io {
        union map_info info;
 };
 
+DEFINE_TRACE(block_bio_complete);
+
 union map_info *dm_get_mapinfo(struct bio *bio)
 {
        if (bio && bio->bi_private)
@@ -504,8 +507,7 @@ static void dec_pending(struct dm_io *io, int error)
                end_io_acct(io);
 
                if (io->error != DM_ENDIO_REQUEUE) {
-                       blk_add_trace_bio(io->md->queue, io->bio,
-                                         BLK_TA_COMPLETE);
+                       trace_block_bio_complete(io->md->queue, io->bio);
 
                        bio_endio(io->bio, io->error);
                }
@@ -598,7 +600,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
        if (r == DM_MAPIO_REMAPPED) {
                /* the bio has been remapped so dispatch it */
 
-               blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
+               trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
                                    tio->io->bio->bi_bdev->bd_dev,
                                    clone->bi_sector, sector);
 
index 7beffcab274548545c15673025cbd639ce563094..9dedbbd218c3967932a83f866d5338034f664457 100644 (file)
@@ -704,16 +704,17 @@ static unsigned int iosapic_startup_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void iosapic_set_affinity_irq(unsigned int irq, cpumask_t dest)
+static void iosapic_set_affinity_irq(unsigned int irq,
+                                    const struct cpumask *dest)
 {
        struct vector_info *vi = iosapic_get_vector(irq);
        u32 d0, d1, dummy_d0;
        unsigned long flags;
 
-       if (cpu_check_affinity(irq, &dest))
+       if (cpu_check_affinity(irq, dest))
                return;
 
-       vi->txn_addr = txn_affinity_addr(irq, first_cpu(dest));
+       vi->txn_addr = txn_affinity_addr(irq, cpumask_first(dest));
 
        spin_lock_irqsave(&iosapic_lock, flags);
        /* d1 contains the destination CPU, so only want to set that
index 2de5a3238c947be89213119d9cf468d2fa59e11d..f78371b2252976efb8ab86c7eaf87a7240949ea5 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/pci.h>
 #include <linux/irq.h>
 #include <asm/io_apic.h>
+#include <asm/smp.h>
 #include <linux/intel-iommu.h>
 #include "intr_remapping.h"
 
@@ -19,17 +20,75 @@ struct irq_2_iommu {
        u8  irte_mask;
 };
 
-static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+{
+       struct irq_2_iommu *iommu;
+       int node;
+
+       node = cpu_to_node(cpu);
+
+       iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
+       printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+
+       return iommu;
+}
 
 static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 {
-       return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL;
+       struct irq_desc *desc;
+
+       desc = irq_to_desc(irq);
+
+       if (WARN_ON_ONCE(!desc))
+               return NULL;
+
+       return desc->irq_2_iommu;
+}
+
+static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+{
+       struct irq_desc *desc;
+       struct irq_2_iommu *irq_iommu;
+
+       /*
+        * alloc irq desc if not allocated already.
+        */
+       desc = irq_to_desc_alloc_cpu(irq, cpu);
+       if (!desc) {
+               printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+               return NULL;
+       }
+
+       irq_iommu = desc->irq_2_iommu;
+
+       if (!irq_iommu)
+               desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+
+       return desc->irq_2_iommu;
 }
 
+static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
+{
+       return irq_2_iommu_alloc_cpu(irq, boot_cpu_id);
+}
+
+#else /* !CONFIG_SPARSE_IRQ */
+
+static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
+
+static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
+{
+       if (irq < nr_irqs)
+               return &irq_2_iommuX[irq];
+
+       return NULL;
+}
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
        return irq_2_iommu(irq);
 }
+#endif
 
 static DEFINE_SPINLOCK(irq_2_ir_lock);
 
@@ -86,9 +145,11 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
        if (!count)
                return -1;
 
+#ifndef CONFIG_SPARSE_IRQ
        /* protect irq_2_iommu_alloc later */
        if (irq >= nr_irqs)
                return -1;
+#endif
 
        /*
         * start the IRTE search from index 0.
@@ -130,6 +191,12 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
                table->base[i].present = 1;
 
        irq_iommu = irq_2_iommu_alloc(irq);
+       if (!irq_iommu) {
+               spin_unlock(&irq_2_ir_lock);
+               printk(KERN_ERR "can't allocate irq_2_iommu\n");
+               return -1;
+       }
+
        irq_iommu->iommu = iommu;
        irq_iommu->irte_index =  index;
        irq_iommu->sub_handle = 0;
@@ -177,6 +244,12 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
 
        irq_iommu = irq_2_iommu_alloc(irq);
 
+       if (!irq_iommu) {
+               spin_unlock(&irq_2_ir_lock);
+               printk(KERN_ERR "can't allocate irq_2_iommu\n");
+               return -1;
+       }
+
        irq_iommu->iommu = iommu;
        irq_iommu->irte_index = index;
        irq_iommu->sub_handle = subhandle;
index 74801f7df9c901ad21017ce2a831e0cd61c8af1c..11a51f8ed3b3b96ffa72820f195c6a7afa4bfa40 100644 (file)
@@ -103,11 +103,11 @@ static void msix_set_enable(struct pci_dev *dev, int enable)
        }
 }
 
-static void msix_flush_writes(unsigned int irq)
+static void msix_flush_writes(struct irq_desc *desc)
 {
        struct msi_desc *entry;
 
-       entry = get_irq_msi(irq);
+       entry = get_irq_desc_msi(desc);
        BUG_ON(!entry || !entry->dev);
        switch (entry->msi_attrib.type) {
        case PCI_CAP_ID_MSI:
@@ -135,11 +135,11 @@ static void msix_flush_writes(unsigned int irq)
  * Returns 1 if it succeeded in masking the interrupt and 0 if the device
  * doesn't support MSI masking.
  */
-static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
+static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
 {
        struct msi_desc *entry;
 
-       entry = get_irq_msi(irq);
+       entry = get_irq_desc_msi(desc);
        BUG_ON(!entry || !entry->dev);
        switch (entry->msi_attrib.type) {
        case PCI_CAP_ID_MSI:
@@ -172,9 +172,9 @@ static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
        return 1;
 }
 
-void read_msi_msg(unsigned int irq, struct msi_msg *msg)
+void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
 {
-       struct msi_desc *entry = get_irq_msi(irq);
+       struct msi_desc *entry = get_irq_desc_msi(desc);
        switch(entry->msi_attrib.type) {
        case PCI_CAP_ID_MSI:
        {
@@ -211,9 +211,16 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
        }
 }
 
-void write_msi_msg(unsigned int irq, struct msi_msg *msg)
+void read_msi_msg(unsigned int irq, struct msi_msg *msg)
 {
-       struct msi_desc *entry = get_irq_msi(irq);
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       read_msi_msg_desc(desc, msg);
+}
+
+void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
+{
+       struct msi_desc *entry = get_irq_desc_msi(desc);
        switch (entry->msi_attrib.type) {
        case PCI_CAP_ID_MSI:
        {
@@ -252,21 +259,31 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
        entry->msg = *msg;
 }
 
+void write_msi_msg(unsigned int irq, struct msi_msg *msg)
+{
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       write_msi_msg_desc(desc, msg);
+}
+
 void mask_msi_irq(unsigned int irq)
 {
-       msi_set_mask_bits(irq, 1, 1);
-       msix_flush_writes(irq);
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       msi_set_mask_bits(desc, 1, 1);
+       msix_flush_writes(desc);
 }
 
 void unmask_msi_irq(unsigned int irq)
 {
-       msi_set_mask_bits(irq, 1, 0);
-       msix_flush_writes(irq);
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       msi_set_mask_bits(desc, 1, 0);
+       msix_flush_writes(desc);
 }
 
 static int msi_free_irqs(struct pci_dev* dev);
 
-
 static struct msi_desc* alloc_msi_entry(void)
 {
        struct msi_desc *entry;
@@ -303,9 +320,11 @@ static void __pci_restore_msi_state(struct pci_dev *dev)
        pci_intx_for_msi(dev, 0);
        msi_set_enable(dev, 0);
        write_msi_msg(dev->irq, &entry->msg);
-       if (entry->msi_attrib.maskbit)
-               msi_set_mask_bits(dev->irq, entry->msi_attrib.maskbits_mask,
+       if (entry->msi_attrib.maskbit) {
+               struct irq_desc *desc = irq_to_desc(dev->irq);
+               msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask,
                                  entry->msi_attrib.masked);
+       }
 
        pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
        control &= ~PCI_MSI_FLAGS_QSIZE;
@@ -327,8 +346,9 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
        msix_set_enable(dev, 0);
 
        list_for_each_entry(entry, &dev->msi_list, list) {
+               struct irq_desc *desc = irq_to_desc(entry->irq);
                write_msi_msg(entry->irq, &entry->msg);
-               msi_set_mask_bits(entry->irq, 1, entry->msi_attrib.masked);
+               msi_set_mask_bits(desc, 1, entry->msi_attrib.masked);
        }
 
        BUG_ON(list_empty(&dev->msi_list));
@@ -596,7 +616,8 @@ void pci_msi_shutdown(struct pci_dev* dev)
        /* Return the the pci reset with msi irqs unmasked */
        if (entry->msi_attrib.maskbit) {
                u32 mask = entry->msi_attrib.maskbits_mask;
-               msi_set_mask_bits(dev->irq, mask, ~mask);
+               struct irq_desc *desc = irq_to_desc(dev->irq);
+               msi_set_mask_bits(desc, mask, ~mask);
        }
        if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
                return;
index 5d72866897a8afc3397fe7b599b74de64eecbe22..c88485860a0ad3dfff19485a417aaf112b051f08 100644 (file)
@@ -74,7 +74,7 @@ static ssize_t local_cpus_show(struct device *dev,
        int len;
 
        mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
-       len = cpumask_scnprintf(buf, PAGE_SIZE-2, mask);
+       len = cpumask_scnprintf(buf, PAGE_SIZE-2, &mask);
        buf[len++] = '\n';
        buf[len] = '\0';
        return len;
@@ -88,7 +88,7 @@ static ssize_t local_cpulist_show(struct device *dev,
        int len;
 
        mask = pcibus_to_cpumask(to_pci_dev(dev)->bus);
-       len = cpulist_scnprintf(buf, PAGE_SIZE-2, mask);
+       len = cpulist_scnprintf(buf, PAGE_SIZE-2, &mask);
        buf[len++] = '\n';
        buf[len] = '\0';
        return len;
index 003a9b3c293fd6937682d061f2b4e242ab673adc..5b3f5937ecf556429217afa10d58b0053f273269 100644 (file)
@@ -55,8 +55,8 @@ static ssize_t pci_bus_show_cpuaffinity(struct device *dev,
 
        cpumask = pcibus_to_cpumask(to_pci_bus(dev));
        ret = type?
-               cpulist_scnprintf(buf, PAGE_SIZE-2, cpumask):
-               cpumask_scnprintf(buf, PAGE_SIZE-2, cpumask);
+               cpulist_scnprintf(buf, PAGE_SIZE-2, &cpumask) :
+               cpumask_scnprintf(buf, PAGE_SIZE-2, &cpumask);
        buf[ret++] = '\n';
        buf[ret] = '\0';
        return ret;
index 1e3b934a4cf707cbe06084f21efb39874d4802f4..6c8193046e0de5f463b44fd1737a8c284cb4c895 100644 (file)
@@ -141,8 +141,12 @@ static void init_evtchn_cpu_bindings(void)
        int i;
 
        /* By default all event channels notify CPU#0. */
-       for_each_irq_desc(i, desc)
+       for_each_irq_desc(i, desc) {
+               if (!desc)
+                       continue;
+
                desc->affinity = cpumask_of_cpu(0);
+       }
 #endif
 
        memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@ -231,7 +235,7 @@ static int find_unbound_irq(void)
        int irq;
 
        /* Only allocate from dynirq range */
-       for_each_irq_nr(irq)
+       for (irq = 0; irq < nr_irqs; irq++)
                if (irq_bindcount[irq] == 0)
                        break;
 
@@ -579,7 +583,7 @@ void rebind_evtchn_irq(int evtchn, int irq)
        spin_unlock(&irq_mapping_update_lock);
 
        /* new event channels are always bound to cpu 0 */
-       irq_set_affinity(irq, cpumask_of_cpu(0));
+       irq_set_affinity(irq, cpumask_of(0));
 
        /* Unmask the event channel. */
        enable_irq(irq);
@@ -608,9 +612,9 @@ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
 }
 
 
-static void set_affinity_irq(unsigned irq, cpumask_t dest)
+static void set_affinity_irq(unsigned irq, const struct cpumask *dest)
 {
-       unsigned tcpu = first_cpu(dest);
+       unsigned tcpu = cpumask_first(dest);
        rebind_irq_to_cpu(irq, tcpu);
 }
 
@@ -792,7 +796,7 @@ void xen_irq_resume(void)
                mask_evtchn(evtchn);
 
        /* No IRQ <-> event-channel mappings. */
-       for_each_irq_nr(irq)
+       for (irq = 0; irq < nr_irqs; irq++)
                irq_info[irq].evtchn = 0; /* zap event-channel binding */
 
        for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@ -824,7 +828,7 @@ void __init xen_init_IRQ(void)
                mask_evtchn(i);
 
        /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
-       for_each_irq_nr(i)
+       for (i = 0; i < nr_irqs; i++)
                irq_bindcount[i] = 0;
 
        irq_ctx_init(smp_processor_id());
index 77a55bcceedbc6afc79f7a081c0f0af5e7c4f46e..df99c882b807549f25c30b13416a3ae8ca43d7e0 100644 (file)
--- a/fs/bio.c
+++ b/fs/bio.c
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 #include <scsi/sg.h>           /* for struct sg_iovec */
 
+DEFINE_TRACE(block_split);
+
 static struct kmem_cache *bio_slab __read_mostly;
 
 static mempool_t *bio_split_pool __read_mostly;
@@ -1263,7 +1266,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
        if (!bp)
                return bp;
 
-       blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
+       trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
                                bi->bi_sector + first_sectors);
 
        BUG_ON(bi->bi_vcnt != 1);
index 81904f07679d43635cbc6241162d9b45d9a9db62..3bb1cf1e742552ad2f97b6b6206474937ffb0f07 100644 (file)
@@ -44,10 +44,13 @@ static int show_stat(struct seq_file *p, void *v)
                softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
                steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
                guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
-               for_each_irq_nr(j)
+               for_each_irq_nr(j) {
+#ifdef CONFIG_SPARSE_IRQ
+                       if (!irq_to_desc(j))
+                               continue;
+#endif
                        sum += kstat_irqs_cpu(j, i);
-
+               }
                sum += arch_irq_stat_cpu(i);
        }
        sum += arch_irq_stat();
@@ -92,7 +95,12 @@ static int show_stat(struct seq_file *p, void *v)
        /* sum again ? it could be updated? */
        for_each_irq_nr(j) {
                per_irq_sum = 0;
-
+#ifdef CONFIG_SPARSE_IRQ
+               if (!irq_to_desc(j)) {
+                       seq_printf(p, " %u", per_irq_sum);
+                       continue;
+               }
+#endif
                for_each_possible_cpu(i)
                        per_irq_sum += kstat_irqs_cpu(j, i);
 
index eba2eabcd2b86a40495845053d9d647e228540f9..16c211558c2253f38b51af2f9eca76a2d52cb19f 100644 (file)
@@ -357,7 +357,18 @@ int seq_printf(struct seq_file *m, const char *f, ...)
 }
 EXPORT_SYMBOL(seq_printf);
 
-static char *mangle_path(char *s, char *p, char *esc)
+/**
+ *     mangle_path -   mangle and copy path to buffer beginning
+ *     @s: buffer start
+ *     @p: beginning of path in above buffer
+ *     @esc: set of characters that need escaping
+ *
+ *      Copy the path from @p to @s, replacing each occurrence of character from
+ *      @esc with usual octal escape.
+ *      Returns pointer past last written character in @s, or NULL in case of
+ *      failure.
+ */
+char *mangle_path(char *s, char *p, char *esc)
 {
        while (s <= p) {
                char c = *p++;
@@ -376,6 +387,7 @@ static char *mangle_path(char *s, char *p, char *esc)
        }
        return NULL;
 }
+EXPORT_SYMBOL(mangle_path);
 
 /*
  * return the absolute path of 'dentry' residing in mount 'mnt'.
index 54bbf6e04ee8f3bbd652c30254ef6553e38866c4..0e9e2bc0ee96e66d08beb37bb71933ab2ece7636 100644 (file)
@@ -40,6 +40,9 @@
 #ifndef node_to_cpumask
 #define node_to_cpumask(node)  ((void)node, cpu_online_map)
 #endif
+#ifndef cpumask_of_node
+#define cpumask_of_node(node)  ((void)node, cpu_online_mask)
+#endif
 #ifndef node_to_first_cpu
 #define node_to_first_cpu(node)        ((void)(node),0)
 #endif
                                )
 #endif
 
+#ifndef cpumask_of_pcibus
+#define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ?            \
+                                cpu_all_mask :                         \
+                                cpumask_of_node(pcibus_to_node(bus)))
+#endif
+
 #endif /* CONFIG_NUMA */
 
-/* returns pointer to cpumask for specified node */
+/*
+ * returns pointer to cpumask for specified node
+ * Deprecated: use "const struct cpumask *mask = cpumask_of_node(node)"
+ */
 #ifndef node_to_cpumask_ptr
 
 #define        node_to_cpumask_ptr(v, node)                                    \
index 80744606bad172b57d2ab52dc01bb0bf33af293e..eba835a2c2cd5a39c295e700015a9497d69151f9 100644 (file)
 #define MCOUNT_REC()
 #endif
 
+#ifdef CONFIG_TRACE_BRANCH_PROFILING
+#define LIKELY_PROFILE()       VMLINUX_SYMBOL(__start_annotated_branch_profile) = .; \
+                               *(_ftrace_annotated_branch)                           \
+                               VMLINUX_SYMBOL(__stop_annotated_branch_profile) = .;
+#else
+#define LIKELY_PROFILE()
+#endif
+
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+#define BRANCH_PROFILE()       VMLINUX_SYMBOL(__start_branch_profile) = .;   \
+                               *(_ftrace_branch)                             \
+                               VMLINUX_SYMBOL(__stop_branch_profile) = .;
+#else
+#define BRANCH_PROFILE()
+#endif
+
 /* .data section */
 #define DATA_DATA                                                      \
        *(.data)                                                        \
        VMLINUX_SYMBOL(__start___markers) = .;                          \
        *(__markers)                                                    \
        VMLINUX_SYMBOL(__stop___markers) = .;                           \
+       . = ALIGN(32);                                                  \
        VMLINUX_SYMBOL(__start___tracepoints) = .;                      \
        *(__tracepoints)                                                \
-       VMLINUX_SYMBOL(__stop___tracepoints) = .;
+       VMLINUX_SYMBOL(__stop___tracepoints) = .;                       \
+       LIKELY_PROFILE()                                                \
+       BRANCH_PROFILE()
 
 #define RO_DATA(align)                                                 \
        . = ALIGN((align));                                             \
index c5dd66916692ad1723fa29bd34089165afef582c..b96a6d2ffbc3f16161cb7fbc8e85de866dfeaaf3 100644 (file)
@@ -63,8 +63,6 @@ extern volatile int cpu_2_physid[NR_CPUS];
 #define raw_smp_processor_id() (current_thread_info()->cpu)
 
 extern cpumask_t cpu_callout_map;
-extern cpumask_t cpu_possible_map;
-extern cpumask_t cpu_present_map;
 
 static __inline__ int hard_smp_processor_id(void)
 {
index 70a57c8c002b59e0aa73f9c2c1d443e31f3297b2..c980f5ba8de7dc01a2c34e7f64e42b84d0b6a744 100644 (file)
@@ -23,7 +23,7 @@
  */
 
 #if defined(CONFIG_FRAME_POINTER) || \
-       !defined(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER)
+       !defined(CONFIG_SCHED_OMIT_FRAME_POINTER)
 #define M32R_PUSH_FP " push fp\n"
 #define M32R_POP_FP  " pop  fp\n"
 #else
index e531783e5d78db542fb4e86c602698bf5824d72b..95ac82340c3bc0914ad08e02bc337a17b0416e39 100644 (file)
@@ -313,6 +313,7 @@ unifdef-y += ptrace.h
 unifdef-y += qnx4_fs.h
 unifdef-y += quota.h
 unifdef-y += random.h
+unifdef-y += irqnr.h
 unifdef-y += reboot.h
 unifdef-y += reiserfs_fs.h
 unifdef-y += reiserfs_xattr.h
index bdf505d33e77c1ded7d9008b3f0bf6e3421d9890..1dba3493d520b56e4465aa59c37811b784fb80a8 100644 (file)
@@ -160,7 +160,6 @@ struct blk_trace {
 
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
-extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
 extern int do_blk_trace_setup(struct request_queue *q,
        char *name, dev_t dev, struct blk_user_trace_setup *buts);
 extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
@@ -186,168 +185,8 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
        } while (0)
 #define BLK_TN_MAX_MSG         128
 
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:         queue the io is for
- * @rq:                the source request
- * @what:      the action
- *
- * Description:
- *     Records an action against a request. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-                                   u32 what)
-{
-       struct blk_trace *bt = q->blk_trace;
-       int rw = rq->cmd_flags & 0x03;
-
-       if (likely(!bt))
-               return;
-
-       if (blk_discard_rq(rq))
-               rw |= (1 << BIO_RW_DISCARD);
-
-       if (blk_pc_request(rq)) {
-               what |= BLK_TC_ACT(BLK_TC_PC);
-               __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
-       } else  {
-               what |= BLK_TC_ACT(BLK_TC_FS);
-               __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
-       }
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q:         queue the io is for
- * @bio:       the source bio
- * @what:      the action
- *
- * Description:
- *     Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-                                    u32 what)
-{
-       struct blk_trace *bt = q->blk_trace;
-
-       if (likely(!bt))
-               return;
-
-       __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-/**
- * blk_add_trace_generic - Add a trace for a generic action
- * @q:         queue the io is for
- * @bio:       the source bio
- * @rw:                the data direction
- * @what:      the action
- *
- * Description:
- *     Records a simple trace
- *
- **/
-static inline void blk_add_trace_generic(struct request_queue *q,
-                                        struct bio *bio, int rw, u32 what)
-{
-       struct blk_trace *bt = q->blk_trace;
-
-       if (likely(!bt))
-               return;
-
-       if (bio)
-               blk_add_trace_bio(q, bio, what);
-       else
-               __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
-}
-
-/**
- * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
- * @q:         queue the io is for
- * @what:      the action
- * @bio:       the source bio
- * @pdu:       the integer payload
- *
- * Description:
- *     Adds a trace with some integer payload. This might be an unplug
- *     option given as the action, with the depth at unplug time given
- *     as the payload
- *
- **/
-static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
-                                        struct bio *bio, unsigned int pdu)
-{
-       struct blk_trace *bt = q->blk_trace;
-       __be64 rpdu = cpu_to_be64(pdu);
-
-       if (likely(!bt))
-               return;
-
-       if (bio)
-               __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
-       else
-               __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q:         queue the io is for
- * @bio:       the source bio
- * @dev:       target device
- * @from:      source sector
- * @to:                target sector
- *
- * Description:
- *     Device mapper or raid target sometimes need to split a bio because
- *     it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-                                      dev_t dev, sector_t from, sector_t to)
-{
-       struct blk_trace *bt = q->blk_trace;
-       struct blk_io_trace_remap r;
-
-       if (likely(!bt))
-               return;
-
-       r.device = cpu_to_be32(dev);
-       r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-       r.sector = cpu_to_be64(to);
-
-       __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
-
-/**
- * blk_add_driver_data - Add binary message with driver-specific data
- * @q:         queue the io is for
- * @rq:                io request
- * @data:      driver-specific data
- * @len:       length of driver-specific data
- *
- * Description:
- *     Some drivers might want to write driver-specific data per request.
- *
- **/
-static inline void blk_add_driver_data(struct request_queue *q,
-                                      struct request *rq,
-                                      void *data, size_t len)
-{
-       struct blk_trace *bt = q->blk_trace;
-
-       if (likely(!bt))
-               return;
-
-       if (blk_pc_request(rq))
-               __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
-                               rq->errors, len, data);
-       else
-               __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
-                               0, BLK_TA_DRV_DATA, rq->errors, len, data);
-}
-
+extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
+                               void *data, size_t len);
 extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
                           char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
@@ -356,13 +195,8 @@ extern int blk_trace_remove(struct request_queue *q);
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 #define blk_trace_ioctl(bdev, cmd, arg)                (-ENOTTY)
 #define blk_trace_shutdown(q)                  do { } while (0)
-#define blk_add_trace_rq(q, rq, what)          do { } while (0)
-#define blk_add_trace_bio(q, rq, what)         do { } while (0)
-#define blk_add_trace_generic(q, rq, rw, what) do { } while (0)
-#define blk_add_trace_pdu_int(q, what, bio, pdu)       do { } while (0)
-#define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0)
-#define blk_add_driver_data(q, rq, data, len)  do {} while (0)
 #define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY)
+#define blk_add_driver_data(q, rq, data, len)  do {} while (0)
 #define blk_trace_setup(q, name, dev, arg)     (-ENOTTY)
 #define blk_trace_startstop(q, start)          (-ENOTTY)
 #define blk_trace_remove(q)                    (-ENOTTY)
index ed3a5d473e52b2282c0d09c2c25d406400dc7522..cea153697ec788a3c3d61c2402a3f1188b5b06c7 100644 (file)
@@ -82,13 +82,13 @@ struct clock_event_device {
        int                     shift;
        int                     rating;
        int                     irq;
-       cpumask_t               cpumask;
+       const struct cpumask    *cpumask;
        int                     (*set_next_event)(unsigned long evt,
                                                  struct clock_event_device *);
        void                    (*set_mode)(enum clock_event_mode mode,
                                            struct clock_event_device *);
        void                    (*event_handler)(struct clock_event_device *);
-       void                    (*broadcast)(cpumask_t mask);
+       void                    (*broadcast)(const struct cpumask *mask);
        struct list_head        list;
        enum clock_event_mode   mode;
        ktime_t                 next_event;
index 98115d9d04daa6c8008b528bee3014a8cee11078..ea7c6be354b7d96b636f72e1580a85188b370db6 100644 (file)
@@ -59,8 +59,88 @@ extern void __chk_io_ptr(const volatile void __iomem *);
  * specific implementations come from the above header files
  */
 
-#define likely(x)      __builtin_expect(!!(x), 1)
-#define unlikely(x)    __builtin_expect(!!(x), 0)
+struct ftrace_branch_data {
+       const char *func;
+       const char *file;
+       unsigned line;
+       union {
+               struct {
+                       unsigned long correct;
+                       unsigned long incorrect;
+               };
+               struct {
+                       unsigned long miss;
+                       unsigned long hit;
+               };
+       };
+};
+
+/*
+ * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code
+ * to disable branch tracing on a per file basis.
+ */
+#if defined(CONFIG_TRACE_BRANCH_PROFILING) && !defined(DISABLE_BRANCH_PROFILING)
+void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
+
+#define likely_notrace(x)      __builtin_expect(!!(x), 1)
+#define unlikely_notrace(x)    __builtin_expect(!!(x), 0)
+
+#define __branch_check__(x, expect) ({                                 \
+                       int ______r;                                    \
+                       static struct ftrace_branch_data                \
+                               __attribute__((__aligned__(4)))         \
+                               __attribute__((section("_ftrace_annotated_branch"))) \
+                               ______f = {                             \
+                               .func = __func__,                       \
+                               .file = __FILE__,                       \
+                               .line = __LINE__,                       \
+                       };                                              \
+                       ______r = likely_notrace(x);                    \
+                       ftrace_likely_update(&______f, ______r, expect); \
+                       ______r;                                        \
+               })
+
+/*
+ * Using __builtin_constant_p(x) to ignore cases where the return
+ * value is always the same.  This idea is taken from a similar patch
+ * written by Daniel Walker.
+ */
+# ifndef likely
+#  define likely(x)    (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
+# endif
+# ifndef unlikely
+#  define unlikely(x)  (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
+# endif
+
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+/*
+ * "Define 'is'", Bill Clinton
+ * "Define 'if'", Steven Rostedt
+ */
+#define if(cond) if (__builtin_constant_p((cond)) ? !!(cond) :         \
+       ({                                                              \
+               int ______r;                                            \
+               static struct ftrace_branch_data                        \
+                       __attribute__((__aligned__(4)))                 \
+                       __attribute__((section("_ftrace_branch")))      \
+                       ______f = {                                     \
+                               .func = __func__,                       \
+                               .file = __FILE__,                       \
+                               .line = __LINE__,                       \
+                       };                                              \
+               ______r = !!(cond);                                     \
+               if (______r)                                            \
+                       ______f.hit++;                                  \
+               else                                                    \
+                       ______f.miss++;                                 \
+               ______r;                                                \
+       }))
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+
+#else
+# define likely(x)     __builtin_expect(!!(x), 1)
+# define unlikely(x)   __builtin_expect(!!(x), 0)
+#endif
 
 /* Optimization barrier */
 #ifndef barrier
index 21e1dd43e52aff50afe75a05d067492407ced320..d4bf52603e6bfbdab95eb81b65bad0dce510356c 100644 (file)
@@ -339,36 +339,6 @@ extern cpumask_t cpu_mask_all;
 #endif
 #define        CPUMASK_PTR(v, m)       cpumask_t *v = &(m->v)
 
-#define cpumask_scnprintf(buf, len, src) \
-                       __cpumask_scnprintf((buf), (len), &(src), NR_CPUS)
-static inline int __cpumask_scnprintf(char *buf, int len,
-                                       const cpumask_t *srcp, int nbits)
-{
-       return bitmap_scnprintf(buf, len, srcp->bits, nbits);
-}
-
-#define cpumask_parse_user(ubuf, ulen, dst) \
-                       __cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS)
-static inline int __cpumask_parse_user(const char __user *buf, int len,
-                                       cpumask_t *dstp, int nbits)
-{
-       return bitmap_parse_user(buf, len, dstp->bits, nbits);
-}
-
-#define cpulist_scnprintf(buf, len, src) \
-                       __cpulist_scnprintf((buf), (len), &(src), NR_CPUS)
-static inline int __cpulist_scnprintf(char *buf, int len,
-                                       const cpumask_t *srcp, int nbits)
-{
-       return bitmap_scnlistprintf(buf, len, srcp->bits, nbits);
-}
-
-#define cpulist_parse(buf, dst) __cpulist_parse((buf), &(dst), NR_CPUS)
-static inline int __cpulist_parse(const char *buf, cpumask_t *dstp, int nbits)
-{
-       return bitmap_parselist(buf, dstp->bits, nbits);
-}
-
 #define cpu_remap(oldbit, old, new) \
                __cpu_remap((oldbit), &(old), &(new), NR_CPUS)
 static inline int __cpu_remap(int oldbit,
@@ -540,9 +510,6 @@ extern cpumask_t cpu_active_map;
        [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \
 }
 
-/* This produces more efficient code. */
-#define nr_cpumask_bits        NR_CPUS
-
 #else /* NR_CPUS > BITS_PER_LONG */
 
 #define CPU_BITS_ALL                                           \
@@ -550,9 +517,15 @@ extern cpumask_t cpu_active_map;
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                \
        [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD         \
 }
+#endif /* NR_CPUS > BITS_PER_LONG */
 
+#ifdef CONFIG_CPUMASK_OFFSTACK
+/* Assuming NR_CPUS is huge, a runtime limit is more efficient.  Also,
+ * not all bits may be allocated. */
 #define nr_cpumask_bits        nr_cpu_ids
-#endif /* NR_CPUS > BITS_PER_LONG */
+#else
+#define nr_cpumask_bits        NR_CPUS
+#endif
 
 /* verify cpu argument to cpumask_* operators */
 static inline unsigned int cpumask_check(unsigned int cpu)
@@ -945,6 +918,63 @@ static inline void cpumask_copy(struct cpumask *dstp,
  */
 #define cpumask_of(cpu) (get_cpu_mask(cpu))
 
+/**
+ * cpumask_scnprintf - print a cpumask into a string as comma-separated hex
+ * @buf: the buffer to sprintf into
+ * @len: the length of the buffer
+ * @srcp: the cpumask to print
+ *
+ * If len is zero, returns zero.  Otherwise returns the length of the
+ * (nul-terminated) @buf string.
+ */
+static inline int cpumask_scnprintf(char *buf, int len,
+                                   const struct cpumask *srcp)
+{
+       return bitmap_scnprintf(buf, len, srcp->bits, nr_cpumask_bits);
+}
+
+/**
+ * cpumask_parse_user - extract a cpumask from a user string
+ * @buf: the buffer to extract from
+ * @len: the length of the buffer
+ * @dstp: the cpumask to set.
+ *
+ * Returns -errno, or 0 for success.
+ */
+static inline int cpumask_parse_user(const char __user *buf, int len,
+                                    struct cpumask *dstp)
+{
+       return bitmap_parse_user(buf, len, dstp->bits, nr_cpumask_bits);
+}
+
+/**
+ * cpulist_scnprintf - print a cpumask into a string as comma-separated list
+ * @buf: the buffer to sprintf into
+ * @len: the length of the buffer
+ * @srcp: the cpumask to print
+ *
+ * If len is zero, returns zero.  Otherwise returns the length of the
+ * (nul-terminated) @buf string.
+ */
+static inline int cpulist_scnprintf(char *buf, int len,
+                                   const struct cpumask *srcp)
+{
+       return bitmap_scnlistprintf(buf, len, srcp->bits, nr_cpumask_bits);
+}
+
+/**
+ * cpulist_parse_user - extract a cpumask from a user string of ranges
+ * @buf: the buffer to extract from
+ * @len: the length of the buffer
+ * @dstp: the cpumask to set.
+ *
+ * Returns -errno, or 0 for success.
+ */
+static inline int cpulist_parse(const char *buf, struct cpumask *dstp)
+{
+       return bitmap_parselist(buf, dstp->bits, nr_cpumask_bits);
+}
+
 /**
  * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
  * @bitmap: the bitmap
index 4aaa4afb1cb99f849bb345a8189153e65e94cbcd..096476f1fb356a2c17ab5d267be9a2295f55295a 100644 (file)
@@ -17,7 +17,7 @@ extern int debug_locks_off(void);
 ({                                                                     \
        int __ret = 0;                                                  \
                                                                        \
-       if (unlikely(c)) {                                              \
+       if (!oops_in_progress && unlikely(c)) {                         \
                if (debug_locks_off() && !debug_locks_silent)           \
                        WARN_ON(1);                                     \
                __ret = 1;                                              \
index 9c5bc6be2b091274b37b14046f03ebf3db70c194..985b28dc2ba9b3d02bbc8be85597c86cb0b21d05 100644 (file)
@@ -8,6 +8,8 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
+#include <linux/bitops.h>
+#include <linux/sched.h>
 
 #ifdef CONFIG_FUNCTION_TRACER
 
@@ -24,6 +26,45 @@ struct ftrace_ops {
        struct ftrace_ops *next;
 };
 
+extern int function_trace_stop;
+
+/*
+ * Type of the current tracing.
+ */
+enum ftrace_tracing_type_t {
+       FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */
+       FTRACE_TYPE_RETURN,     /* Hook the return of the function */
+};
+
+/* Current tracing type, default is FTRACE_TYPE_ENTER */
+extern enum ftrace_tracing_type_t ftrace_tracing_type;
+
+/**
+ * ftrace_stop - stop function tracer.
+ *
+ * A quick way to stop the function tracer. Note this an on off switch,
+ * it is not something that is recursive like preempt_disable.
+ * This does not disable the calling of mcount, it only stops the
+ * calling of functions from mcount.
+ */
+static inline void ftrace_stop(void)
+{
+       function_trace_stop = 1;
+}
+
+/**
+ * ftrace_start - start the function tracer.
+ *
+ * This function is the inverse of ftrace_stop. This does not enable
+ * the function tracing if the function tracer is disabled. This only
+ * sets the function tracer flag to continue calling the functions
+ * from mcount.
+ */
+static inline void ftrace_start(void)
+{
+       function_trace_stop = 0;
+}
+
 /*
  * The ftrace_ops must be a static and should also
  * be read_mostly.  These functions do modify read_mostly variables
@@ -42,9 +83,13 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1);
 # define unregister_ftrace_function(ops) do { } while (0)
 # define clear_ftrace_function(ops) do { } while (0)
 static inline void ftrace_kill(void) { }
+static inline void ftrace_stop(void) { }
+static inline void ftrace_start(void) { }
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
+/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */
+#include <asm/ftrace.h>
 
 enum {
        FTRACE_FL_FREE          = (1 << 0),
@@ -60,6 +105,7 @@ struct dyn_ftrace {
        struct list_head        list;
        unsigned long           ip; /* address of mcount call-site */
        unsigned long           flags;
+       struct dyn_arch_ftrace  arch;
 };
 
 int ftrace_force_update(void);
@@ -67,19 +113,25 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset);
 
 /* defined in arch */
 extern int ftrace_ip_converted(unsigned long ip);
-extern unsigned char *ftrace_nop_replace(void);
-extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
 extern int ftrace_dyn_arch_init(void *data);
 extern int ftrace_update_ftrace_func(ftrace_func_t func);
 extern void ftrace_caller(void);
 extern void ftrace_call(void);
 extern void mcount_call(void);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+extern void ftrace_graph_caller(void);
+extern int ftrace_enable_ftrace_graph_caller(void);
+extern int ftrace_disable_ftrace_graph_caller(void);
+#else
+static inline int ftrace_enable_ftrace_graph_caller(void) { return 0; }
+static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; }
+#endif
 
 /**
- * ftrace_modify_code - modify code segment
- * @ip: the address of the code segment
- * @old_code: the contents of what is expected to be there
- * @new_code: the code to patch in
+ * ftrace_make_nop - convert code into top
+ * @mod: module structure if called by module load initialization
+ * @rec: the mcount call site record
+ * @addr: the address that the call site should be calling
  *
  * This is a very sensitive operation and great care needs
  * to be taken by the arch.  The operation should carefully
@@ -87,6 +139,8 @@ extern void mcount_call(void);
  * what we expect it to be, and then on success of the compare,
  * it should write to the location.
  *
+ * The code segment at @rec->ip should be a caller to @addr
+ *
  * Return must be:
  *  0 on success
  *  -EFAULT on error reading the location
@@ -94,8 +148,34 @@ extern void mcount_call(void);
  *  -EPERM  on error writing to the location
  * Any other value will be considered a failure.
  */
-extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code,
-                             unsigned char *new_code);
+extern int ftrace_make_nop(struct module *mod,
+                          struct dyn_ftrace *rec, unsigned long addr);
+
+/**
+ * ftrace_make_call - convert a nop call site into a call to addr
+ * @rec: the mcount call site record
+ * @addr: the address that the call site should call
+ *
+ * This is a very sensitive operation and great care needs
+ * to be taken by the arch.  The operation should carefully
+ * read the location, check to see if what is read is indeed
+ * what we expect it to be, and then on success of the compare,
+ * it should write to the location.
+ *
+ * The code segment at @rec->ip should be a nop
+ *
+ * Return must be:
+ *  0 on success
+ *  -EFAULT on error reading the location
+ *  -EINVAL on a failed compare of the contents
+ *  -EPERM  on error writing to the location
+ * Any other value will be considered a failure.
+ */
+extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
+
+
+/* May be defined in arch */
+extern int ftrace_arch_read_dyn_info(char *buf, int size);
 
 extern int skip_trace(unsigned long ip);
 
@@ -103,7 +183,6 @@ extern void ftrace_release(void *start, unsigned long size);
 
 extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
-
 #else
 # define skip_trace(ip)                                ({ 0; })
 # define ftrace_force_update()                 ({ 0; })
@@ -182,6 +261,12 @@ static inline void __ftrace_enabled_restore(int enabled)
 #endif
 
 #ifdef CONFIG_TRACING
+extern int ftrace_dump_on_oops;
+
+extern void tracing_start(void);
+extern void tracing_stop(void);
+extern void ftrace_off_permanent(void);
+
 extern void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
 
@@ -212,6 +297,9 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 static inline int
 ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)));
 
+static inline void tracing_start(void) { }
+static inline void tracing_stop(void) { }
+static inline void ftrace_off_permanent(void) { }
 static inline int
 ftrace_printk(const char *fmt, ...)
 {
@@ -222,33 +310,167 @@ static inline void ftrace_dump(void) { }
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
-extern void ftrace_init_module(unsigned long *start, unsigned long *end);
+extern void ftrace_init_module(struct module *mod,
+                              unsigned long *start, unsigned long *end);
 #else
 static inline void ftrace_init(void) { }
 static inline void
-ftrace_init_module(unsigned long *start, unsigned long *end) { }
+ftrace_init_module(struct module *mod,
+                  unsigned long *start, unsigned long *end) { }
 #endif
 
+enum {
+       POWER_NONE = 0,
+       POWER_CSTATE = 1,
+       POWER_PSTATE = 2,
+};
+
+struct power_trace {
+#ifdef CONFIG_POWER_TRACER
+       ktime_t                 stamp;
+       ktime_t                 end;
+       int                     type;
+       int                     state;
+#endif
+};
+
+#ifdef CONFIG_POWER_TRACER
+extern void trace_power_start(struct power_trace *it, unsigned int type,
+                                       unsigned int state);
+extern void trace_power_mark(struct power_trace *it, unsigned int type,
+                                       unsigned int state);
+extern void trace_power_end(struct power_trace *it);
+#else
+static inline void trace_power_start(struct power_trace *it, unsigned int type,
+                                       unsigned int state) { }
+static inline void trace_power_mark(struct power_trace *it, unsigned int type,
+                                       unsigned int state) { }
+static inline void trace_power_end(struct power_trace *it) { }
+#endif
+
+
+/*
+ * Structure that defines an entry function trace.
+ */
+struct ftrace_graph_ent {
+       unsigned long func; /* Current function */
+       int depth;
+};
 
-struct boot_trace {
-       pid_t                   caller;
-       char                    func[KSYM_SYMBOL_LEN];
-       int                     result;
-       unsigned long long      duration;               /* usecs */
-       ktime_t                 calltime;
-       ktime_t                 rettime;
+/*
+ * Structure that defines a return function trace.
+ */
+struct ftrace_graph_ret {
+       unsigned long func; /* Current function */
+       unsigned long long calltime;
+       unsigned long long rettime;
+       /* Number of functions that overran the depth limit for current task */
+       unsigned long overrun;
+       int depth;
 };
 
-#ifdef CONFIG_BOOT_TRACER
-extern void trace_boot(struct boot_trace *it, initcall_t fn);
-extern void start_boot_trace(void);
-extern void stop_boot_trace(void);
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+/*
+ * Sometimes we don't want to trace a function with the function
+ * graph tracer but we want them to keep traced by the usual function
+ * tracer if the function graph tracer is not configured.
+ */
+#define __notrace_funcgraph            notrace
+
+#define FTRACE_RETFUNC_DEPTH 50
+#define FTRACE_RETSTACK_ALLOC_SIZE 32
+/* Type of the callback handlers for tracing function graph*/
+typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
+typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
+
+extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+                               trace_func_graph_ent_t entryfunc);
+
+extern void ftrace_graph_stop(void);
+
+/* The current handlers in use */
+extern trace_func_graph_ret_t ftrace_graph_return;
+extern trace_func_graph_ent_t ftrace_graph_entry;
+
+extern void unregister_ftrace_graph(void);
+
+extern void ftrace_graph_init_task(struct task_struct *t);
+extern void ftrace_graph_exit_task(struct task_struct *t);
+
+static inline int task_curr_ret_stack(struct task_struct *t)
+{
+       return t->curr_ret_stack;
+}
+
+static inline void pause_graph_tracing(void)
+{
+       atomic_inc(&current->tracing_graph_pause);
+}
+
+static inline void unpause_graph_tracing(void)
+{
+       atomic_dec(&current->tracing_graph_pause);
+}
 #else
-static inline void trace_boot(struct boot_trace *it, initcall_t fn) { }
-static inline void start_boot_trace(void) { }
-static inline void stop_boot_trace(void) { }
+
+#define __notrace_funcgraph
+
+static inline void ftrace_graph_init_task(struct task_struct *t) { }
+static inline void ftrace_graph_exit_task(struct task_struct *t) { }
+
+static inline int task_curr_ret_stack(struct task_struct *tsk)
+{
+       return -1;
+}
+
+static inline void pause_graph_tracing(void) { }
+static inline void unpause_graph_tracing(void) { }
 #endif
 
+#ifdef CONFIG_TRACING
+#include <linux/sched.h>
+
+/* flags for current->trace */
+enum {
+       TSK_TRACE_FL_TRACE_BIT  = 0,
+       TSK_TRACE_FL_GRAPH_BIT  = 1,
+};
+enum {
+       TSK_TRACE_FL_TRACE      = 1 << TSK_TRACE_FL_TRACE_BIT,
+       TSK_TRACE_FL_GRAPH      = 1 << TSK_TRACE_FL_GRAPH_BIT,
+};
+
+static inline void set_tsk_trace_trace(struct task_struct *tsk)
+{
+       set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
+}
+
+static inline void clear_tsk_trace_trace(struct task_struct *tsk)
+{
+       clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
+}
+
+static inline int test_tsk_trace_trace(struct task_struct *tsk)
+{
+       return tsk->trace & TSK_TRACE_FL_TRACE;
+}
+
+static inline void set_tsk_trace_graph(struct task_struct *tsk)
+{
+       set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
+}
+
+static inline void clear_tsk_trace_graph(struct task_struct *tsk)
+{
+       clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
+}
+
+static inline int test_tsk_trace_graph(struct task_struct *tsk)
+{
+       return tsk->trace & TSK_TRACE_FL_GRAPH;
+}
 
+#endif /* CONFIG_TRACING */
 
 #endif /* _LINUX_FTRACE_H */
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
new file mode 100644 (file)
index 0000000..366a054
--- /dev/null
@@ -0,0 +1,13 @@
+#ifndef _LINUX_FTRACE_IRQ_H
+#define _LINUX_FTRACE_IRQ_H
+
+
+#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
+extern void ftrace_nmi_enter(void);
+extern void ftrace_nmi_exit(void);
+#else
+static inline void ftrace_nmi_enter(void) { }
+static inline void ftrace_nmi_exit(void) { }
+#endif
+
+#endif /* _LINUX_FTRACE_IRQ_H */
index 586ab56a3ec3500e601e786e0e858f1980fbc8e5..8f627b9ae2b15f1a6b5e383028b6adbce0034f0e 100644 (file)
@@ -164,6 +164,8 @@ union futex_key {
        } both;
 };
 
+#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
+
 #ifdef CONFIG_FUTEX
 extern void exit_robust_list(struct task_struct *curr);
 extern void exit_pi_state_list(struct task_struct *curr);
index 181006cc94a03ecd29630d80d91762589fc1c316..89a56d79e4c6c4987531a10ad8fed17f3d597bf7 100644 (file)
@@ -4,6 +4,7 @@
 #include <linux/preempt.h>
 #include <linux/smp_lock.h>
 #include <linux/lockdep.h>
+#include <linux/ftrace_irq.h>
 #include <asm/hardirq.h>
 #include <asm/system.h>
 
@@ -161,7 +162,17 @@ extern void irq_enter(void);
  */
 extern void irq_exit(void);
 
-#define nmi_enter()            do { lockdep_off(); __irq_enter(); } while (0)
-#define nmi_exit()             do { __irq_exit(); lockdep_on(); } while (0)
+#define nmi_enter()                            \
+       do {                                    \
+               ftrace_nmi_enter();             \
+               lockdep_off();                  \
+               __irq_enter();                  \
+       } while (0)
+#define nmi_exit()                             \
+       do {                                    \
+               __irq_exit();                   \
+               lockdep_on();                   \
+               ftrace_nmi_exit();              \
+       } while (0)
 
 #endif /* LINUX_HARDIRQ_H */
index f58a0cf8929a81fb14025ab8683ab32bf8ab539c..7e85a6e89e417f551d8e966fa59827dab19f3d28 100644 (file)
@@ -14,6 +14,8 @@
 #include <linux/irqflags.h>
 #include <linux/smp.h>
 #include <linux/percpu.h>
+#include <linux/irqnr.h>
+
 #include <asm/atomic.h>
 #include <asm/ptrace.h>
 #include <asm/system.h>
@@ -109,13 +111,13 @@ extern void enable_irq(unsigned int irq);
 
 extern cpumask_t irq_default_affinity;
 
-extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask);
+extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 extern int irq_can_set_affinity(unsigned int irq);
 extern int irq_select_affinity(unsigned int irq);
 
 #else /* CONFIG_SMP */
 
-static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
 {
        return -EINVAL;
 }
index 3dddfa703ebd95ab1253c56397104c2607f98e90..fde5e613201829ace74f2ba384171143906e4e85 100644 (file)
@@ -113,7 +113,8 @@ struct irq_chip {
        void            (*eoi)(unsigned int irq);
 
        void            (*end)(unsigned int irq);
-       void            (*set_affinity)(unsigned int irq, cpumask_t dest);
+       void            (*set_affinity)(unsigned int irq,
+                                       const struct cpumask *dest);
        int             (*retrigger)(unsigned int irq);
        int             (*set_type)(unsigned int irq, unsigned int flow_type);
        int             (*set_wake)(unsigned int irq, unsigned int on);
@@ -129,6 +130,8 @@ struct irq_chip {
        const char      *typename;
 };
 
+struct timer_rand_state;
+struct irq_2_iommu;
 /**
  * struct irq_desc - interrupt descriptor
  * @irq:               interrupt number for this descriptor
@@ -154,6 +157,13 @@ struct irq_chip {
  */
 struct irq_desc {
        unsigned int            irq;
+#ifdef CONFIG_SPARSE_IRQ
+       struct timer_rand_state *timer_rand_state;
+       unsigned int            *kstat_irqs;
+# ifdef CONFIG_INTR_REMAP
+       struct irq_2_iommu      *irq_2_iommu;
+# endif
+#endif
        irq_flow_handler_t      handle_irq;
        struct irq_chip         *chip;
        struct msi_desc         *msi_desc;
@@ -181,12 +191,51 @@ struct irq_desc {
        const char              *name;
 } ____cacheline_internodealigned_in_smp;
 
+extern void early_irq_init(void);
+extern void arch_early_irq_init(void);
+extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
+                                       struct irq_desc *desc, int cpu);
+extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
 
+#ifndef CONFIG_SPARSE_IRQ
 extern struct irq_desc irq_desc[NR_IRQS];
 
 static inline struct irq_desc *irq_to_desc(unsigned int irq)
 {
-       return (irq < nr_irqs) ? irq_desc + irq : NULL;
+       return (irq < NR_IRQS) ? irq_desc + irq : NULL;
+}
+static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+       return irq_to_desc(irq);
+}
+
+#else
+
+extern struct irq_desc *irq_to_desc(unsigned int irq);
+extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+
+# define for_each_irq_desc(irq, desc)          \
+       for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; irq++, desc = irq_to_desc(irq))
+# define for_each_irq_desc_reverse(irq, desc)                          \
+       for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; irq--, desc = irq_to_desc(irq))
+
+#define kstat_irqs_this_cpu(DESC) \
+       ((DESC)->kstat_irqs[smp_processor_id()])
+#define kstat_incr_irqs_this_cpu(irqno, DESC) \
+       ((DESC)->kstat_irqs[smp_processor_id()]++)
+
+#endif
+
+static inline struct irq_desc *
+irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
+{
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+       return irq_to_desc(irq);
+#else
+       return desc;
+#endif
 }
 
 /*
@@ -380,6 +429,11 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 #define get_irq_data(irq)      (irq_to_desc(irq)->handler_data)
 #define get_irq_msi(irq)       (irq_to_desc(irq)->msi_desc)
 
+#define get_irq_desc_chip(desc)                ((desc)->chip)
+#define get_irq_desc_chip_data(desc)   ((desc)->chip_data)
+#define get_irq_desc_data(desc)                ((desc)->handler_data)
+#define get_irq_desc_msi(desc)         ((desc)->msi_desc)
+
 #endif /* CONFIG_GENERIC_HARDIRQS */
 
 #endif /* !CONFIG_S390 */
index 452c280c8115525c2c294bebb3104bd6e06645a9..95d2b74641f5dbe2ecb90bb68d430ab600d61b74 100644 (file)
@@ -1,24 +1,38 @@
 #ifndef _LINUX_IRQNR_H
 #define _LINUX_IRQNR_H
 
+/*
+ * Generic irq_desc iterators:
+ */
+#ifdef __KERNEL__
+
 #ifndef CONFIG_GENERIC_HARDIRQS
 #include <asm/irq.h>
 # define nr_irqs               NR_IRQS
 
 # define for_each_irq_desc(irq, desc)          \
        for (irq = 0; irq < nr_irqs; irq++)
+
+# define for_each_irq_desc_reverse(irq, desc)                          \
+       for (irq = nr_irqs - 1; irq >= 0; irq--)
 #else
+
 extern int nr_irqs;
 
+#ifndef CONFIG_SPARSE_IRQ
+
+struct irq_desc;
 # define for_each_irq_desc(irq, desc)          \
        for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
-
-# define for_each_irq_desc_reverse(irq, desc)                          \
-       for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
-            irq >= 0; irq--, desc--)
+# define for_each_irq_desc_reverse(irq, desc)                          \
+       for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1);        \
+           irq >= 0; irq--, desc--)
+#endif
 #endif
 
-#define for_each_irq_nr(irq)                   \
-       for (irq = 0; irq < nr_irqs; irq++)
+#define for_each_irq_nr(irq)                   \
+       for (irq = 0; irq < nr_irqs; irq++)
+
+#endif /* __KERNEL__ */
 
 #endif
index dc7e0d0a6474448aba71b4c32d2045afc44e240e..269df5a17b30af1b7349c131da05abec8aa95046 100644 (file)
@@ -141,6 +141,15 @@ extern int _cond_resched(void);
                (__x < 0) ? -__x : __x;         \
        })
 
+#ifdef CONFIG_PROVE_LOCKING
+void might_fault(void);
+#else
+static inline void might_fault(void)
+{
+       might_sleep();
+}
+#endif
+
 extern struct atomic_notifier_head panic_notifier_list;
 extern long (*panic_blink)(long time);
 NORET_TYPE void panic(const char * fmt, ...)
@@ -188,6 +197,8 @@ extern unsigned long long memparse(const char *ptr, char **retptr);
 extern int core_kernel_text(unsigned long addr);
 extern int __kernel_text_address(unsigned long addr);
 extern int kernel_text_address(unsigned long addr);
+extern int func_ptr_is_kernel_text(void *ptr);
+
 struct pid;
 extern struct pid *session_of_pgrp(struct pid *pgrp);
 
index 4a145caeee075d3209fa4e0d324dfed7f9a15fe7..4ee4b3d2316ffc78495e7bfa0dd3017f220c05c9 100644 (file)
@@ -28,7 +28,9 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
        struct cpu_usage_stat   cpustat;
-       unsigned int irqs[NR_IRQS];
+#ifndef CONFIG_SPARSE_IRQ
+       unsigned int irqs[NR_IRQS];
+#endif
 };
 
 DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -39,6 +41,10 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
 
 extern unsigned long long nr_context_switches(void);
 
+#ifndef CONFIG_SPARSE_IRQ
+#define kstat_irqs_this_cpu(irq) \
+       (kstat_this_cpu.irqs[irq])
+
 struct irq_desc;
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
@@ -46,11 +52,17 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
 {
        kstat_this_cpu.irqs[irq]++;
 }
+#endif
+
 
+#ifndef CONFIG_SPARSE_IRQ
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
 }
+#else
+extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+#endif
 
 /*
  * Number of interrupts per specific IRQ source, since bootup
index 17f76fc0517377094cf318a544eb55b2e8eaeb56..adc34f2c6eff9eb9dd3eb5e159cf730f5baba457 100644 (file)
@@ -100,6 +100,10 @@ struct kimage {
 #define KEXEC_TYPE_DEFAULT 0
 #define KEXEC_TYPE_CRASH   1
        unsigned int preserve_context : 1;
+
+#ifdef ARCH_HAS_KIMAGE_ARCH
+       struct kimage_arch arch;
+#endif
 };
 
 
index 29aec6e100203da5e8b0f40ab726360ca08b8e72..8956daf64abda85e35e28017753cfc9b21026e40 100644 (file)
@@ -73,6 +73,8 @@ struct lock_class_key {
        struct lockdep_subclass_key     subkeys[MAX_LOCKDEP_SUBCLASSES];
 };
 
+#define LOCKSTAT_POINTS                4
+
 /*
  * The lock-class itself:
  */
@@ -119,7 +121,8 @@ struct lock_class {
        int                             name_version;
 
 #ifdef CONFIG_LOCK_STAT
-       unsigned long                   contention_point[4];
+       unsigned long                   contention_point[LOCKSTAT_POINTS];
+       unsigned long                   contending_point[LOCKSTAT_POINTS];
 #endif
 };
 
@@ -144,6 +147,7 @@ enum bounce_type {
 
 struct lock_class_stats {
        unsigned long                   contention_point[4];
+       unsigned long                   contending_point[4];
        struct lock_time                read_waittime;
        struct lock_time                write_waittime;
        struct lock_time                read_holdtime;
@@ -165,6 +169,7 @@ struct lockdep_map {
        const char                      *name;
 #ifdef CONFIG_LOCK_STAT
        int                             cpu;
+       unsigned long                   ip;
 #endif
 };
 
@@ -356,7 +361,7 @@ struct lock_class_key { };
 #ifdef CONFIG_LOCK_STAT
 
 extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
-extern void lock_acquired(struct lockdep_map *lock);
+extern void lock_acquired(struct lockdep_map *lock, unsigned long ip);
 
 #define LOCK_CONTENDED(_lock, try, lock)                       \
 do {                                                           \
@@ -364,13 +369,13 @@ do {                                                              \
                lock_contended(&(_lock)->dep_map, _RET_IP_);    \
                lock(_lock);                                    \
        }                                                       \
-       lock_acquired(&(_lock)->dep_map);                       \
+       lock_acquired(&(_lock)->dep_map, _RET_IP_);                     \
 } while (0)
 
 #else /* CONFIG_LOCK_STAT */
 
 #define lock_contended(lockdep_map, ip) do {} while (0)
-#define lock_acquired(lockdep_map) do {} while (0)
+#define lock_acquired(lockdep_map, ip) do {} while (0)
 
 #define LOCK_CONTENDED(_lock, try, lock) \
        lock(_lock)
@@ -481,4 +486,22 @@ static inline void print_irqtrace_events(struct task_struct *curr)
 # define lock_map_release(l)                   do { } while (0)
 #endif
 
+#ifdef CONFIG_PROVE_LOCKING
+# define might_lock(lock)                                              \
+do {                                                                   \
+       typecheck(struct lockdep_map *, &(lock)->dep_map);              \
+       lock_acquire(&(lock)->dep_map, 0, 0, 0, 2, NULL, _THIS_IP_);    \
+       lock_release(&(lock)->dep_map, 0, _THIS_IP_);                   \
+} while (0)
+# define might_lock_read(lock)                                                 \
+do {                                                                   \
+       typecheck(struct lockdep_map *, &(lock)->dep_map);              \
+       lock_acquire(&(lock)->dep_map, 0, 0, 1, 2, NULL, _THIS_IP_);    \
+       lock_release(&(lock)->dep_map, 0, _THIS_IP_);                   \
+} while (0)
+#else
+# define might_lock(lock) do { } while (0)
+# define might_lock_read(lock) do { } while (0)
+#endif
+
 #endif /* __LINUX_LOCKDEP_H */
index 889196c7fbb1e77cc5b4561e2b0b7f937b864434..b85e74ca782ff1fb026732717ffa8db5d688a1d9 100644 (file)
@@ -12,6 +12,7 @@
  * See the file COPYING for more details.
  */
 
+#include <stdarg.h>
 #include <linux/types.h>
 
 struct module;
@@ -48,10 +49,28 @@ struct marker {
        void (*call)(const struct marker *mdata, void *call_private, ...);
        struct marker_probe_closure single;
        struct marker_probe_closure *multi;
+       const char *tp_name;    /* Optional tracepoint name */
+       void *tp_cb;            /* Optional tracepoint callback */
 } __attribute__((aligned(8)));
 
 #ifdef CONFIG_MARKERS
 
+#define _DEFINE_MARKER(name, tp_name_str, tp_cb, format)               \
+               static const char __mstrtab_##name[]                    \
+               __attribute__((section("__markers_strings")))           \
+               = #name "\0" format;                                    \
+               static struct marker __mark_##name                      \
+               __attribute__((section("__markers"), aligned(8))) =     \
+               { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],   \
+                 0, 0, marker_probe_cb, { __mark_empty_function, NULL},\
+                 NULL, tp_name_str, tp_cb }
+
+#define DEFINE_MARKER(name, format)                                    \
+               _DEFINE_MARKER(name, NULL, NULL, format)
+
+#define DEFINE_MARKER_TP(name, tp_name, tp_cb, format)                 \
+               _DEFINE_MARKER(name, #tp_name, tp_cb, format)
+
 /*
  * Note : the empty asm volatile with read constraint is used here instead of a
  * "used" attribute to fix a gcc 4.1.x bug.
@@ -65,14 +84,7 @@ struct marker {
  */
 #define __trace_mark(generic, name, call_private, format, args...)     \
        do {                                                            \
-               static const char __mstrtab_##name[]                    \
-               __attribute__((section("__markers_strings")))           \
-               = #name "\0" format;                                    \
-               static struct marker __mark_##name                      \
-               __attribute__((section("__markers"), aligned(8))) =     \
-               { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)],   \
-               0, 0, marker_probe_cb,                                  \
-               { __mark_empty_function, NULL}, NULL };                 \
+               DEFINE_MARKER(name, format);                            \
                __mark_check_format(format, ## args);                   \
                if (unlikely(__mark_##name.state)) {                    \
                        (*__mark_##name.call)                           \
@@ -80,14 +92,39 @@ struct marker {
                }                                                       \
        } while (0)
 
+#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
+       do {                                                            \
+               void __check_tp_type(void)                              \
+               {                                                       \
+                       register_trace_##tp_name(tp_cb);                \
+               }                                                       \
+               DEFINE_MARKER_TP(name, tp_name, tp_cb, format);         \
+               __mark_check_format(format, ## args);                   \
+               (*__mark_##name.call)(&__mark_##name, call_private,     \
+                                       ## args);                       \
+       } while (0)
+
 extern void marker_update_probe_range(struct marker *begin,
        struct marker *end);
+
+#define GET_MARKER(name)       (__mark_##name)
+
 #else /* !CONFIG_MARKERS */
+#define DEFINE_MARKER(name, tp_name, tp_cb, format)
 #define __trace_mark(generic, name, call_private, format, args...) \
                __mark_check_format(format, ## args)
+#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
+       do {                                                            \
+               void __check_tp_type(void)                              \
+               {                                                       \
+                       register_trace_##tp_name(tp_cb);                \
+               }                                                       \
+               __mark_check_format(format, ## args);                   \
+       } while (0)
 static inline void marker_update_probe_range(struct marker *begin,
        struct marker *end)
 { }
+#define GET_MARKER(name)
 #endif /* CONFIG_MARKERS */
 
 /**
@@ -116,6 +153,20 @@ static inline void marker_update_probe_range(struct marker *begin,
 #define _trace_mark(name, format, args...) \
        __trace_mark(1, name, NULL, format, ## args)
 
+/**
+ * trace_mark_tp - Marker in a tracepoint callback
+ * @name: marker name, not quoted.
+ * @tp_name: tracepoint name, not quoted.
+ * @tp_cb: tracepoint callback. Should have an associated global symbol so it
+ *         is not optimized away by the compiler (should not be static).
+ * @format: format string
+ * @args...: variable argument list
+ *
+ * Places a marker in a tracepoint callback.
+ */
+#define trace_mark_tp(name, tp_name, tp_cb, format, args...)   \
+       __trace_mark_tp(name, NULL, tp_name, tp_cb, format, ## args)
+
 /**
  * MARK_NOARGS - Format string for a marker with no argument.
  */
@@ -136,8 +187,6 @@ extern marker_probe_func __mark_empty_function;
 
 extern void marker_probe_cb(const struct marker *mdata,
        void *call_private, ...);
-extern void marker_probe_cb_noarg(const struct marker *mdata,
-       void *call_private, ...);
 
 /*
  * Connect a probe to a marker.
@@ -162,8 +211,10 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
 
 /*
  * marker_synchronize_unregister must be called between the last marker probe
- * unregistration and the end of module exit to make sure there is no caller
- * executing a probe when it is freed.
+ * unregistration and the first one of
+ * - the end of module exit function
+ * - the free of any resource used by the probes
+ * to ensure the code and data are valid for any possibly running probes.
  */
 #define marker_synchronize_unregister() synchronize_sched()
 
index 8f293922720735ee164cf4b5183f064d09e9f5a6..d2b8a1e8ca11000cf622717ab3302bf9a64888c3 100644 (file)
@@ -10,8 +10,11 @@ struct msi_msg {
 };
 
 /* Helper functions */
+struct irq_desc;
 extern void mask_msi_irq(unsigned int irq);
 extern void unmask_msi_irq(unsigned int irq);
+extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
+extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
 extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
 extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
 
index bc6da10ceee002ef14850e8025839e67d0d87c48..7a0e5c4f8072c53f4b0d6a7dfa681355324e065f 100644 (file)
@@ -144,6 +144,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
 /*
  * NOTE: mutex_trylock() follows the spin_trylock() convention,
  *       not the down_trylock() convention!
+ *
+ * Returns 1 if the mutex has been acquired successfully, and 0 on contention.
  */
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
index d7e98ff8021eb3bfc38e4c851b425c7eac5c097f..bb206c56d1f0eb16877a0c664712edae4de9df1d 100644 (file)
@@ -147,9 +147,9 @@ pid_t pid_vnr(struct pid *pid);
 #define do_each_pid_task(pid, type, task)                              \
        do {                                                            \
                struct hlist_node *pos___;                              \
-               if (pid != NULL)                                        \
+               if ((pid) != NULL)                                      \
                        hlist_for_each_entry_rcu((task), pos___,        \
-                               &pid->tasks[type], pids[type].node) {
+                               &(pid)->tasks[type], pids[type].node) {
 
                        /*
                         * Both old and new leaders may be attached to
index 36f125c0c6037f0270941e96e63790151a66fe34..adbf3bd3c6b3c8edea3e313a4abce0a736d02ebf 100644 (file)
@@ -8,6 +8,7 @@
 #define _LINUX_RANDOM_H
 
 #include <linux/ioctl.h>
+#include <linux/irqnr.h>
 
 /* ioctl()'s for the random number generator */
 
@@ -44,6 +45,56 @@ struct rand_pool_info {
 
 extern void rand_initialize_irq(int irq);
 
+struct timer_rand_state;
+#ifndef CONFIG_SPARSE_IRQ
+
+extern struct timer_rand_state *irq_timer_state[];
+
+static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+       if (irq >= nr_irqs)
+               return NULL;
+
+       return irq_timer_state[irq];
+}
+
+static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+       if (irq >= nr_irqs)
+               return;
+
+       irq_timer_state[irq] = state;
+}
+
+#else
+
+#include <linux/irq.h>
+static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq)
+{
+       struct irq_desc *desc;
+
+       desc = irq_to_desc(irq);
+
+       if (!desc)
+               return NULL;
+
+       return desc->timer_rand_state;
+}
+
+static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
+{
+       struct irq_desc *desc;
+
+       desc = irq_to_desc(irq);
+
+       if (!desc)
+               return;
+
+       desc->timer_rand_state = state;
+}
+#endif
+
+
 extern void add_input_randomness(unsigned int type, unsigned int code,
                                 unsigned int value);
 extern void add_interrupt_randomness(int irq);
index 5f89b62e6983192befd7cc827df50fb53cd0a173..301dda829e37499f5cdaa0950e3a11f0c4ed3b74 100644 (file)
@@ -41,7 +41,7 @@
 #include <linux/seqlock.h>
 
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-#define RCU_SECONDS_TILL_STALL_CHECK   ( 3 * HZ) /* for rcp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_CHECK   (10 * HZ) /* for rcp->jiffies_stall */
 #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
index 86f1f5e43e333766ec6a9fe5276875046c2f2526..895dc9c1088c767ce4706814b806c20248423241 100644 (file)
@@ -142,6 +142,7 @@ struct rcu_head {
  * on the write-side to insure proper synchronization.
  */
 #define rcu_read_lock_sched() preempt_disable()
+#define rcu_read_lock_sched_notrace() preempt_disable_notrace()
 
 /*
  * rcu_read_unlock_sched - marks the end of a RCU-classic critical section
@@ -149,6 +150,7 @@ struct rcu_head {
  * See rcu_read_lock_sched for more information.
  */
 #define rcu_read_unlock_sched() preempt_enable()
+#define rcu_read_unlock_sched_notrace() preempt_enable_notrace()
 
 
 
index e097c2e6b6dcaa212f26bf6b08b66e4c9021cb21..d363467c8f13e31a7c1ff4eaa92ee0dd961d67d6 100644 (file)
@@ -28,17 +28,19 @@ struct ring_buffer_event {
  *                              size = 8 bytes
  *
  * @RINGBUF_TYPE_TIME_STAMP:   Sync time stamp with external clock
- *                              array[0] = tv_nsec
- *                              array[1] = tv_sec
+ *                              array[0]    = tv_nsec
+ *                              array[1..2] = tv_sec
  *                              size = 16 bytes
  *
  * @RINGBUF_TYPE_DATA:         Data record
  *                              If len is zero:
  *                               array[0] holds the actual length
- *                               array[1..(length+3)/4-1] holds data
+ *                               array[1..(length+3)/4] holds data
+ *                               size = 4 + 4 + length (bytes)
  *                              else
  *                               length = len << 2
- *                               array[0..(length+3)/4] holds data
+ *                               array[0..(length+3)/4-1] holds data
+ *                               size = 4 + length (bytes)
  */
 enum ring_buffer_type {
        RINGBUF_TYPE_PADDING,
@@ -122,6 +124,12 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
 
 void tracing_on(void);
 void tracing_off(void);
+void tracing_off_permanent(void);
+
+void *ring_buffer_alloc_read_page(struct ring_buffer *buffer);
+void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
+int ring_buffer_read_page(struct ring_buffer *buffer,
+                         void **data_page, int cpu, int full);
 
 enum ring_buffer_flags {
        RB_FL_OVERWRITE         = 1 << 0,
index 55e30d11447790dd433d0e874285ced3a5499328..4240f6bfa812485712f8fb726299cf7f158fd910 100644 (file)
@@ -96,6 +96,7 @@ struct exec_domain;
 struct futex_pi_state;
 struct robust_list_head;
 struct bio;
+struct bts_tracer;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -249,7 +250,7 @@ extern void init_idle_bootup_task(struct task_struct *idle);
 extern int runqueue_is_locked(void);
 extern void task_rq_unlock_wait(struct task_struct *p);
 
-extern cpumask_t nohz_cpu_mask;
+extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
 extern int select_nohz_load_balancer(int cpu);
 #else
@@ -259,8 +260,6 @@ static inline int select_nohz_load_balancer(int cpu)
 }
 #endif
 
-extern unsigned long rt_needs_cpu(int cpu);
-
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
@@ -777,7 +776,6 @@ enum cpu_idle_type {
 
 struct sched_group {
        struct sched_group *next;       /* Must be a circular list */
-       cpumask_t cpumask;
 
        /*
         * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@ -790,8 +788,15 @@ struct sched_group {
         * (see include/linux/reciprocal_div.h)
         */
        u32 reciprocal_cpu_power;
+
+       unsigned long cpumask[];
 };
 
+static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+{
+       return to_cpumask(sg->cpumask);
+}
+
 enum sched_domain_level {
        SD_LV_NONE = 0,
        SD_LV_SIBLING,
@@ -815,7 +820,6 @@ struct sched_domain {
        struct sched_domain *parent;    /* top domain must be null terminated */
        struct sched_domain *child;     /* bottom domain must be null terminated */
        struct sched_group *groups;     /* the balancing groups of the domain */
-       cpumask_t span;                 /* span of all CPUs in this domain */
        unsigned long min_interval;     /* Minimum balance interval ms */
        unsigned long max_interval;     /* Maximum balance interval ms */
        unsigned int busy_factor;       /* less balancing by factor if busy */
@@ -870,9 +874,17 @@ struct sched_domain {
 #ifdef CONFIG_SCHED_DEBUG
        char *name;
 #endif
+
+       /* span of all CPUs in this domain */
+       unsigned long span[];
 };
 
-extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
+{
+       return to_cpumask(sd->span);
+}
+
+extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                                    struct sched_domain_attr *dattr_new);
 extern int arch_reinit_sched_domains(void);
 
@@ -881,7 +893,7 @@ extern int arch_reinit_sched_domains(void);
 struct sched_domain_attr;
 
 static inline void
-partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                        struct sched_domain_attr *dattr_new)
 {
 }
@@ -963,7 +975,7 @@ struct sched_class {
        void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
 
        void (*set_cpus_allowed)(struct task_struct *p,
-                                const cpumask_t *newmask);
+                                const struct cpumask *newmask);
 
        void (*rq_online)(struct rq *rq);
        void (*rq_offline)(struct rq *rq);
@@ -1165,6 +1177,18 @@ struct task_struct {
        struct list_head ptraced;
        struct list_head ptrace_entry;
 
+#ifdef CONFIG_X86_PTRACE_BTS
+       /*
+        * This is the tracer handle for the ptrace BTS extension.
+        * This field actually belongs to the ptracer task.
+        */
+       struct bts_tracer *bts;
+       /*
+        * The buffer to hold the BTS data.
+        */
+       void *bts_buffer;
+#endif /* CONFIG_X86_PTRACE_BTS */
+
        /* PID/PID hash table linkage. */
        struct pid_link pids[PIDTYPE_MAX];
        struct list_head thread_group;
@@ -1356,6 +1380,23 @@ struct task_struct {
        unsigned long default_timer_slack_ns;
 
        struct list_head        *scm_work_list;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       /* Index of current stored adress in ret_stack */
+       int curr_ret_stack;
+       /* Stack of return addresses for return function tracing */
+       struct ftrace_ret_stack *ret_stack;
+       /*
+        * Number of functions that haven't been traced
+        * because of depth overrun.
+        */
+       atomic_t trace_overrun;
+       /* Pause for the tracing */
+       atomic_t tracing_graph_pause;
+#endif
+#ifdef CONFIG_TRACING
+       /* state flags for use by tracers */
+       unsigned long trace;
+#endif
 };
 
 /*
@@ -1594,12 +1635,12 @@ extern cputime_t task_gtime(struct task_struct *p);
 
 #ifdef CONFIG_SMP
 extern int set_cpus_allowed_ptr(struct task_struct *p,
-                               const cpumask_t *new_mask);
+                               const struct cpumask *new_mask);
 #else
 static inline int set_cpus_allowed_ptr(struct task_struct *p,
-                                      const cpumask_t *new_mask)
+                                      const struct cpumask *new_mask)
 {
-       if (!cpu_isset(0, *new_mask))
+       if (!cpumask_test_cpu(0, new_mask))
                return -EINVAL;
        return 0;
 }
@@ -2212,8 +2253,8 @@ __trace_special(void *__tr, void *__data,
 }
 #endif
 
-extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
-extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
+extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
 extern int sched_mc_power_savings, sched_smt_power_savings;
 
@@ -2224,6 +2265,7 @@ extern void normalize_rt_tasks(void);
 extern struct task_group init_task_group;
 #ifdef CONFIG_USER_SCHED
 extern struct task_group root_task_group;
+extern void set_tg_uid(struct user_struct *user);
 #endif
 
 extern struct task_group *sched_create_group(struct task_group *parent);
index dc50bcc282a888ba1611a5485c53467d4a2ab953..b3dfa72f13b902bd48275ac923b551f0375d7e0f 100644 (file)
@@ -34,6 +34,7 @@ struct seq_operations {
 
 #define SEQ_SKIP 1
 
+char *mangle_path(char *s, char *p, char *esc);
 int seq_open(struct file *, const struct seq_operations *);
 ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
 loff_t seq_lseek(struct file *, loff_t, int);
index b106fd8e0d5c4298d6495e1d48cefdd81d21910d..1a8cecc4f38cd2b91a90f267a54fd39bb4de196b 100644 (file)
@@ -15,9 +15,17 @@ extern void save_stack_trace_tsk(struct task_struct *tsk,
                                struct stack_trace *trace);
 
 extern void print_stack_trace(struct stack_trace *trace, int spaces);
+
+#ifdef CONFIG_USER_STACKTRACE_SUPPORT
+extern void save_stack_trace_user(struct stack_trace *trace);
+#else
+# define save_stack_trace_user(trace)              do { } while (0)
+#endif
+
 #else
 # define save_stack_trace(trace)                       do { } while (0)
 # define save_stack_trace_tsk(tsk, trace)              do { } while (0)
+# define save_stack_trace_user(trace)                  do { } while (0)
 # define print_stack_trace(trace, spaces)              do { } while (0)
 #endif
 
index 117f1b7405cfc92ff1c342d2d92a9f35c2e9cbb5..0c5b5ac36d8edc06bb6eb68975f8223e50631c05 100644 (file)
@@ -49,7 +49,7 @@
        for_each_online_node(node)                      \
                if (nr_cpus_node(node))
 
-void arch_update_cpu_topology(void);
+int arch_update_cpu_topology(void);
 
 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE         10
index c5bb39c7a7703cdb7db04faf0e2b0ce2086aa19d..757005458366edc2ae54939f2798d35709f48602 100644 (file)
@@ -24,8 +24,12 @@ struct tracepoint {
        const char *name;               /* Tracepoint name */
        int state;                      /* State. */
        void **funcs;
-} __attribute__((aligned(8)));
-
+} __attribute__((aligned(32)));                /*
+                                        * Aligned on 32 bytes because it is
+                                        * globally visible and gcc happily
+                                        * align these on the structure size.
+                                        * Keep in sync with vmlinux.lds.h.
+                                        */
 
 #define TPPROTO(args...)       args
 #define TPARGS(args...)                args
@@ -40,14 +44,14 @@ struct tracepoint {
        do {                                                            \
                void **it_func;                                         \
                                                                        \
-               rcu_read_lock_sched();                                  \
+               rcu_read_lock_sched_notrace();                          \
                it_func = rcu_dereference((tp)->funcs);                 \
                if (it_func) {                                          \
                        do {                                            \
                                ((void(*)(proto))(*it_func))(args);     \
                        } while (*(++it_func));                         \
                }                                                       \
-               rcu_read_unlock_sched();                                \
+               rcu_read_unlock_sched_notrace();                        \
        } while (0)
 
 /*
@@ -55,35 +59,40 @@ struct tracepoint {
  * not add unwanted padding between the beginning of the section and the
  * structure. Force alignment to the same alignment as the section start.
  */
-#define DEFINE_TRACE(name, proto, args)                                        \
+#define DECLARE_TRACE(name, proto, args)                               \
+       extern struct tracepoint __tracepoint_##name;                   \
        static inline void trace_##name(proto)                          \
        {                                                               \
-               static const char __tpstrtab_##name[]                   \
-               __attribute__((section("__tracepoints_strings")))       \
-               = #name ":" #proto;                                     \
-               static struct tracepoint __tracepoint_##name            \
-               __attribute__((section("__tracepoints"), aligned(8))) = \
-               { __tpstrtab_##name, 0, NULL };                         \
                if (unlikely(__tracepoint_##name.state))                \
                        __DO_TRACE(&__tracepoint_##name,                \
                                TPPROTO(proto), TPARGS(args));          \
        }                                                               \
        static inline int register_trace_##name(void (*probe)(proto))   \
        {                                                               \
-               return tracepoint_probe_register(#name ":" #proto,      \
-                       (void *)probe);                                 \
+               return tracepoint_probe_register(#name, (void *)probe); \
        }                                                               \
-       static inline void unregister_trace_##name(void (*probe)(proto))\
+       static inline int unregister_trace_##name(void (*probe)(proto)) \
        {                                                               \
-               tracepoint_probe_unregister(#name ":" #proto,           \
-                       (void *)probe);                                 \
+               return tracepoint_probe_unregister(#name, (void *)probe);\
        }
 
+#define DEFINE_TRACE(name)                                             \
+       static const char __tpstrtab_##name[]                           \
+       __attribute__((section("__tracepoints_strings"))) = #name;      \
+       struct tracepoint __tracepoint_##name                           \
+       __attribute__((section("__tracepoints"), aligned(32))) =        \
+               { __tpstrtab_##name, 0, NULL }
+
+#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)                             \
+       EXPORT_SYMBOL_GPL(__tracepoint_##name)
+#define EXPORT_TRACEPOINT_SYMBOL(name)                                 \
+       EXPORT_SYMBOL(__tracepoint_##name)
+
 extern void tracepoint_update_probe_range(struct tracepoint *begin,
        struct tracepoint *end);
 
 #else /* !CONFIG_TRACEPOINTS */
-#define DEFINE_TRACE(name, proto, args)                        \
+#define DECLARE_TRACE(name, proto, args)                               \
        static inline void _do_trace_##name(struct tracepoint *tp, proto) \
        { }                                                             \
        static inline void trace_##name(proto)                          \
@@ -92,8 +101,14 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
        {                                                               \
                return -ENOSYS;                                         \
        }                                                               \
-       static inline void unregister_trace_##name(void (*probe)(proto))\
-       { }
+       static inline int unregister_trace_##name(void (*probe)(proto)) \
+       {                                                               \
+               return -ENOSYS;                                         \
+       }
+
+#define DEFINE_TRACE(name)
+#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
+#define EXPORT_TRACEPOINT_SYMBOL(name)
 
 static inline void tracepoint_update_probe_range(struct tracepoint *begin,
        struct tracepoint *end)
@@ -112,6 +127,10 @@ extern int tracepoint_probe_register(const char *name, void *probe);
  */
 extern int tracepoint_probe_unregister(const char *name, void *probe);
 
+extern int tracepoint_probe_register_noupdate(const char *name, void *probe);
+extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe);
+extern void tracepoint_probe_update_all(void);
+
 struct tracepoint_iter {
        struct module *module;
        struct tracepoint *tracepoint;
index 3b8121d4e36ff90aff5c8f7fa9fe3865be0baacc..eaec37c9d83d0a8dbbb87f7f46f1e0557ce00394 100644 (file)
@@ -325,7 +325,7 @@ extern struct class *tty_class;
  *     go away
  */
 
-extern inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
+static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
 {
        if (tty)
                kref_get(&tty->kref);
index fec6decfb983503ec8ae24bafa793f09b5f902b2..6b58367d145e8735eaebbf16bfec0733406389fb 100644 (file)
@@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to,
                                                        \
                set_fs(KERNEL_DS);                      \
                pagefault_disable();                    \
-               ret = __get_user(retval, (__force typeof(retval) __user *)(addr));              \
+               ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval));            \
                pagefault_enable();                     \
                set_fs(old_fs);                         \
                ret;                                    \
diff --git a/include/trace/block.h b/include/trace/block.h
new file mode 100644 (file)
index 0000000..25c6a1f
--- /dev/null
@@ -0,0 +1,76 @@
+#ifndef _TRACE_BLOCK_H
+#define _TRACE_BLOCK_H
+
+#include <linux/blkdev.h>
+#include <linux/tracepoint.h>
+
+DECLARE_TRACE(block_rq_abort,
+       TPPROTO(struct request_queue *q, struct request *rq),
+               TPARGS(q, rq));
+
+DECLARE_TRACE(block_rq_insert,
+       TPPROTO(struct request_queue *q, struct request *rq),
+               TPARGS(q, rq));
+
+DECLARE_TRACE(block_rq_issue,
+       TPPROTO(struct request_queue *q, struct request *rq),
+               TPARGS(q, rq));
+
+DECLARE_TRACE(block_rq_requeue,
+       TPPROTO(struct request_queue *q, struct request *rq),
+               TPARGS(q, rq));
+
+DECLARE_TRACE(block_rq_complete,
+       TPPROTO(struct request_queue *q, struct request *rq),
+               TPARGS(q, rq));
+
+DECLARE_TRACE(block_bio_bounce,
+       TPPROTO(struct request_queue *q, struct bio *bio),
+               TPARGS(q, bio));
+
+DECLARE_TRACE(block_bio_complete,
+       TPPROTO(struct request_queue *q, struct bio *bio),
+               TPARGS(q, bio));
+
+DECLARE_TRACE(block_bio_backmerge,
+       TPPROTO(struct request_queue *q, struct bio *bio),
+               TPARGS(q, bio));
+
+DECLARE_TRACE(block_bio_frontmerge,
+       TPPROTO(struct request_queue *q, struct bio *bio),
+               TPARGS(q, bio));
+
+DECLARE_TRACE(block_bio_queue,
+       TPPROTO(struct request_queue *q, struct bio *bio),
+               TPARGS(q, bio));
+
+DECLARE_TRACE(block_getrq,
+       TPPROTO(struct request_queue *q, struct bio *bio, int rw),
+               TPARGS(q, bio, rw));
+
+DECLARE_TRACE(block_sleeprq,
+       TPPROTO(struct request_queue *q, struct bio *bio, int rw),
+               TPARGS(q, bio, rw));
+
+DECLARE_TRACE(block_plug,
+       TPPROTO(struct request_queue *q),
+               TPARGS(q));
+
+DECLARE_TRACE(block_unplug_timer,
+       TPPROTO(struct request_queue *q),
+               TPARGS(q));
+
+DECLARE_TRACE(block_unplug_io,
+       TPPROTO(struct request_queue *q),
+               TPARGS(q));
+
+DECLARE_TRACE(block_split,
+       TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
+               TPARGS(q, bio, pdu));
+
+DECLARE_TRACE(block_remap,
+       TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev,
+               sector_t from, sector_t to),
+               TPARGS(q, bio, dev, from, to));
+
+#endif
diff --git a/include/trace/boot.h b/include/trace/boot.h
new file mode 100644 (file)
index 0000000..088ea08
--- /dev/null
@@ -0,0 +1,60 @@
+#ifndef _LINUX_TRACE_BOOT_H
+#define _LINUX_TRACE_BOOT_H
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/init.h>
+
+/*
+ * Structure which defines the trace of an initcall
+ * while it is called.
+ * You don't have to fill the func field since it is
+ * only used internally by the tracer.
+ */
+struct boot_trace_call {
+       pid_t                   caller;
+       char                    func[KSYM_SYMBOL_LEN];
+};
+
+/*
+ * Structure which defines the trace of an initcall
+ * while it returns.
+ */
+struct boot_trace_ret {
+       char                    func[KSYM_SYMBOL_LEN];
+       int                             result;
+       unsigned long long      duration;               /* nsecs */
+};
+
+#ifdef CONFIG_BOOT_TRACER
+/* Append the traces on the ring-buffer */
+extern void trace_boot_call(struct boot_trace_call *bt, initcall_t fn);
+extern void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn);
+
+/* Tells the tracer that smp_pre_initcall is finished.
+ * So we can start the tracing
+ */
+extern void start_boot_trace(void);
+
+/* Resume the tracing of other necessary events
+ * such as sched switches
+ */
+extern void enable_boot_trace(void);
+
+/* Suspend this tracing. Actually, only sched_switches tracing have
+ * to be suspended. Initcalls doesn't need it.)
+ */
+extern void disable_boot_trace(void);
+#else
+static inline
+void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { }
+
+static inline
+void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { }
+
+static inline void start_boot_trace(void) { }
+static inline void enable_boot_trace(void) { }
+static inline void disable_boot_trace(void) { }
+#endif /* CONFIG_BOOT_TRACER */
+
+#endif /* __LINUX_TRACE_BOOT_H */
index ad47369d01b5957fbc22322482c530ef28e25a33..9b2854abf7e2fcb732452cfdc03261aa5edfe478 100644 (file)
@@ -4,52 +4,52 @@
 #include <linux/sched.h>
 #include <linux/tracepoint.h>
 
-DEFINE_TRACE(sched_kthread_stop,
+DECLARE_TRACE(sched_kthread_stop,
        TPPROTO(struct task_struct *t),
                TPARGS(t));
 
-DEFINE_TRACE(sched_kthread_stop_ret,
+DECLARE_TRACE(sched_kthread_stop_ret,
        TPPROTO(int ret),
                TPARGS(ret));
 
-DEFINE_TRACE(sched_wait_task,
+DECLARE_TRACE(sched_wait_task,
        TPPROTO(struct rq *rq, struct task_struct *p),
                TPARGS(rq, p));
 
-DEFINE_TRACE(sched_wakeup,
+DECLARE_TRACE(sched_wakeup,
        TPPROTO(struct rq *rq, struct task_struct *p),
                TPARGS(rq, p));
 
-DEFINE_TRACE(sched_wakeup_new,
+DECLARE_TRACE(sched_wakeup_new,
        TPPROTO(struct rq *rq, struct task_struct *p),
                TPARGS(rq, p));
 
-DEFINE_TRACE(sched_switch,
+DECLARE_TRACE(sched_switch,
        TPPROTO(struct rq *rq, struct task_struct *prev,
                struct task_struct *next),
                TPARGS(rq, prev, next));
 
-DEFINE_TRACE(sched_migrate_task,
+DECLARE_TRACE(sched_migrate_task,
        TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu),
                TPARGS(rq, p, dest_cpu));
 
-DEFINE_TRACE(sched_process_free,
+DECLARE_TRACE(sched_process_free,
        TPPROTO(struct task_struct *p),
                TPARGS(p));
 
-DEFINE_TRACE(sched_process_exit,
+DECLARE_TRACE(sched_process_exit,
        TPPROTO(struct task_struct *p),
                TPARGS(p));
 
-DEFINE_TRACE(sched_process_wait,
+DECLARE_TRACE(sched_process_wait,
        TPPROTO(struct pid *pid),
                TPARGS(pid));
 
-DEFINE_TRACE(sched_process_fork,
+DECLARE_TRACE(sched_process_fork,
        TPPROTO(struct task_struct *parent, struct task_struct *child),
                TPARGS(parent, child));
 
-DEFINE_TRACE(sched_signal_send,
+DECLARE_TRACE(sched_signal_send,
        TPPROTO(int sig, struct task_struct *p),
                TPARGS(sig, p));
 
index f763762d544a135a0a06f67f36701b9f0d331140..b3782c6d5ede57f94446de710213a2b20deed283 100644 (file)
@@ -808,6 +808,7 @@ config TRACEPOINTS
 
 config MARKERS
        bool "Activate markers"
+       depends on TRACEPOINTS
        help
          Place an empty function call at each marker site. Can be
          dynamically changed for a probe function.
@@ -916,6 +917,15 @@ config KMOD
 
 endif # MODULES
 
+config INIT_ALL_POSSIBLE
+       bool
+       help
+         Back when each arch used to define their own cpu_online_map and
+         cpu_possible_map, some of them chose to initialize cpu_possible_map
+         with all 1s, and others with all 0s.  When they were centralised,
+         it was better to provide this option than to break all the archs
+         and have several arch maintainers persuing me down dark alleys.
+
 config STOP_MACHINE
        bool
        default y
index 7e117a231af10313f1b9bd963bf404eecaf94c9e..9d761aa53296b209243abdf005b6bf1f62ab86b9 100644 (file)
@@ -63,6 +63,7 @@
 #include <linux/signal.h>
 #include <linux/idr.h>
 #include <linux/ftrace.h>
+#include <trace/boot.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -539,6 +540,15 @@ void __init __weak thread_info_cache_init(void)
 {
 }
 
+void __init __weak arch_early_irq_init(void)
+{
+}
+
+void __init __weak early_irq_init(void)
+{
+       arch_early_irq_init();
+}
+
 asmlinkage void __init start_kernel(void)
 {
        char * command_line;
@@ -603,6 +613,8 @@ asmlinkage void __init start_kernel(void)
        sort_main_extable();
        trap_init();
        rcu_init();
+       /* init some links before init_ISA_irqs() */
+       early_irq_init();
        init_IRQ();
        pidhash_init();
        init_timers();
@@ -703,31 +715,35 @@ core_param(initcall_debug, initcall_debug, bool, 0644);
 int do_one_initcall(initcall_t fn)
 {
        int count = preempt_count();
-       ktime_t delta;
+       ktime_t calltime, delta, rettime;
        char msgbuf[64];
-       struct boot_trace it;
+       struct boot_trace_call call;
+       struct boot_trace_ret ret;
 
        if (initcall_debug) {
-               it.caller = task_pid_nr(current);
-               printk("calling  %pF @ %i\n", fn, it.caller);
-               it.calltime = ktime_get();
+               call.caller = task_pid_nr(current);
+               printk("calling  %pF @ %i\n", fn, call.caller);
+               calltime = ktime_get();
+               trace_boot_call(&call, fn);
+               enable_boot_trace();
        }
 
-       it.result = fn();
+       ret.result = fn();
 
        if (initcall_debug) {
-               it.rettime = ktime_get();
-               delta = ktime_sub(it.rettime, it.calltime);
-               it.duration = (unsigned long long) delta.tv64 >> 10;
+               disable_boot_trace();
+               rettime = ktime_get();
+               delta = ktime_sub(rettime, calltime);
+               ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10;
+               trace_boot_ret(&ret, fn);
                printk("initcall %pF returned %d after %Ld usecs\n", fn,
-                       it.result, it.duration);
-               trace_boot(&it, fn);
+                       ret.result, ret.duration);
        }
 
        msgbuf[0] = 0;
 
-       if (it.result && it.result != -ENODEV && initcall_debug)
-               sprintf(msgbuf, "error code %d ", it.result);
+       if (ret.result && ret.result != -ENODEV && initcall_debug)
+               sprintf(msgbuf, "error code %d ", ret.result);
 
        if (preempt_count() != count) {
                strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf));
@@ -741,7 +757,7 @@ int do_one_initcall(initcall_t fn)
                printk("initcall %pF returned with %s\n", fn, msgbuf);
        }
 
-       return it.result;
+       return ret.result;
 }
 
 
@@ -882,7 +898,7 @@ static int __init kernel_init(void * unused)
         * we're essentially up and running. Get rid of the
         * initmem segments and start the user-mode stuff..
         */
-       stop_boot_trace();
+
        init_post();
        return 0;
 }
index 19fad003b19d6ac0752597f5a23e18341d1d579a..6a212b842d86b9c4f7afd40cf4af6c2f8269518b 100644 (file)
@@ -19,7 +19,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_sched.o = -pg
 endif
 
 obj-$(CONFIG_FREEZER) += freezer.o
@@ -90,7 +89,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 
-ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
+ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # needed for x86 only.  Why this used to be enabled for all architectures is beyond
 # me.  I suspect most platforms don't need this, but until we know that for sure
index 8ea32e8d68b05c95b211830deee92d9a7929637b..bae131a1211b55a9051bfe581dd74df8e72dc248 100644 (file)
 cpumask_t cpu_present_map __read_mostly;
 EXPORT_SYMBOL(cpu_present_map);
 
-#ifndef CONFIG_SMP
-
 /*
  * Represents all cpu's that are currently online.
  */
-cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+cpumask_t cpu_online_map __read_mostly;
 EXPORT_SYMBOL(cpu_online_map);
 
+#ifdef CONFIG_INIT_ALL_POSSIBLE
 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+#else
+cpumask_t cpu_possible_map __read_mostly;
+#endif
 EXPORT_SYMBOL(cpu_possible_map);
 
-#else /* CONFIG_SMP */
-
+#ifdef CONFIG_SMP
 /* Serializes the updates to cpu_online_map, cpu_present_map */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
index 96c0ba13b8cd06add25de5043547b62cef6ef7b5..39c1a4c1c5a926a58a8ba132f684b97ebdf060bf 100644 (file)
@@ -896,7 +896,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
        if (!*buf) {
                cpus_clear(trialcs.cpus_allowed);
        } else {
-               retval = cpulist_parse(buf, trialcs.cpus_allowed);
+               retval = cpulist_parse(buf, &trialcs.cpus_allowed);
                if (retval < 0)
                        return retval;
 
@@ -1482,7 +1482,7 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
        mask = cs->cpus_allowed;
        mutex_unlock(&callback_mutex);
 
-       return cpulist_scnprintf(page, PAGE_SIZE, mask);
+       return cpulist_scnprintf(page, PAGE_SIZE, &mask);
 }
 
 static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
index 2d8be7ebb0f73499f894a1828fd827f0217290f1..61ba5b4b10cfb8a16e2d5b7010cbef3287744d38 100644 (file)
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
+DEFINE_TRACE(sched_process_free);
+DEFINE_TRACE(sched_process_exit);
+DEFINE_TRACE(sched_process_wait);
+
 static void exit_mm(struct task_struct * tsk);
 
 static inline int task_detached(struct task_struct *p)
@@ -1123,7 +1127,6 @@ NORET_TYPE void do_exit(long code)
        preempt_disable();
        /* causes final put_task_struct in finish_task_switch(). */
        tsk->state = TASK_DEAD;
-
        schedule();
        BUG();
        /* Avoid "noreturn function does return".  */
@@ -1321,10 +1324,10 @@ static int wait_task_zombie(struct task_struct *p, int options,
                 * group, which consolidates times for all threads in the
                 * group including the group leader.
                 */
+               thread_group_cputime(p, &cputime);
                spin_lock_irq(&p->parent->sighand->siglock);
                psig = p->parent->signal;
                sig = p->signal;
-               thread_group_cputime(p, &cputime);
                psig->cutime =
                        cputime_add(psig->cutime,
                        cputime_add(cputime.utime,
index a26cb2e170237d849c9f87038a6def48ee948dd6..e136ed8d82ba56ab81283a48fe3d6cfb41c23c79 100644 (file)
@@ -17,6 +17,7 @@
 */
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/ftrace.h>
 #include <asm/uaccess.h>
 #include <asm/sections.h>
 
@@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
        return e;
 }
 
-int core_kernel_text(unsigned long addr)
+__notrace_funcgraph int core_kernel_text(unsigned long addr)
 {
        if (addr >= (unsigned long)_stext &&
            addr <= (unsigned long)_etext)
@@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr)
        return 0;
 }
 
-int __kernel_text_address(unsigned long addr)
+__notrace_funcgraph int __kernel_text_address(unsigned long addr)
 {
        if (core_kernel_text(addr))
                return 1;
@@ -66,3 +67,19 @@ int kernel_text_address(unsigned long addr)
                return 1;
        return module_text_address(addr) != NULL;
 }
+
+/*
+ * On some architectures (PPC64, IA64) function pointers
+ * are actually only tokens to some data that then holds the
+ * real function address. As a result, to find if a function
+ * pointer is part of the kernel text, we need to do some
+ * special dereferencing first.
+ */
+int func_ptr_is_kernel_text(void *ptr)
+{
+       unsigned long addr;
+       addr = (unsigned long) dereference_function_descriptor(ptr);
+       if (core_kernel_text(addr))
+               return 1;
+       return module_text_address(addr) != NULL;
+}
index 495da2e9a8b4d92a1b464b98196228f818c892e7..7b93da72d4a2331a69438e4ac347a477a70aadcd 100644 (file)
@@ -47,6 +47,7 @@
 #include <linux/mount.h>
 #include <linux/audit.h>
 #include <linux/memcontrol.h>
+#include <linux/ftrace.h>
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
@@ -80,6 +81,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
+DEFINE_TRACE(sched_process_fork);
+
 int nr_processes(void)
 {
        int cpu;
@@ -137,6 +140,7 @@ void free_task(struct task_struct *tsk)
        prop_local_destroy_single(&tsk->dirties);
        free_thread_info(tsk->stack);
        rt_mutex_debug_task_free(tsk);
+       ftrace_graph_exit_task(tsk);
        free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -1136,6 +1140,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                }
        }
 
+       ftrace_graph_init_task(p);
+
        p->pid = pid_nr(pid);
        p->tgid = p->pid;
        if (clone_flags & CLONE_THREAD)
@@ -1144,7 +1150,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if (current->nsproxy != p->nsproxy) {
                retval = ns_cgroup_clone(p, pid);
                if (retval)
-                       goto bad_fork_free_pid;
+                       goto bad_fork_free_graph;
        }
 
        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1237,7 +1243,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                spin_unlock(&current->sighand->siglock);
                write_unlock_irq(&tasklist_lock);
                retval = -ERESTARTNOINTR;
-               goto bad_fork_free_pid;
+               goto bad_fork_free_graph;
        }
 
        if (clone_flags & CLONE_THREAD) {
@@ -1274,6 +1280,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        cgroup_post_fork(p);
        return p;
 
+bad_fork_free_graph:
+       ftrace_graph_exit_task(p);
 bad_fork_free_pid:
        if (pid != &init_struct_pid)
                free_pid(pid);
index 8af10027514bb1cc9cb2702051330e52bf43a533..e10c5c8786a614619c943f5102189fdb428c5ac3 100644 (file)
@@ -122,24 +122,6 @@ struct futex_hash_bucket {
 
 static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
 
-/*
- * Take mm->mmap_sem, when futex is shared
- */
-static inline void futex_lock_mm(struct rw_semaphore *fshared)
-{
-       if (fshared)
-               down_read(fshared);
-}
-
-/*
- * Release mm->mmap_sem, when the futex is shared
- */
-static inline void futex_unlock_mm(struct rw_semaphore *fshared)
-{
-       if (fshared)
-               up_read(fshared);
-}
-
 /*
  * We hash on the keys returned from get_futex_key (see below).
  */
@@ -161,6 +143,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
                && key1->both.offset == key2->both.offset);
 }
 
+/*
+ * Take a reference to the resource addressed by a key.
+ * Can be called while holding spinlocks.
+ *
+ */
+static void get_futex_key_refs(union futex_key *key)
+{
+       if (!key->both.ptr)
+               return;
+
+       switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+       case FUT_OFF_INODE:
+               atomic_inc(&key->shared.inode->i_count);
+               break;
+       case FUT_OFF_MMSHARED:
+               atomic_inc(&key->private.mm->mm_count);
+               break;
+       }
+}
+
+/*
+ * Drop a reference to the resource addressed by a key.
+ * The hash bucket spinlock must not be held.
+ */
+static void drop_futex_key_refs(union futex_key *key)
+{
+       if (!key->both.ptr)
+               return;
+
+       switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+       case FUT_OFF_INODE:
+               iput(key->shared.inode);
+               break;
+       case FUT_OFF_MMSHARED:
+               mmdrop(key->private.mm);
+               break;
+       }
+}
+
 /**
  * get_futex_key - Get parameters which are the keys for a futex.
  * @uaddr: virtual address of the futex
@@ -179,12 +200,10 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
  * For other futexes, it points to &current->mm->mmap_sem and
  * caller must have taken the reader lock. but NOT any spinlocks.
  */
-static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
-                        union futex_key *key)
+static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 {
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
-       struct vm_area_struct *vma;
        struct page *page;
        int err;
 
@@ -208,100 +227,50 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
                        return -EFAULT;
                key->private.mm = mm;
                key->private.address = address;
+               get_futex_key_refs(key);
                return 0;
        }
-       /*
-        * The futex is hashed differently depending on whether
-        * it's in a shared or private mapping.  So check vma first.
-        */
-       vma = find_extend_vma(mm, address);
-       if (unlikely(!vma))
-               return -EFAULT;
 
-       /*
-        * Permissions.
-        */
-       if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
-               return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+again:
+       err = get_user_pages_fast(address, 1, 0, &page);
+       if (err < 0)
+               return err;
+
+       lock_page(page);
+       if (!page->mapping) {
+               unlock_page(page);
+               put_page(page);
+               goto again;
+       }
 
        /*
         * Private mappings are handled in a simple way.
         *
         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
         * it's a read-only handle, it's expected that futexes attach to
-        * the object not the particular process.  Therefore we use
-        * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
-        * mappings of _writable_ handles.
+        * the object not the particular process.
         */
-       if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
-               key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
+       if (PageAnon(page)) {
+               key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
                key->private.mm = mm;
                key->private.address = address;
-               return 0;
+       } else {
+               key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+               key->shared.inode = page->mapping->host;
+               key->shared.pgoff = page->index;
        }
 
-       /*
-        * Linear file mappings are also simple.
-        */
-       key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
-       key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
-       if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
-               key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
-                                    + vma->vm_pgoff);
-               return 0;
-       }
+       get_futex_key_refs(key);
 
-       /*
-        * We could walk the page table to read the non-linear
-        * pte, and get the page index without fetching the page
-        * from swap.  But that's a lot of code to duplicate here
-        * for a rare case, so we simply fetch the page.
-        */
-       err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
-       if (err >= 0) {
-               key->shared.pgoff =
-                       page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-               put_page(page);
-               return 0;
-       }
-       return err;
-}
-
-/*
- * Take a reference to the resource addressed by a key.
- * Can be called while holding spinlocks.
- *
- */
-static void get_futex_key_refs(union futex_key *key)
-{
-       if (key->both.ptr == NULL)
-               return;
-       switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
-               case FUT_OFF_INODE:
-                       atomic_inc(&key->shared.inode->i_count);
-                       break;
-               case FUT_OFF_MMSHARED:
-                       atomic_inc(&key->private.mm->mm_count);
-                       break;
-       }
+       unlock_page(page);
+       put_page(page);
+       return 0;
 }
 
-/*
- * Drop a reference to the resource addressed by a key.
- * The hash bucket spinlock must not be held.
- */
-static void drop_futex_key_refs(union futex_key *key)
+static inline
+void put_futex_key(int fshared, union futex_key *key)
 {
-       if (!key->both.ptr)
-               return;
-       switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
-               case FUT_OFF_INODE:
-                       iput(key->shared.inode);
-                       break;
-               case FUT_OFF_MMSHARED:
-                       mmdrop(key->private.mm);
-                       break;
-       }
+       drop_futex_key_refs(key);
 }
 
 static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
@@ -328,10 +297,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
 
 /*
  * Fault handling.
- * if fshared is non NULL, current->mm->mmap_sem is already held
  */
-static int futex_handle_fault(unsigned long address,
-                             struct rw_semaphore *fshared, int attempt)
+static int futex_handle_fault(unsigned long address, int attempt)
 {
        struct vm_area_struct * vma;
        struct mm_struct *mm = current->mm;
@@ -340,8 +307,7 @@ static int futex_handle_fault(unsigned long address,
        if (attempt > 2)
                return ret;
 
-       if (!fshared)
-               down_read(&mm->mmap_sem);
+       down_read(&mm->mmap_sem);
        vma = find_vma(mm, address);
        if (vma && address >= vma->vm_start &&
            (vma->vm_flags & VM_WRITE)) {
@@ -361,8 +327,7 @@ static int futex_handle_fault(unsigned long address,
                                current->min_flt++;
                }
        }
-       if (!fshared)
-               up_read(&mm->mmap_sem);
+       up_read(&mm->mmap_sem);
        return ret;
 }
 
@@ -385,6 +350,7 @@ static int refill_pi_state_cache(void)
        /* pi_mutex gets initialized later */
        pi_state->owner = NULL;
        atomic_set(&pi_state->refcount, 1);
+       pi_state->key = FUTEX_KEY_INIT;
 
        current->pi_state_cache = pi_state;
 
@@ -462,7 +428,7 @@ void exit_pi_state_list(struct task_struct *curr)
        struct list_head *next, *head = &curr->pi_state_list;
        struct futex_pi_state *pi_state;
        struct futex_hash_bucket *hb;
-       union futex_key key;
+       union futex_key key = FUTEX_KEY_INIT;
 
        if (!futex_cmpxchg_enabled)
                return;
@@ -719,20 +685,17 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
-                     int nr_wake, u32 bitset)
+static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
        struct plist_head *head;
-       union futex_key key;
+       union futex_key key = FUTEX_KEY_INIT;
        int ret;
 
        if (!bitset)
                return -EINVAL;
 
-       futex_lock_mm(fshared);
-
        ret = get_futex_key(uaddr, fshared, &key);
        if (unlikely(ret != 0))
                goto out;
@@ -760,7 +723,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
 
        spin_unlock(&hb->lock);
 out:
-       futex_unlock_mm(fshared);
+       put_futex_key(fshared, &key);
        return ret;
 }
 
@@ -769,19 +732,16 @@ out:
  * to this virtual address:
  */
 static int
-futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
-             u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
              int nr_wake, int nr_wake2, int op)
 {
-       union futex_key key1, key2;
+       union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
        struct futex_hash_bucket *hb1, *hb2;
        struct plist_head *head;
        struct futex_q *this, *next;
        int ret, op_ret, attempt = 0;
 
 retryfull:
-       futex_lock_mm(fshared);
-
        ret = get_futex_key(uaddr1, fshared, &key1);
        if (unlikely(ret != 0))
                goto out;
@@ -826,18 +786,12 @@ retry:
                 */
                if (attempt++) {
                        ret = futex_handle_fault((unsigned long)uaddr2,
-                                                fshared, attempt);
+                                                attempt);
                        if (ret)
                                goto out;
                        goto retry;
                }
 
-               /*
-                * If we would have faulted, release mmap_sem,
-                * fault it in and start all over again.
-                */
-               futex_unlock_mm(fshared);
-
                ret = get_user(dummy, uaddr2);
                if (ret)
                        return ret;
@@ -873,7 +827,8 @@ retry:
        if (hb1 != hb2)
                spin_unlock(&hb2->lock);
 out:
-       futex_unlock_mm(fshared);
+       put_futex_key(fshared, &key2);
+       put_futex_key(fshared, &key1);
 
        return ret;
 }
@@ -882,19 +837,16 @@ out:
  * Requeue all waiters hashed on one physical page to another
  * physical page.
  */
-static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
-                        u32 __user *uaddr2,
+static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
                         int nr_wake, int nr_requeue, u32 *cmpval)
 {
-       union futex_key key1, key2;
+       union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
        struct futex_hash_bucket *hb1, *hb2;
        struct plist_head *head1;
        struct futex_q *this, *next;
        int ret, drop_count = 0;
 
  retry:
-       futex_lock_mm(fshared);
-
        ret = get_futex_key(uaddr1, fshared, &key1);
        if (unlikely(ret != 0))
                goto out;
@@ -917,12 +869,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
                        if (hb1 != hb2)
                                spin_unlock(&hb2->lock);
 
-                       /*
-                        * If we would have faulted, release mmap_sem, fault
-                        * it in and start all over again.
-                        */
-                       futex_unlock_mm(fshared);
-
                        ret = get_user(curval, uaddr1);
 
                        if (!ret)
@@ -974,7 +920,8 @@ out_unlock:
                drop_futex_key_refs(&key1);
 
 out:
-       futex_unlock_mm(fshared);
+       put_futex_key(fshared, &key2);
+       put_futex_key(fshared, &key1);
        return ret;
 }
 
@@ -1096,8 +1043,7 @@ static void unqueue_me_pi(struct futex_q *q)
  * private futexes.
  */
 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
-                               struct task_struct *newowner,
-                               struct rw_semaphore *fshared)
+                               struct task_struct *newowner, int fshared)
 {
        u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
        struct futex_pi_state *pi_state = q->pi_state;
@@ -1176,7 +1122,7 @@ retry:
 handle_fault:
        spin_unlock(q->lock_ptr);
 
-       ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
+       ret = futex_handle_fault((unsigned long)uaddr, attempt++);
 
        spin_lock(q->lock_ptr);
 
@@ -1200,7 +1146,7 @@ handle_fault:
 
 static long futex_wait_restart(struct restart_block *restart);
 
-static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+static int futex_wait(u32 __user *uaddr, int fshared,
                      u32 val, ktime_t *abs_time, u32 bitset)
 {
        struct task_struct *curr = current;
@@ -1218,8 +1164,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
        q.pi_state = NULL;
        q.bitset = bitset;
  retry:
-       futex_lock_mm(fshared);
-
+       q.key = FUTEX_KEY_INIT;
        ret = get_futex_key(uaddr, fshared, &q.key);
        if (unlikely(ret != 0))
                goto out_release_sem;
@@ -1251,12 +1196,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
        if (unlikely(ret)) {
                queue_unlock(&q, hb);
 
-               /*
-                * If we would have faulted, release mmap_sem, fault it in and
-                * start all over again.
-                */
-               futex_unlock_mm(fshared);
-
                ret = get_user(uval, uaddr);
 
                if (!ret)
@@ -1270,12 +1209,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
        /* Only actually queue if *uaddr contained val.  */
        queue_me(&q, hb);
 
-       /*
-        * Now the futex is queued and we have checked the data, we
-        * don't want to hold mmap_sem while we sleep.
-        */
-       futex_unlock_mm(fshared);
-
        /*
         * There might have been scheduling since the queue_me(), as we
         * cannot hold a spinlock across the get_user() in case it
@@ -1363,7 +1296,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
        queue_unlock(&q, hb);
 
  out_release_sem:
-       futex_unlock_mm(fshared);
+       put_futex_key(fshared, &q.key);
        return ret;
 }
 
@@ -1371,13 +1304,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 static long futex_wait_restart(struct restart_block *restart)
 {
        u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
-       struct rw_semaphore *fshared = NULL;
+       int fshared = 0;
        ktime_t t;
 
        t.tv64 = restart->futex.time;
        restart->fn = do_no_restart_syscall;
        if (restart->futex.flags & FLAGS_SHARED)
-               fshared = &current->mm->mmap_sem;
+               fshared = 1;
        return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
                                restart->futex.bitset);
 }
@@ -1389,7 +1322,7 @@ static long futex_wait_restart(struct restart_block *restart)
  * if there are waiters then it will block, it does PI, etc. (Due to
  * races the kernel might see a 0 value of the futex too.)
  */
-static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+static int futex_lock_pi(u32 __user *uaddr, int fshared,
                         int detect, ktime_t *time, int trylock)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
@@ -1412,8 +1345,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 
        q.pi_state = NULL;
  retry:
-       futex_lock_mm(fshared);
-
+       q.key = FUTEX_KEY_INIT;
        ret = get_futex_key(uaddr, fshared, &q.key);
        if (unlikely(ret != 0))
                goto out_release_sem;
@@ -1502,7 +1434,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
                         * exit to complete.
                         */
                        queue_unlock(&q, hb);
-                       futex_unlock_mm(fshared);
                        cond_resched();
                        goto retry;
 
@@ -1534,12 +1465,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
         */
        queue_me(&q, hb);
 
-       /*
-        * Now the futex is queued and we have checked the data, we
-        * don't want to hold mmap_sem while we sleep.
-        */
-       futex_unlock_mm(fshared);
-
        WARN_ON(!q.pi_state);
        /*
         * Block on the PI mutex:
@@ -1552,7 +1477,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
                ret = ret ? 0 : -EWOULDBLOCK;
        }
 
-       futex_lock_mm(fshared);
        spin_lock(q.lock_ptr);
 
        if (!ret) {
@@ -1618,7 +1542,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 
        /* Unqueue and drop the lock */
        unqueue_me_pi(&q);
-       futex_unlock_mm(fshared);
 
        if (to)
                destroy_hrtimer_on_stack(&to->timer);
@@ -1628,7 +1551,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
        queue_unlock(&q, hb);
 
  out_release_sem:
-       futex_unlock_mm(fshared);
+       put_futex_key(fshared, &q.key);
        if (to)
                destroy_hrtimer_on_stack(&to->timer);
        return ret;
@@ -1645,15 +1568,12 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
        queue_unlock(&q, hb);
 
        if (attempt++) {
-               ret = futex_handle_fault((unsigned long)uaddr, fshared,
-                                        attempt);
+               ret = futex_handle_fault((unsigned long)uaddr, attempt);
                if (ret)
                        goto out_release_sem;
                goto retry_unlocked;
        }
 
-       futex_unlock_mm(fshared);
-
        ret = get_user(uval, uaddr);
        if (!ret && (uval != -EFAULT))
                goto retry;
@@ -1668,13 +1588,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
  * This is the in-kernel slowpath: we look up the PI state (if any),
  * and do the rt-mutex unlock.
  */
-static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
+static int futex_unlock_pi(u32 __user *uaddr, int fshared)
 {
        struct futex_hash_bucket *hb;
        struct futex_q *this, *next;
        u32 uval;
        struct plist_head *head;
-       union futex_key key;
+       union futex_key key = FUTEX_KEY_INIT;
        int ret, attempt = 0;
 
 retry:
@@ -1685,10 +1605,6 @@ retry:
         */
        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
                return -EPERM;
-       /*
-        * First take all the futex related locks:
-        */
-       futex_lock_mm(fshared);
 
        ret = get_futex_key(uaddr, fshared, &key);
        if (unlikely(ret != 0))
@@ -1747,7 +1663,7 @@ retry_unlocked:
 out_unlock:
        spin_unlock(&hb->lock);
 out:
-       futex_unlock_mm(fshared);
+       put_futex_key(fshared, &key);
 
        return ret;
 
@@ -1763,16 +1679,13 @@ pi_faulted:
        spin_unlock(&hb->lock);
 
        if (attempt++) {
-               ret = futex_handle_fault((unsigned long)uaddr, fshared,
-                                        attempt);
+               ret = futex_handle_fault((unsigned long)uaddr, attempt);
                if (ret)
                        goto out;
                uval = 0;
                goto retry_unlocked;
        }
 
-       futex_unlock_mm(fshared);
-
        ret = get_user(uval, uaddr);
        if (!ret && (uval != -EFAULT))
                goto retry;
@@ -1898,8 +1811,7 @@ retry:
                 * PI futexes happens in exit_pi_state():
                 */
                if (!pi && (uval & FUTEX_WAITERS))
-                       futex_wake(uaddr, &curr->mm->mmap_sem, 1,
-                                  FUTEX_BITSET_MATCH_ANY);
+                       futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
        }
        return 0;
 }
@@ -1995,10 +1907,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 {
        int ret = -ENOSYS;
        int cmd = op & FUTEX_CMD_MASK;
-       struct rw_semaphore *fshared = NULL;
+       int fshared = 0;
 
        if (!(op & FUTEX_PRIVATE_FLAG))
-               fshared = &current->mm->mmap_sem;
+               fshared = 1;
 
        switch (cmd) {
        case FUTEX_WAIT:
index 681c52dbfe229bbffc32db74c395f3f149738ed4..4dd5b1edac984bca7326297ecee3daf0239301f1 100644 (file)
@@ -3,3 +3,4 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
+obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
index cc0f7321b8cede4192a4ceb9ffc97311ec9cc7d0..650ce4102a6333d9ddf097692583e55659a25333 100644 (file)
@@ -40,6 +40,9 @@ unsigned long probe_irq_on(void)
         * flush such a longstanding irq before considering it as spurious.
         */
        for_each_irq_desc_reverse(i, desc) {
+               if (!desc)
+                       continue;
+
                spin_lock_irq(&desc->lock);
                if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
                        /*
@@ -68,6 +71,9 @@ unsigned long probe_irq_on(void)
         * happened in the previous stage, it may have masked itself)
         */
        for_each_irq_desc_reverse(i, desc) {
+               if (!desc)
+                       continue;
+
                spin_lock_irq(&desc->lock);
                if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
                        desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
@@ -86,6 +92,9 @@ unsigned long probe_irq_on(void)
         * Now filter out any obviously spurious interrupts
         */
        for_each_irq_desc(i, desc) {
+               if (!desc)
+                       continue;
+
                spin_lock_irq(&desc->lock);
                status = desc->status;
 
@@ -124,6 +133,9 @@ unsigned int probe_irq_mask(unsigned long val)
        int i;
 
        for_each_irq_desc(i, desc) {
+               if (!desc)
+                       continue;
+
                spin_lock_irq(&desc->lock);
                status = desc->status;
 
@@ -166,6 +178,9 @@ int probe_irq_off(unsigned long val)
        unsigned int status;
 
        for_each_irq_desc(i, desc) {
+               if (!desc)
+                       continue;
+
                spin_lock_irq(&desc->lock);
                status = desc->status;
 
index 10b5092e9bfe749da57b2ab408e3f6d38ae81d75..b343deedae914b59399a504854b2a97a105742c8 100644 (file)
  */
 void dynamic_irq_init(unsigned int irq)
 {
-       struct irq_desc *desc = irq_to_desc(irq);
+       struct irq_desc *desc;
        unsigned long flags;
 
+       desc = irq_to_desc(irq);
        if (!desc) {
                WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
                return;
@@ -45,7 +46,7 @@ void dynamic_irq_init(unsigned int irq)
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;
 #ifdef CONFIG_SMP
-       cpus_setall(desc->affinity);
+       cpumask_setall(&desc->affinity);
 #endif
        spin_unlock_irqrestore(&desc->lock, flags);
 }
@@ -352,6 +353,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 
        spin_lock(&desc->lock);
        mask_ack_irq(desc, irq);
+       desc = irq_remap_to_desc(irq, desc);
 
        if (unlikely(desc->status & IRQ_INPROGRESS))
                goto out_unlock;
@@ -429,6 +431,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
        desc->status &= ~IRQ_INPROGRESS;
 out:
        desc->chip->eoi(irq);
+       desc = irq_remap_to_desc(irq, desc);
 
        spin_unlock(&desc->lock);
 }
@@ -465,12 +468,14 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                    !desc->action)) {
                desc->status |= (IRQ_PENDING | IRQ_MASKED);
                mask_ack_irq(desc, irq);
+               desc = irq_remap_to_desc(irq, desc);
                goto out_unlock;
        }
        kstat_incr_irqs_this_cpu(irq, desc);
 
        /* Start handling the irq */
        desc->chip->ack(irq);
+       desc = irq_remap_to_desc(irq, desc);
 
        /* Mark the IRQ currently in progress.*/
        desc->status |= IRQ_INPROGRESS;
@@ -531,8 +536,10 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
        if (!noirqdebug)
                note_interrupt(irq, desc, action_ret);
 
-       if (desc->chip->eoi)
+       if (desc->chip->eoi) {
                desc->chip->eoi(irq);
+               desc = irq_remap_to_desc(irq, desc);
+       }
 }
 
 void
@@ -567,8 +574,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 
        /* Uninstall? */
        if (handle == handle_bad_irq) {
-               if (desc->chip != &no_irq_chip)
+               if (desc->chip != &no_irq_chip) {
                        mask_ack_irq(desc, irq);
+                       desc = irq_remap_to_desc(irq, desc);
+               }
                desc->status |= IRQ_DISABLED;
                desc->depth = 1;
        }
index c815b42d0f5bf12baed5e25eb92c7f2d5e4b4472..f1a23069c20ab88fe97879b4888be63508a70027 100644 (file)
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
+#include <linux/rculist.h>
+#include <linux/hash.h>
 
 #include "internals.h"
 
+/*
+ * lockdep: we want to handle all irq_desc locks as a single lock-class:
+ */
+struct lock_class_key irq_desc_lock_class;
+
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
  * @irq:       the interrupt number
@@ -49,6 +56,155 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
+void __init __attribute__((weak)) arch_early_irq_init(void)
+{
+}
+
+#ifdef CONFIG_SPARSE_IRQ
+static struct irq_desc irq_desc_init = {
+       .irq        = -1,
+       .status     = IRQ_DISABLED,
+       .chip       = &no_irq_chip,
+       .handle_irq = handle_bad_irq,
+       .depth      = 1,
+       .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+       .affinity   = CPU_MASK_ALL
+#endif
+};
+
+void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+{
+       unsigned long bytes;
+       char *ptr;
+       int node;
+
+       /* Compute how many bytes we need per irq and allocate them */
+       bytes = nr * sizeof(unsigned int);
+
+       node = cpu_to_node(cpu);
+       ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
+       printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+
+       if (ptr)
+               desc->kstat_irqs = (unsigned int *)ptr;
+}
+
+void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
+{
+}
+
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+{
+       memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
+       desc->irq = irq;
+#ifdef CONFIG_SMP
+       desc->cpu = cpu;
+#endif
+       lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+       init_kstat_irqs(desc, cpu, nr_cpu_ids);
+       if (!desc->kstat_irqs) {
+               printk(KERN_ERR "can not alloc kstat_irqs\n");
+               BUG_ON(1);
+       }
+       arch_init_chip_data(desc, cpu);
+}
+
+/*
+ * Protect the sparse_irqs:
+ */
+DEFINE_SPINLOCK(sparse_irq_lock);
+
+struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
+
+static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
+       [0 ... NR_IRQS_LEGACY-1] = {
+               .irq        = -1,
+               .status     = IRQ_DISABLED,
+               .chip       = &no_irq_chip,
+               .handle_irq = handle_bad_irq,
+               .depth      = 1,
+               .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
+#ifdef CONFIG_SMP
+               .affinity   = CPU_MASK_ALL
+#endif
+       }
+};
+
+/* FIXME: use bootmem alloc ...*/
+static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+
+void __init early_irq_init(void)
+{
+       struct irq_desc *desc;
+       int legacy_count;
+       int i;
+
+       desc = irq_desc_legacy;
+       legacy_count = ARRAY_SIZE(irq_desc_legacy);
+
+       for (i = 0; i < legacy_count; i++) {
+               desc[i].irq = i;
+               desc[i].kstat_irqs = kstat_irqs_legacy[i];
+
+               irq_desc_ptrs[i] = desc + i;
+       }
+
+       for (i = legacy_count; i < NR_IRQS; i++)
+               irq_desc_ptrs[i] = NULL;
+
+       arch_early_irq_init();
+}
+
+struct irq_desc *irq_to_desc(unsigned int irq)
+{
+       return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
+}
+
+struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+{
+       struct irq_desc *desc;
+       unsigned long flags;
+       int node;
+
+       if (irq >= NR_IRQS) {
+               printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
+                               irq, NR_IRQS);
+               WARN_ON(1);
+               return NULL;
+       }
+
+       desc = irq_desc_ptrs[irq];
+       if (desc)
+               return desc;
+
+       spin_lock_irqsave(&sparse_irq_lock, flags);
+
+       /* We have to check it to avoid races with another CPU */
+       desc = irq_desc_ptrs[irq];
+       if (desc)
+               goto out_unlock;
+
+       node = cpu_to_node(cpu);
+       desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+       printk(KERN_DEBUG "  alloc irq_desc for %d on cpu %d node %d\n",
+                irq, cpu, node);
+       if (!desc) {
+               printk(KERN_ERR "can not alloc irq_desc\n");
+               BUG_ON(1);
+       }
+       init_one_irq_desc(irq, desc, cpu);
+
+       irq_desc_ptrs[irq] = desc;
+
+out_unlock:
+       spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+       return desc;
+}
+
+#else
+
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
        [0 ... NR_IRQS-1] = {
                .status = IRQ_DISABLED,
@@ -62,6 +218,8 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
        }
 };
 
+#endif
+
 /*
  * What should we do if we get a hw irq event on an illegal vector?
  * Each architecture has to answer this themself.
@@ -179,8 +337,11 @@ unsigned int __do_IRQ(unsigned int irq)
                /*
                 * No locking required for CPU-local interrupts:
                 */
-               if (desc->chip->ack)
+               if (desc->chip->ack) {
                        desc->chip->ack(irq);
+                       /* get new one */
+                       desc = irq_remap_to_desc(irq, desc);
+               }
                if (likely(!(desc->status & IRQ_DISABLED))) {
                        action_ret = handle_IRQ_event(irq, desc->action);
                        if (!noirqdebug)
@@ -191,8 +352,10 @@ unsigned int __do_IRQ(unsigned int irq)
        }
 
        spin_lock(&desc->lock);
-       if (desc->chip->ack)
+       if (desc->chip->ack) {
                desc->chip->ack(irq);
+               desc = irq_remap_to_desc(irq, desc);
+       }
        /*
         * REPLAY is when Linux resends an IRQ that was dropped earlier
         * WAITING is used by probe to mark irqs that are being tested
@@ -261,17 +424,28 @@ out:
 
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-/*
- * lockdep: we want to handle all irq_desc locks as a single lock-class:
- */
-static struct lock_class_key irq_desc_lock_class;
-
 void early_init_irq_lock_class(void)
 {
+#ifndef CONFIG_SPARSE_IRQ
        struct irq_desc *desc;
        int i;
 
-       for_each_irq_desc(i, desc)
+       for_each_irq_desc(i, desc) {
+               if (!desc)
+                       continue;
+
                lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+       }
+#endif
+}
+#endif
+
+#ifdef CONFIG_SPARSE_IRQ
+unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
+{
+       struct irq_desc *desc = irq_to_desc(irq);
+       return desc->kstat_irqs[cpu];
 }
 #endif
+EXPORT_SYMBOL(kstat_irqs_cpu);
+
index 64c1c7253dae091b931652aa235ba3c44b6060c5..e6d0a43cc1255c2abb3778fcec5f5b1d4e5290ec 100644 (file)
@@ -13,6 +13,11 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
 extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
                unsigned long flags);
 
+extern struct lock_class_key irq_desc_lock_class;
+extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern spinlock_t sparse_irq_lock;
+extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
+
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
 extern void register_handler_proc(unsigned int irq, struct irqaction *action);
index 801addda3c43d4a7a767b27468fee05232cb6f89..10ad2f87ed9a0cfe6d20844f05231b7f0fd0506a 100644 (file)
@@ -79,7 +79,7 @@ int irq_can_set_affinity(unsigned int irq)
  *     @cpumask:       cpumask
  *
  */
-int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned long flags;
@@ -91,14 +91,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
        if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-               desc->affinity = cpumask;
+               cpumask_copy(&desc->affinity, cpumask);
                desc->chip->set_affinity(irq, cpumask);
        } else {
                desc->status |= IRQ_MOVE_PENDING;
-               desc->pending_mask = cpumask;
+               cpumask_copy(&desc->pending_mask, cpumask);
        }
 #else
-       desc->affinity = cpumask;
+       cpumask_copy(&desc->affinity, cpumask);
        desc->chip->set_affinity(irq, cpumask);
 #endif
        desc->status |= IRQ_AFFINITY_SET;
@@ -112,26 +112,24 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
  */
 int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
 {
-       cpumask_t mask;
-
        if (!irq_can_set_affinity(irq))
                return 0;
 
-       cpus_and(mask, cpu_online_map, irq_default_affinity);
-
        /*
         * Preserve an userspace affinity setup, but make sure that
         * one of the targets is online.
         */
        if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
-               if (cpus_intersects(desc->affinity, cpu_online_map))
-                       mask = desc->affinity;
+               if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+                   < nr_cpu_ids)
+                       goto set_affinity;
                else
                        desc->status &= ~IRQ_AFFINITY_SET;
        }
 
-       desc->affinity = mask;
-       desc->chip->set_affinity(irq, mask);
+       cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity);
+set_affinity:
+       desc->chip->set_affinity(irq, &desc->affinity);
 
        return 0;
 }
index 9db681d95814baa2662fdccba0755b09579577cf..bd72329e630c6cc6f3958bf4d70484f9124f4330 100644 (file)
@@ -4,7 +4,6 @@
 void move_masked_irq(int irq)
 {
        struct irq_desc *desc = irq_to_desc(irq);
-       cpumask_t tmp;
 
        if (likely(!(desc->status & IRQ_MOVE_PENDING)))
                return;
@@ -19,7 +18,7 @@ void move_masked_irq(int irq)
 
        desc->status &= ~IRQ_MOVE_PENDING;
 
-       if (unlikely(cpus_empty(desc->pending_mask)))
+       if (unlikely(cpumask_empty(&desc->pending_mask)))
                return;
 
        if (!desc->chip->set_affinity)
@@ -27,8 +26,6 @@ void move_masked_irq(int irq)
 
        assert_spin_locked(&desc->lock);
 
-       cpus_and(tmp, desc->pending_mask, cpu_online_map);
-
        /*
         * If there was a valid mask to work with, please
         * do the disable, re-program, enable sequence.
@@ -41,10 +38,13 @@ void move_masked_irq(int irq)
         * For correct operation this depends on the caller
         * masking the irqs.
         */
-       if (likely(!cpus_empty(tmp))) {
-               desc->chip->set_affinity(irq,tmp);
+       if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
+                  < nr_cpu_ids)) {
+               cpumask_and(&desc->affinity,
+                           &desc->pending_mask, cpu_online_mask);
+               desc->chip->set_affinity(irq, &desc->affinity);
        }
-       cpus_clear(desc->pending_mask);
+       cpumask_clear(&desc->pending_mask);
 }
 
 void move_native_irq(int irq)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
new file mode 100644 (file)
index 0000000..0178e22
--- /dev/null
@@ -0,0 +1,127 @@
+/*
+ * linux/kernel/irq/handle.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the core interrupt handling code.
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ *
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+static void init_copy_kstat_irqs(struct irq_desc *old_desc,
+                                struct irq_desc *desc,
+                                int cpu, int nr)
+{
+       unsigned long bytes;
+
+       init_kstat_irqs(desc, cpu, nr);
+
+       if (desc->kstat_irqs != old_desc->kstat_irqs) {
+               /* Compute how many bytes we need per irq and allocate them */
+               bytes = nr * sizeof(unsigned int);
+
+               memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+       }
+}
+
+static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+       if (old_desc->kstat_irqs == desc->kstat_irqs)
+               return;
+
+       kfree(old_desc->kstat_irqs);
+       old_desc->kstat_irqs = NULL;
+}
+
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+                struct irq_desc *desc, int cpu)
+{
+       memcpy(desc, old_desc, sizeof(struct irq_desc));
+       desc->cpu = cpu;
+       lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+       init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+       arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+       free_kstat_irqs(old_desc, desc);
+       arch_free_chip_data(old_desc, desc);
+}
+
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+                                               int cpu)
+{
+       struct irq_desc *desc;
+       unsigned int irq;
+       unsigned long flags;
+       int node;
+
+       irq = old_desc->irq;
+
+       spin_lock_irqsave(&sparse_irq_lock, flags);
+
+       /* We have to check it to avoid races with another CPU */
+       desc = irq_desc_ptrs[irq];
+
+       if (desc && old_desc != desc)
+                       goto out_unlock;
+
+       node = cpu_to_node(cpu);
+       desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+       printk(KERN_DEBUG "  move irq_desc for %d to cpu %d node %d\n",
+                irq, cpu, node);
+       if (!desc) {
+               printk(KERN_ERR "can not get new irq_desc for moving\n");
+               /* still use old one */
+               desc = old_desc;
+               goto out_unlock;
+       }
+       init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+       irq_desc_ptrs[irq] = desc;
+
+       /* free the old one */
+       free_one_irq_desc(old_desc, desc);
+       kfree(old_desc);
+
+out_unlock:
+       spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+       return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+       int old_cpu;
+       int node, old_node;
+
+       /* those all static, do move them */
+       if (desc->irq < NR_IRQS_LEGACY)
+               return desc;
+
+       old_cpu = desc->cpu;
+       printk(KERN_DEBUG
+                "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
+       if (old_cpu != cpu) {
+               node = cpu_to_node(cpu);
+               old_node = cpu_to_node(old_cpu);
+               if (old_node != node)
+                       desc = __real_move_irq_desc(desc, cpu);
+               else
+                       desc->cpu = cpu;
+       }
+
+       return desc;
+}
+
index d257e7d6a8a4ba4b10504dcd8490c812fe37504c..d2c0e5ee53c573019f45942e4d63bb362db42551 100644 (file)
@@ -40,33 +40,42 @@ static ssize_t irq_affinity_proc_write(struct file *file,
                const char __user *buffer, size_t count, loff_t *pos)
 {
        unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
-       cpumask_t new_value;
+       cpumask_var_t new_value;
        int err;
 
        if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
            irq_balancing_disabled(irq))
                return -EIO;
 
+       if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+               return -ENOMEM;
+
        err = cpumask_parse_user(buffer, count, new_value);
        if (err)
-               return err;
+               goto free_cpumask;
 
-       if (!is_affinity_mask_valid(new_value))
-               return -EINVAL;
+       if (!is_affinity_mask_valid(*new_value)) {
+               err = -EINVAL;
+               goto free_cpumask;
+       }
 
        /*
         * Do not allow disabling IRQs completely - it's a too easy
         * way to make the system unusable accidentally :-) At least
         * one online CPU still has to be targeted.
         */
-       if (!cpus_intersects(new_value, cpu_online_map))
+       if (!cpumask_intersects(new_value, cpu_online_mask)) {
                /* Special case for empty set - allow the architecture
                   code to set default SMP affinity. */
-               return irq_select_affinity_usr(irq) ? -EINVAL : count;
-
-       irq_set_affinity(irq, new_value);
+               err = irq_select_affinity_usr(irq) ? -EINVAL : count;
+       } else {
+               irq_set_affinity(irq, new_value);
+               err = count;
+       }
 
-       return count;
+free_cpumask:
+       free_cpumask_var(new_value);
+       return err;
 }
 
 static int irq_affinity_proc_open(struct inode *inode, struct file *file)
@@ -95,7 +104,7 @@ static ssize_t default_affinity_write(struct file *file,
        cpumask_t new_value;
        int err;
 
-       err = cpumask_parse_user(buffer, count, new_value);
+       err = cpumask_parse_user(buffer, count, &new_value);
        if (err)
                return err;
 
@@ -243,7 +252,11 @@ void init_irq_proc(void)
        /*
         * Create entries for all existing IRQs.
         */
-       for_each_irq_desc(irq, desc)
+       for_each_irq_desc(irq, desc) {
+               if (!desc)
+                       continue;
+
                register_irq_proc(irq, desc);
+       }
 }
 
index dd364c11e56e0f82e0e923fcc3b60d17524e991d..3738107531fd29190b62e628d4d7da8f1081ec96 100644 (file)
@@ -91,6 +91,9 @@ static int misrouted_irq(int irq)
        int i, ok = 0;
 
        for_each_irq_desc(i, desc) {
+               if (!desc)
+                       continue;
+
                if (!i)
                         continue;
 
@@ -112,6 +115,8 @@ static void poll_spurious_irqs(unsigned long dummy)
        for_each_irq_desc(i, desc) {
                unsigned int status;
 
+               if (!desc)
+                       continue;
                if (!i)
                         continue;
 
index 8e7a7ce3ed0a642f99dc7f73c220722c936e88ac..4fbc456f393d0b1fb328667d9e93cc214c39da06 100644 (file)
@@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
+DEFINE_TRACE(sched_kthread_stop);
+DEFINE_TRACE(sched_kthread_stop_ret);
+
 struct kthread_create_info
 {
        /* Information passed to kthread() from kthreadd. */
index 46a404173db231a982baf3941c72e96911003906..c4c7df23f8c774eec4130b22f04c9c6fa3b1e1e7 100644 (file)
@@ -25,6 +25,7 @@
  * Thanks to Arjan van de Ven for coming up with the initial idea of
  * mapping lock dependencies runtime.
  */
+#define DISABLE_BRANCH_PROFILING
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/delay.h>
@@ -136,16 +137,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
 #ifdef CONFIG_LOCK_STAT
 static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
 
-static int lock_contention_point(struct lock_class *class, unsigned long ip)
+static int lock_point(unsigned long points[], unsigned long ip)
 {
        int i;
 
-       for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
-               if (class->contention_point[i] == 0) {
-                       class->contention_point[i] = ip;
+       for (i = 0; i < LOCKSTAT_POINTS; i++) {
+               if (points[i] == 0) {
+                       points[i] = ip;
                        break;
                }
-               if (class->contention_point[i] == ip)
+               if (points[i] == ip)
                        break;
        }
 
@@ -185,6 +186,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
                for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
                        stats.contention_point[i] += pcs->contention_point[i];
 
+               for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
+                       stats.contending_point[i] += pcs->contending_point[i];
+
                lock_time_add(&pcs->read_waittime, &stats.read_waittime);
                lock_time_add(&pcs->write_waittime, &stats.write_waittime);
 
@@ -209,6 +213,7 @@ void clear_lock_stats(struct lock_class *class)
                memset(cpu_stats, 0, sizeof(struct lock_class_stats));
        }
        memset(class->contention_point, 0, sizeof(class->contention_point));
+       memset(class->contending_point, 0, sizeof(class->contending_point));
 }
 
 static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@ -2999,7 +3004,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
        struct held_lock *hlock, *prev_hlock;
        struct lock_class_stats *stats;
        unsigned int depth;
-       int i, point;
+       int i, contention_point, contending_point;
 
        depth = curr->lockdep_depth;
        if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -3023,18 +3028,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 found_it:
        hlock->waittime_stamp = sched_clock();
 
-       point = lock_contention_point(hlock_class(hlock), ip);
+       contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
+       contending_point = lock_point(hlock_class(hlock)->contending_point,
+                                     lock->ip);
 
        stats = get_lock_stats(hlock_class(hlock));
-       if (point < ARRAY_SIZE(stats->contention_point))
-               stats->contention_point[point]++;
+       if (contention_point < LOCKSTAT_POINTS)
+               stats->contention_point[contention_point]++;
+       if (contending_point < LOCKSTAT_POINTS)
+               stats->contending_point[contending_point]++;
        if (lock->cpu != smp_processor_id())
                stats->bounces[bounce_contended + !!hlock->read]++;
        put_lock_stats(stats);
 }
 
 static void
-__lock_acquired(struct lockdep_map *lock)
+__lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
        struct task_struct *curr = current;
        struct held_lock *hlock, *prev_hlock;
@@ -3083,6 +3092,7 @@ found_it:
        put_lock_stats(stats);
 
        lock->cpu = cpu;
+       lock->ip = ip;
 }
 
 void lock_contended(struct lockdep_map *lock, unsigned long ip)
@@ -3104,7 +3114,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 }
 EXPORT_SYMBOL_GPL(lock_contended);
 
-void lock_acquired(struct lockdep_map *lock)
+void lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
        unsigned long flags;
 
@@ -3117,7 +3127,7 @@ void lock_acquired(struct lockdep_map *lock)
        raw_local_irq_save(flags);
        check_flags(flags);
        current->lockdep_recursion = 1;
-       __lock_acquired(lock);
+       __lock_acquired(lock, ip);
        current->lockdep_recursion = 0;
        raw_local_irq_restore(flags);
 }
index 20dbcbf9c7dd2cf34486f3e9f307f1e3cf8a97bc..13716b8138961ee9f5feffe23ad9115fc27766e3 100644 (file)
@@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
 
 static void snprint_time(char *buf, size_t bufsiz, s64 nr)
 {
-       unsigned long rem;
+       s64 div;
+       s32 rem;
 
        nr += 5; /* for display rounding */
-       rem = do_div(nr, 1000); /* XXX: do_div_signed */
-       snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10);
+       div = div_s64_rem(nr, 1000, &rem);
+       snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
 }
 
 static void seq_time(struct seq_file *m, s64 time)
@@ -556,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
        if (stats->read_holdtime.nr)
                namelen += 2;
 
-       for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) {
+       for (i = 0; i < LOCKSTAT_POINTS; i++) {
                char sym[KSYM_SYMBOL_LEN];
                char ip[32];
 
@@ -573,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
                                stats->contention_point[i],
                                ip, sym);
        }
+       for (i = 0; i < LOCKSTAT_POINTS; i++) {
+               char sym[KSYM_SYMBOL_LEN];
+               char ip[32];
+
+               if (class->contending_point[i] == 0)
+                       break;
+
+               if (!i)
+                       seq_line(m, '-', 40-namelen, namelen);
+
+               sprint_symbol(sym, class->contending_point[i]);
+               snprintf(ip, sizeof(ip), "[<%p>]",
+                               (void *)class->contending_point[i]);
+               seq_printf(m, "%40s %14lu %29s %s\n", name,
+                               stats->contending_point[i],
+                               ip, sym);
+       }
        if (i) {
                seq_puts(m, "\n");
                seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
@@ -582,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
 
 static void seq_header(struct seq_file *m)
 {
-       seq_printf(m, "lock_stat version 0.2\n");
+       seq_printf(m, "lock_stat version 0.3\n");
        seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
        seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
                        "%14s %14s\n",
index e9c6b2bc9400627cf183382ee55933333f0ee83b..ea54f2647868726428faa630114d6fcd673c4735 100644 (file)
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex);
  */
 #define MARKER_HASH_BITS 6
 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
+static struct hlist_head marker_table[MARKER_TABLE_SIZE];
 
 /*
  * Note about RCU :
@@ -64,11 +65,10 @@ struct marker_entry {
        void *oldptr;
        int rcu_pending;
        unsigned char ptype:1;
+       unsigned char format_allocated:1;
        char name[0];   /* Contains name'\0'format'\0' */
 };
 
-static struct hlist_head marker_table[MARKER_TABLE_SIZE];
-
 /**
  * __mark_empty_function - Empty probe callback
  * @probe_private: probe private data
@@ -81,7 +81,7 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
  * though the function pointer change and the marker enabling are two distinct
  * operations that modifies the execution flow of preemptible code.
  */
-void __mark_empty_function(void *probe_private, void *call_private,
+notrace void __mark_empty_function(void *probe_private, void *call_private,
        const char *fmt, va_list *args)
 {
 }
@@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
  * need to put a full smp_rmb() in this branch. This is why we do not use
  * rcu_dereference() for the pointer read.
  */
-void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
+notrace void marker_probe_cb(const struct marker *mdata,
+               void *call_private, ...)
 {
        va_list args;
        char ptype;
@@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
         * sure the teardown of the callbacks can be done correctly when they
         * are in modules and they insure RCU read coherency.
         */
-       rcu_read_lock_sched();
+       rcu_read_lock_sched_notrace();
        ptype = mdata->ptype;
        if (likely(!ptype)) {
                marker_probe_func *func;
@@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
                        va_end(args);
                }
        }
-       rcu_read_unlock_sched();
+       rcu_read_unlock_sched_notrace();
 }
 EXPORT_SYMBOL_GPL(marker_probe_cb);
 
@@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
  *
  * Should be connected to markers "MARK_NOARGS".
  */
-void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
+static notrace void marker_probe_cb_noarg(const struct marker *mdata,
+               void *call_private, ...)
 {
        va_list args;   /* not initialized */
        char ptype;
 
-       rcu_read_lock_sched();
+       rcu_read_lock_sched_notrace();
        ptype = mdata->ptype;
        if (likely(!ptype)) {
                marker_probe_func *func;
@@ -195,9 +197,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
                        multi[i].func(multi[i].probe_private, call_private,
                                mdata->format, &args);
        }
-       rcu_read_unlock_sched();
+       rcu_read_unlock_sched_notrace();
 }
-EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
 
 static void free_old_closure(struct rcu_head *head)
 {
@@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format)
        e->single.probe_private = NULL;
        e->multi = NULL;
        e->ptype = 0;
+       e->format_allocated = 0;
        e->refcount = 0;
        e->rcu_pending = 0;
        hlist_add_head(&e->hlist, head);
@@ -447,6 +449,8 @@ static int remove_marker(const char *name)
        if (e->single.func != __mark_empty_function)
                return -EBUSY;
        hlist_del(&e->hlist);
+       if (e->format_allocated)
+               kfree(e->format);
        /* Make sure the call_rcu has been executed */
        if (e->rcu_pending)
                rcu_barrier_sched();
@@ -457,57 +461,34 @@ static int remove_marker(const char *name)
 /*
  * Set the mark_entry format to the format found in the element.
  */
-static int marker_set_format(struct marker_entry **entry, const char *format)
+static int marker_set_format(struct marker_entry *entry, const char *format)
 {
-       struct marker_entry *e;
-       size_t name_len = strlen((*entry)->name) + 1;
-       size_t format_len = strlen(format) + 1;
-
-
-       e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
-                       GFP_KERNEL);
-       if (!e)
+       entry->format = kstrdup(format, GFP_KERNEL);
+       if (!entry->format)
                return -ENOMEM;
-       memcpy(&e->name[0], (*entry)->name, name_len);
-       e->format = &e->name[name_len];
-       memcpy(e->format, format, format_len);
-       if (strcmp(e->format, MARK_NOARGS) == 0)
-               e->call = marker_probe_cb_noarg;
-       else
-               e->call = marker_probe_cb;
-       e->single = (*entry)->single;
-       e->multi = (*entry)->multi;
-       e->ptype = (*entry)->ptype;
-       e->refcount = (*entry)->refcount;
-       e->rcu_pending = 0;
-       hlist_add_before(&e->hlist, &(*entry)->hlist);
-       hlist_del(&(*entry)->hlist);
-       /* Make sure the call_rcu has been executed */
-       if ((*entry)->rcu_pending)
-               rcu_barrier_sched();
-       kfree(*entry);
-       *entry = e;
+       entry->format_allocated = 1;
+
        trace_mark(core_marker_format, "name %s format %s",
-                       e->name, e->format);
+                       entry->name, entry->format);
        return 0;
 }
 
 /*
  * Sets the probe callback corresponding to one marker.
  */
-static int set_marker(struct marker_entry **entry, struct marker *elem,
+static int set_marker(struct marker_entry *entry, struct marker *elem,
                int active)
 {
-       int ret;
-       WARN_ON(strcmp((*entry)->name, elem->name) != 0);
+       int ret = 0;
+       WARN_ON(strcmp(entry->name, elem->name) != 0);
 
-       if ((*entry)->format) {
-               if (strcmp((*entry)->format, elem->format) != 0) {
+       if (entry->format) {
+               if (strcmp(entry->format, elem->format) != 0) {
                        printk(KERN_NOTICE
                                "Format mismatch for probe %s "
                                "(%s), marker (%s)\n",
-                               (*entry)->name,
-                               (*entry)->format,
+                               entry->name,
+                               entry->format,
                                elem->format);
                        return -EPERM;
                }
@@ -523,37 +504,67 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
         * pass from a "safe" callback (with argument) to an "unsafe"
         * callback (does not set arguments).
         */
-       elem->call = (*entry)->call;
+       elem->call = entry->call;
        /*
         * Sanity check :
         * We only update the single probe private data when the ptr is
         * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
         */
        WARN_ON(elem->single.func != __mark_empty_function
-               && elem->single.probe_private
-               != (*entry)->single.probe_private &&
-               !elem->ptype);
-       elem->single.probe_private = (*entry)->single.probe_private;
+               && elem->single.probe_private != entry->single.probe_private
+               && !elem->ptype);
+       elem->single.probe_private = entry->single.probe_private;
        /*
         * Make sure the private data is valid when we update the
         * single probe ptr.
         */
        smp_wmb();
-       elem->single.func = (*entry)->single.func;
+       elem->single.func = entry->single.func;
        /*
         * We also make sure that the new probe callbacks array is consistent
         * before setting a pointer to it.
         */
-       rcu_assign_pointer(elem->multi, (*entry)->multi);
+       rcu_assign_pointer(elem->multi, entry->multi);
        /*
         * Update the function or multi probe array pointer before setting the
         * ptype.
         */
        smp_wmb();
-       elem->ptype = (*entry)->ptype;
+       elem->ptype = entry->ptype;
+
+       if (elem->tp_name && (active ^ elem->state)) {
+               WARN_ON(!elem->tp_cb);
+               /*
+                * It is ok to directly call the probe registration because type
+                * checking has been done in the __trace_mark_tp() macro.
+                */
+
+               if (active) {
+                       /*
+                        * try_module_get should always succeed because we hold
+                        * lock_module() to get the tp_cb address.
+                        */
+                       ret = try_module_get(__module_text_address(
+                               (unsigned long)elem->tp_cb));
+                       BUG_ON(!ret);
+                       ret = tracepoint_probe_register_noupdate(
+                               elem->tp_name,
+                               elem->tp_cb);
+               } else {
+                       ret = tracepoint_probe_unregister_noupdate(
+                               elem->tp_name,
+                               elem->tp_cb);
+                       /*
+                        * tracepoint_probe_update_all() must be called
+                        * before the module containing tp_cb is unloaded.
+                        */
+                       module_put(__module_text_address(
+                               (unsigned long)elem->tp_cb));
+               }
+       }
        elem->state = active;
 
-       return 0;
+       return ret;
 }
 
 /*
@@ -564,7 +575,24 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
  */
 static void disable_marker(struct marker *elem)
 {
+       int ret;
+
        /* leave "call" as is. It is known statically. */
+       if (elem->tp_name && elem->state) {
+               WARN_ON(!elem->tp_cb);
+               /*
+                * It is ok to directly call the probe registration because type
+                * checking has been done in the __trace_mark_tp() macro.
+                */
+               ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
+                       elem->tp_cb);
+               WARN_ON(ret);
+               /*
+                * tracepoint_probe_update_all() must be called
+                * before the module containing tp_cb is unloaded.
+                */
+               module_put(__module_text_address((unsigned long)elem->tp_cb));
+       }
        elem->state = 0;
        elem->single.func = __mark_empty_function;
        /* Update the function before setting the ptype */
@@ -594,8 +622,7 @@ void marker_update_probe_range(struct marker *begin,
        for (iter = begin; iter < end; iter++) {
                mark_entry = get_marker(iter->name);
                if (mark_entry) {
-                       set_marker(&mark_entry, iter,
-                                       !!mark_entry->refcount);
+                       set_marker(mark_entry, iter, !!mark_entry->refcount);
                        /*
                         * ignore error, continue
                         */
@@ -629,6 +656,7 @@ static void marker_update_probes(void)
        marker_update_probe_range(__start___markers, __stop___markers);
        /* Markers in modules. */
        module_update_markers();
+       tracepoint_probe_update_all();
 }
 
 /**
@@ -657,7 +685,7 @@ int marker_probe_register(const char *name, const char *format,
                        ret = PTR_ERR(entry);
        } else if (format) {
                if (!entry->format)
-                       ret = marker_set_format(&entry, format);
+                       ret = marker_set_format(entry, format);
                else if (strcmp(entry->format, format))
                        ret = -EPERM;
        }
@@ -676,10 +704,11 @@ int marker_probe_register(const char *name, const char *format,
                goto end;
        }
        mutex_unlock(&markers_mutex);
-       marker_update_probes();         /* may update entry */
+       marker_update_probes();
        mutex_lock(&markers_mutex);
        entry = get_marker(name);
-       WARN_ON(!entry);
+       if (!entry)
+               goto end;
        if (entry->rcu_pending)
                rcu_barrier_sched();
        entry->oldptr = old;
@@ -720,7 +749,7 @@ int marker_probe_unregister(const char *name,
                rcu_barrier_sched();
        old = marker_entry_remove_probe(entry, probe, probe_private);
        mutex_unlock(&markers_mutex);
-       marker_update_probes();         /* may update entry */
+       marker_update_probes();
        mutex_lock(&markers_mutex);
        entry = get_marker(name);
        if (!entry)
@@ -801,10 +830,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
                rcu_barrier_sched();
        old = marker_entry_remove_probe(entry, NULL, probe_private);
        mutex_unlock(&markers_mutex);
-       marker_update_probes();         /* may update entry */
+       marker_update_probes();
        mutex_lock(&markers_mutex);
        entry = get_marker_from_private_data(probe, probe_private);
-       WARN_ON(!entry);
+       if (!entry)
+               goto end;
        if (entry->rcu_pending)
                rcu_barrier_sched();
        entry->oldptr = old;
@@ -848,8 +878,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
                        if (!e->ptype) {
                                if (num == 0 && e->single.func == probe)
                                        return e->single.probe_private;
-                               else
-                                       break;
                        } else {
                                struct marker_probe_closure *closure;
                                int match = 0;
@@ -861,8 +889,42 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
                                                return closure[i].probe_private;
                                }
                        }
+                       break;
                }
        }
        return ERR_PTR(-ENOENT);
 }
 EXPORT_SYMBOL_GPL(marker_get_private_data);
+
+#ifdef CONFIG_MODULES
+
+int marker_module_notify(struct notifier_block *self,
+                        unsigned long val, void *data)
+{
+       struct module *mod = data;
+
+       switch (val) {
+       case MODULE_STATE_COMING:
+               marker_update_probe_range(mod->markers,
+                       mod->markers + mod->num_markers);
+               break;
+       case MODULE_STATE_GOING:
+               marker_update_probe_range(mod->markers,
+                       mod->markers + mod->num_markers);
+               break;
+       }
+       return 0;
+}
+
+struct notifier_block marker_module_nb = {
+       .notifier_call = marker_module_notify,
+       .priority = 0,
+};
+
+static int init_markers(void)
+{
+       return register_module_notifier(&marker_module_nb);
+}
+__initcall(init_markers);
+
+#endif /* CONFIG_MODULES */
index 1f4cc00e0c200b7c69272694d121558fd6a10d06..dd2a54155b54ab45df7ac5a9e7e13344e4611b1e 100644 (file)
@@ -2184,24 +2184,15 @@ static noinline struct module *load_module(void __user *umod,
                struct mod_debug *debug;
                unsigned int num_debug;
 
-#ifdef CONFIG_MARKERS
-               marker_update_probe_range(mod->markers,
-                       mod->markers + mod->num_markers);
-#endif
                debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
                                     sizeof(*debug), &num_debug);
                dynamic_printk_setup(debug, num_debug);
-
-#ifdef CONFIG_TRACEPOINTS
-               tracepoint_update_probe_range(mod->tracepoints,
-                       mod->tracepoints + mod->num_tracepoints);
-#endif
        }
 
        /* sechdrs[0].sh_size is always zero */
        mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
                            sizeof(*mseg), &num_mcount);
-       ftrace_init_module(mseg, mseg + num_mcount);
+       ftrace_init_module(mod, mseg, mseg + num_mcount);
 
        err = module_finalize(hdr, sechdrs, mod);
        if (err < 0)
@@ -2713,7 +2704,7 @@ int is_module_address(unsigned long addr)
 
 
 /* Is this a valid kernel address? */
-struct module *__module_text_address(unsigned long addr)
+__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
 {
        struct module *mod;
 
index 12c779dc65d48a56f1941cc41d73f9593828cf0a..4f45d4b658ef6ab1fda9357725b01361a6768a76 100644 (file)
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
  * We also put the fastpath first in the kernel image, to make sure the
  * branch is predicted by the CPU as default-untaken.
  */
-static void noinline __sched
+static __used noinline void __sched
 __mutex_lock_slowpath(atomic_t *lock_count);
 
 /***
@@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock)
 EXPORT_SYMBOL(mutex_lock);
 #endif
 
-static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
+static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
 
 /***
  * mutex_unlock - release the mutex
@@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
        }
 
 done:
-       lock_acquired(&lock->dep_map);
+       lock_acquired(&lock->dep_map, ip);
        /* got the lock - rejoice! */
        mutex_remove_waiter(lock, &waiter, task_thread_info(task));
        debug_mutex_set_owner(lock, task_thread_info(task));
@@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
 /*
  * Release the lock, slowpath:
  */
-static noinline void
+static __used noinline void
 __mutex_unlock_slowpath(atomic_t *lock_count)
 {
        __mutex_unlock_common_slowpath(lock_count, 1);
@@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
 }
 EXPORT_SYMBOL(mutex_lock_killable);
 
-static noinline void __sched
+static __used noinline void __sched
 __mutex_lock_slowpath(atomic_t *lock_count)
 {
        struct mutex *lock = container_of(lock_count, struct mutex, count);
index 4282c0a40a57ada651b86c7dcce2389abf489448..61d5aa5eced3466393582e4f566b63c468ea7cc3 100644 (file)
@@ -82,6 +82,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
 
        while (nb && nr_to_call) {
                next_nb = rcu_dereference(nb->next);
+
+#ifdef CONFIG_DEBUG_NOTIFIERS
+               if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
+                       WARN(1, "Invalid notifier called!");
+                       nb = next_nb;
+                       continue;
+               }
+#endif
                ret = nb->notifier_call(nb, val, v);
 
                if (nr_calls)
index 4e5288a831de2e696260c12f97d60fdbcfc551a3..157de3a478321ab80c8c05cee5ffd07da68bab39 100644 (file)
@@ -58,21 +58,21 @@ void thread_group_cputime(
        struct task_struct *tsk,
        struct task_cputime *times)
 {
-       struct signal_struct *sig;
+       struct task_cputime *totals, *tot;
        int i;
-       struct task_cputime *tot;
 
-       sig = tsk->signal;
-       if (unlikely(!sig) || !sig->cputime.totals) {
+       totals = tsk->signal->cputime.totals;
+       if (!totals) {
                times->utime = tsk->utime;
                times->stime = tsk->stime;
                times->sum_exec_runtime = tsk->se.sum_exec_runtime;
                return;
        }
+
        times->stime = times->utime = cputime_zero;
        times->sum_exec_runtime = 0;
        for_each_possible_cpu(i) {
-               tot = per_cpu_ptr(tsk->signal->cputime.totals, i);
+               tot = per_cpu_ptr(totals, i);
                times->utime = cputime_add(times->utime, tot->utime);
                times->stime = cputime_add(times->stime, tot->stime);
                times->sum_exec_runtime += tot->sum_exec_runtime;
index c9d74083746f8839f96a5a37888302c41245797e..f77d3819ef57b2407012fbbf431f27add286fe91 100644 (file)
@@ -22,7 +22,6 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
-#include <linux/ftrace.h>
 
 #include "power.h"
 
@@ -257,7 +256,7 @@ static int create_image(int platform_mode)
 
 int hibernation_snapshot(int platform_mode)
 {
-       int error, ftrace_save;
+       int error;
 
        /* Free memory before shutting down devices. */
        error = swsusp_shrink_memory();
@@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode)
                goto Close;
 
        suspend_console();
-       ftrace_save = __ftrace_enabled_save();
        error = device_suspend(PMSG_FREEZE);
        if (error)
                goto Recover_platform;
@@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode)
  Resume_devices:
        device_resume(in_suspend ?
                (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
-       __ftrace_enabled_restore(ftrace_save);
        resume_console();
  Close:
        platform_end(platform_mode);
@@ -370,11 +367,10 @@ static int resume_target_kernel(void)
 
 int hibernation_restore(int platform_mode)
 {
-       int error, ftrace_save;
+       int error;
 
        pm_prepare_console();
        suspend_console();
-       ftrace_save = __ftrace_enabled_save();
        error = device_suspend(PMSG_QUIESCE);
        if (error)
                goto Finish;
@@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode)
        platform_restore_cleanup(platform_mode);
        device_resume(PMSG_RECOVER);
  Finish:
-       __ftrace_enabled_restore(ftrace_save);
        resume_console();
        pm_restore_console();
        return error;
@@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode)
 
 int hibernation_platform_enter(void)
 {
-       int error, ftrace_save;
+       int error;
 
        if (!hibernation_ops)
                return -ENOSYS;
@@ -417,7 +412,6 @@ int hibernation_platform_enter(void)
                goto Close;
 
        suspend_console();
-       ftrace_save = __ftrace_enabled_save();
        error = device_suspend(PMSG_HIBERNATE);
        if (error) {
                if (hibernation_ops->recover)
@@ -452,7 +446,6 @@ int hibernation_platform_enter(void)
        hibernation_ops->finish();
  Resume_devices:
        device_resume(PMSG_RESTORE);
-       __ftrace_enabled_restore(ftrace_save);
        resume_console();
  Close:
        hibernation_ops->end();
index b8f7ce9473e8412099e476dd54a27a1a21bf8ea6..613f16941b853a4ef949eadc2bc9e54668162e70 100644 (file)
@@ -22,7 +22,6 @@
 #include <linux/freezer.h>
 #include <linux/vmstat.h>
 #include <linux/syscalls.h>
-#include <linux/ftrace.h>
 
 #include "power.h"
 
@@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state)
  */
 int suspend_devices_and_enter(suspend_state_t state)
 {
-       int error, ftrace_save;
+       int error;
 
        if (!suspend_ops)
                return -ENOSYS;
@@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state)
                        goto Close;
        }
        suspend_console();
-       ftrace_save = __ftrace_enabled_save();
        suspend_test_start();
        error = device_suspend(PMSG_SUSPEND);
        if (error) {
@@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state)
        suspend_test_start();
        device_resume(PMSG_RESUME);
        suspend_test_finish("resume devices");
-       __ftrace_enabled_restore(ftrace_save);
        resume_console();
  Close:
        if (suspend_ops->end)
index dc41827fbfeea474c809ebc3adb6e54edcedb861..4cb7d68fed82aabce7f74da9ecf30f3bb54b77e1 100644 (file)
@@ -442,7 +442,7 @@ void profile_tick(int type)
 static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
                        int count, int *eof, void *data)
 {
-       int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
+       int len = cpumask_scnprintf(page, count, (cpumask_t *)data);
        if (count - len < 2)
                return -EINVAL;
        len += sprintf(page + len, "\n");
@@ -456,7 +456,7 @@ static int prof_cpu_mask_write_proc(struct file *file,
        unsigned long full_count = count, err;
        cpumask_t new_value;
 
-       err = cpumask_parse_user(buffer, count, new_value);
+       err = cpumask_parse_user(buffer, count, &new_value);
        if (err)
                return err;
 
@@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = {
 };
 
 #ifdef CONFIG_SMP
-static inline void profile_nop(void *unused)
+static void profile_nop(void *unused)
 {
 }
 
index 37f72e551542234d2d7d905741e8a5f6fb7532ec..c03ca3e619190c44292aed969c6c58f947e0b35a 100644 (file)
@@ -191,7 +191,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 
        /* OK, time to rat on our buddy... */
 
-       printk(KERN_ERR "RCU detected CPU stalls:");
+       printk(KERN_ERR "INFO: RCU detected CPU stalls:");
        for_each_possible_cpu(cpu) {
                if (cpu_isset(cpu, rcp->cpumask))
                        printk(" %d", cpu);
@@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
 {
        unsigned long flags;
 
-       printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
+       printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
                        smp_processor_id(), jiffies,
                        jiffies - rcp->gp_start);
        dump_stack();
@@ -393,7 +393,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
                 * unnecessarily.
                 */
                smp_mb();
-               cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
+               cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask);
 
                rcp->signaled = 0;
        }
index e4bb1dd7b308fce6f47e736b70425f4bc3482c0a..b309027bf9e8bf4c57ba2627fc82ad618149df78 100644 (file)
  */
 #define RUNTIME_INF    ((u64)~0ULL)
 
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
 #ifdef CONFIG_SMP
 /*
  * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -261,6 +267,10 @@ struct task_group {
        struct cgroup_subsys_state css;
 #endif
 
+#ifdef CONFIG_USER_SCHED
+       uid_t uid;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
        struct sched_entity **se;
@@ -286,6 +296,12 @@ struct task_group {
 
 #ifdef CONFIG_USER_SCHED
 
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+       user->tg->uid = user->uid;
+}
+
 /*
  * Root task group.
  *     Every UID task group (including init_task_group aka UID-0) will
@@ -481,14 +497,14 @@ struct rt_rq {
  */
 struct root_domain {
        atomic_t refcount;
-       cpumask_t span;
-       cpumask_t online;
+       cpumask_var_t span;
+       cpumask_var_t online;
 
        /*
         * The "RT overload" flag: it gets set if a CPU has more than
         * one runnable RT task.
         */
-       cpumask_t rto_mask;
+       cpumask_var_t rto_mask;
        atomic_t rto_count;
 #ifdef CONFIG_SMP
        struct cpupri cpupri;
@@ -703,45 +719,18 @@ static __read_mostly char *sched_feat_names[] = {
 
 #undef SCHED_FEAT
 
-static int sched_feat_open(struct inode *inode, struct file *filp)
+static int sched_feat_show(struct seq_file *m, void *v)
 {
-       filp->private_data = inode->i_private;
-       return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
-               size_t cnt, loff_t *ppos)
-{
-       char *buf;
-       int r = 0;
-       int len = 0;
        int i;
 
        for (i = 0; sched_feat_names[i]; i++) {
-               len += strlen(sched_feat_names[i]);
-               len += 4;
-       }
-
-       buf = kmalloc(len + 2, GFP_KERNEL);
-       if (!buf)
-               return -ENOMEM;
-
-       for (i = 0; sched_feat_names[i]; i++) {
-               if (sysctl_sched_features & (1UL << i))
-                       r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-               else
-                       r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+               if (!(sysctl_sched_features & (1UL << i)))
+                       seq_puts(m, "NO_");
+               seq_printf(m, "%s ", sched_feat_names[i]);
        }
+       seq_puts(m, "\n");
 
-       r += sprintf(buf + r, "\n");
-       WARN_ON(r >= len + 2);
-
-       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
-       kfree(buf);
-
-       return r;
+       return 0;
 }
 
 static ssize_t
@@ -786,10 +775,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        return cnt;
 }
 
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+       return single_open(filp, sched_feat_show, NULL);
+}
+
 static struct file_operations sched_feat_fops = {
-       .open   = sched_feat_open,
-       .read   = sched_feat_read,
-       .write  = sched_feat_write,
+       .open           = sched_feat_open,
+       .write          = sched_feat_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
 };
 
 static __init int sched_init_debug(void)
@@ -1474,27 +1470,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
                        unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-       int boost = 0;
        unsigned long shares;
        unsigned long rq_weight;
 
        if (!tg->se[cpu])
                return;
 
-       rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-       /*
-        * If there are currently no tasks on the cpu pretend there is one of
-        * average load so that when a new task gets to run here it will not
-        * get delayed by group starvation.
-        */
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
-
-       if (unlikely(rq_weight > sd_rq_weight))
-               rq_weight = sd_rq_weight;
+       rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
        /*
         *           \Sum shares * rq_weight
@@ -1502,7 +1484,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
         *               \Sum rq_weight
         *
         */
-       shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       shares = (sd_shares * rq_weight) / sd_rq_weight;
        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
        if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1511,11 +1493,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                unsigned long flags;
 
                spin_lock_irqsave(&rq->lock, flags);
-               /*
-                * record the actual number of shares, not the boosted amount.
-                */
-               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-               tg->cfs_rq[cpu]->rq_weight = rq_weight;
+               tg->cfs_rq[cpu]->shares = shares;
 
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
@@ -1529,13 +1507,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-       unsigned long rq_weight = 0;
+       unsigned long weight, rq_weight = 0;
        unsigned long shares = 0;
        struct sched_domain *sd = data;
        int i;
 
-       for_each_cpu_mask(i, sd->span) {
-               rq_weight += tg->cfs_rq[i]->load.weight;
+       for_each_cpu(i, sched_domain_span(sd)) {
+               /*
+                * If there are currently no tasks on the cpu pretend there
+                * is one of average load so that when a new task gets to
+                * run here it will not get delayed by group starvation.
+                */
+               weight = tg->cfs_rq[i]->load.weight;
+               if (!weight)
+                       weight = NICE_0_LOAD;
+
+               tg->cfs_rq[i]->rq_weight = weight;
+               rq_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
 
@@ -1545,10 +1533,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
 
-       if (!rq_weight)
-               rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
-       for_each_cpu_mask(i, sd->span)
+       for_each_cpu(i, sched_domain_span(sd))
                update_group_shares_cpu(tg, i, shares, rq_weight);
 
        return 0;
@@ -1612,6 +1597,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+{
+       int ret = 0;
+
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work good under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+       if (unlikely(!spin_trylock(&busiest->lock))) {
+               if (busiest < this_rq) {
+                       spin_unlock(&this_rq->lock);
+                       spin_lock(&busiest->lock);
+                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+                       ret = 1;
+               } else
+                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+       }
+       return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(busiest->lock)
+{
+       spin_unlock(&busiest->lock);
+       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2079,15 +2097,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
                int i;
 
                /* Skip over this group if it has no CPUs allowed */
-               if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+               if (!cpumask_intersects(sched_group_cpus(group),
+                                       &p->cpus_allowed))
                        continue;
 
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
 
                /* Tally up the load of all CPUs in the group */
                avg_load = 0;
 
-               for_each_cpu_mask_nr(i, group->cpumask) {
+               for_each_cpu(i, sched_group_cpus(group)) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = source_load(i, load_idx);
@@ -2119,17 +2139,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
  * find_idlest_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
-               cpumask_t *tmp)
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
        unsigned long load, min_load = ULONG_MAX;
        int idlest = -1;
        int i;
 
        /* Traverse only the allowed CPUs */
-       cpus_and(*tmp, group->cpumask, p->cpus_allowed);
-
-       for_each_cpu_mask_nr(i, *tmp) {
+       for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
                load = weighted_cpuload(i);
 
                if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2171,7 +2188,6 @@ static int sched_balance_self(int cpu, int flag)
                update_shares(sd);
 
        while (sd) {
-               cpumask_t span, tmpmask;
                struct sched_group *group;
                int new_cpu, weight;
 
@@ -2180,14 +2196,13 @@ static int sched_balance_self(int cpu, int flag)
                        continue;
                }
 
-               span = sd->span;
                group = find_idlest_group(sd, t, cpu);
                if (!group) {
                        sd = sd->child;
                        continue;
                }
 
-               new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+               new_cpu = find_idlest_cpu(group, t, cpu);
                if (new_cpu == -1 || new_cpu == cpu) {
                        /* Now try balancing at a lower domain level of cpu */
                        sd = sd->child;
@@ -2196,10 +2211,10 @@ static int sched_balance_self(int cpu, int flag)
 
                /* Now try balancing at a lower domain level of new_cpu */
                cpu = new_cpu;
+               weight = cpumask_weight(sched_domain_span(sd));
                sd = NULL;
-               weight = cpus_weight(span);
                for_each_domain(cpu, tmp) {
-                       if (weight <= cpus_weight(tmp->span))
+                       if (weight <= cpumask_weight(sched_domain_span(tmp)))
                                break;
                        if (tmp->flags & flag)
                                sd = tmp;
@@ -2244,7 +2259,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
                cpu = task_cpu(p);
 
                for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                update_shares(sd);
                                break;
                        }
@@ -2292,7 +2307,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
        else {
                struct sched_domain *sd;
                for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                schedstat_inc(sd, ttwu_wake_remote);
                                break;
                        }
@@ -2811,40 +2826,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
                __release(rq2->lock);
 }
 
-/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(this_rq->lock)
-       __acquires(busiest->lock)
-       __acquires(this_rq->lock)
-{
-       int ret = 0;
-
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
-       if (unlikely(!spin_trylock(&busiest->lock))) {
-               if (busiest < this_rq) {
-                       spin_unlock(&this_rq->lock);
-                       spin_lock(&busiest->lock);
-                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-                       ret = 1;
-               } else
-                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-       }
-       return ret;
-}
-
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(busiest->lock)
-{
-       spin_unlock(&busiest->lock);
-       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -2858,7 +2839,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
        struct rq *rq;
 
        rq = task_rq_lock(p, &flags);
-       if (!cpu_isset(dest_cpu, p->cpus_allowed)
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
 
@@ -2924,7 +2905,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
         * 2) cannot be migrated to this CPU due to cpus_allowed, or
         * 3) are cache-hot on their current CPU.
         */
-       if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+       if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
                schedstat_inc(p, se.nr_failed_migrations_affine);
                return 0;
        }
@@ -3099,7 +3080,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
                   unsigned long *imbalance, enum cpu_idle_type idle,
-                  int *sd_idle, const cpumask_t *cpus, int *balance)
+                  int *sd_idle, const struct cpumask *cpus, int *balance)
 {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3135,10 +3116,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                unsigned long sum_avg_load_per_task;
                unsigned long avg_load_per_task;
 
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
 
                if (local_group)
-                       balance_cpu = first_cpu(group->cpumask);
+                       balance_cpu = cpumask_first(sched_group_cpus(group));
 
                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3147,13 +3129,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                max_cpu_load = 0;
                min_cpu_load = ~0UL;
 
-               for_each_cpu_mask_nr(i, group->cpumask) {
-                       struct rq *rq;
-
-                       if (!cpu_isset(i, *cpus))
-                               continue;
-
-                       rq = cpu_rq(i);
+               for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+                       struct rq *rq = cpu_rq(i);
 
                        if (*sd_idle && rq->nr_running)
                                *sd_idle = 0;
@@ -3264,8 +3241,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 */
                if ((sum_nr_running < min_nr_running) ||
                    (sum_nr_running == min_nr_running &&
-                    first_cpu(group->cpumask) <
-                    first_cpu(group_min->cpumask))) {
+                    cpumask_first(sched_group_cpus(group)) <
+                    cpumask_first(sched_group_cpus(group_min)))) {
                        group_min = group;
                        min_nr_running = sum_nr_running;
                        min_load_per_task = sum_weighted_load /
@@ -3280,8 +3257,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                if (sum_nr_running <= group_capacity - 1) {
                        if (sum_nr_running > leader_nr_running ||
                            (sum_nr_running == leader_nr_running &&
-                            first_cpu(group->cpumask) >
-                             first_cpu(group_leader->cpumask))) {
+                            cpumask_first(sched_group_cpus(group)) >
+                            cpumask_first(sched_group_cpus(group_leader)))) {
                                group_leader = group;
                                leader_nr_running = sum_nr_running;
                        }
@@ -3420,16 +3397,16 @@ ret:
  */
 static struct rq *
 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-                  unsigned long imbalance, const cpumask_t *cpus)
+                  unsigned long imbalance, const struct cpumask *cpus)
 {
        struct rq *busiest = NULL, *rq;
        unsigned long max_load = 0;
        int i;
 
-       for_each_cpu_mask_nr(i, group->cpumask) {
+       for_each_cpu(i, sched_group_cpus(group)) {
                unsigned long wl;
 
-               if (!cpu_isset(i, *cpus))
+               if (!cpumask_test_cpu(i, cpus))
                        continue;
 
                rq = cpu_rq(i);
@@ -3459,7 +3436,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
-                       int *balance, cpumask_t *cpus)
+                       int *balance, struct cpumask *cpus)
 {
        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
@@ -3467,7 +3444,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        struct rq *busiest;
        unsigned long flags;
 
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
 
        /*
         * When power savings policy is enabled for the parent domain, idle
@@ -3527,8 +3504,8 @@ redo:
 
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                goto redo;
                        goto out_balanced;
                }
@@ -3545,7 +3522,8 @@ redo:
                        /* don't kick the migration_thread, if the curr
                         * task on busiest cpu can't be moved to this_cpu
                         */
-                       if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+                       if (!cpumask_test_cpu(this_cpu,
+                                             &busiest->curr->cpus_allowed)) {
                                spin_unlock_irqrestore(&busiest->lock, flags);
                                all_pinned = 1;
                                goto out_one_pinned;
@@ -3620,7 +3598,7 @@ out:
  */
 static int
 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-                       cpumask_t *cpus)
+                       struct cpumask *cpus)
 {
        struct sched_group *group;
        struct rq *busiest = NULL;
@@ -3629,7 +3607,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
        int sd_idle = 0;
        int all_pinned = 0;
 
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
 
        /*
         * When power savings policy is enabled for the parent domain, idle
@@ -3673,8 +3651,8 @@ redo:
                double_unlock_balance(this_rq, busiest);
 
                if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                goto redo;
                }
        }
@@ -3707,9 +3685,12 @@ out_balanced:
 static void idle_balance(int this_cpu, struct rq *this_rq)
 {
        struct sched_domain *sd;
-       int pulled_task = -1;
+       int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
-       cpumask_t tmpmask;
+       cpumask_var_t tmpmask;
+
+       if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+               return;
 
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
@@ -3720,7 +3701,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                if (sd->flags & SD_BALANCE_NEWIDLE)
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                          sd, &tmpmask);
+                                                          sd, tmpmask);
 
                interval = msecs_to_jiffies(sd->balance_interval);
                if (time_after(next_balance, sd->last_balance + interval))
@@ -3735,6 +3716,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                 */
                this_rq->next_balance = next_balance;
        }
+       free_cpumask_var(tmpmask);
 }
 
 /*
@@ -3772,7 +3754,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
        /* Search for an sd spanning us and the target CPU. */
        for_each_domain(target_cpu, sd) {
                if ((sd->flags & SD_LOAD_BALANCE) &&
-                   cpu_isset(busiest_cpu, sd->span))
+                   cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
                                break;
        }
 
@@ -3791,10 +3773,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 #ifdef CONFIG_NO_HZ
 static struct {
        atomic_t load_balancer;
-       cpumask_t cpu_mask;
+       cpumask_var_t cpu_mask;
 } nohz ____cacheline_aligned = {
        .load_balancer = ATOMIC_INIT(-1),
-       .cpu_mask = CPU_MASK_NONE,
 };
 
 /*
@@ -3822,7 +3803,7 @@ int select_nohz_load_balancer(int stop_tick)
        int cpu = smp_processor_id();
 
        if (stop_tick) {
-               cpu_set(cpu, nohz.cpu_mask);
+               cpumask_set_cpu(cpu, nohz.cpu_mask);
                cpu_rq(cpu)->in_nohz_recently = 1;
 
                /*
@@ -3836,7 +3817,7 @@ int select_nohz_load_balancer(int stop_tick)
                }
 
                /* time for ilb owner also to sleep */
-               if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+               if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                        if (atomic_read(&nohz.load_balancer) == cpu)
                                atomic_set(&nohz.load_balancer, -1);
                        return 0;
@@ -3849,10 +3830,10 @@ int select_nohz_load_balancer(int stop_tick)
                } else if (atomic_read(&nohz.load_balancer) == cpu)
                        return 1;
        } else {
-               if (!cpu_isset(cpu, nohz.cpu_mask))
+               if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                        return 0;
 
-               cpu_clear(cpu, nohz.cpu_mask);
+               cpumask_clear_cpu(cpu, nohz.cpu_mask);
 
                if (atomic_read(&nohz.load_balancer) == cpu)
                        if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@ -3880,7 +3861,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
        int need_serialize;
-       cpumask_t tmp;
+       cpumask_var_t tmp;
+
+       /* Fails alloc?  Rebalancing probably not a priority right now. */
+       if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
+               return;
 
        for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3905,7 +3890,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                }
 
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@ -3939,6 +3924,8 @@ out:
         */
        if (likely(update_next_balance))
                rq->next_balance = next_balance;
+
+       free_cpumask_var(tmp);
 }
 
 /*
@@ -3963,12 +3950,13 @@ static void run_rebalance_domains(struct softirq_action *h)
         */
        if (this_rq->idle_at_tick &&
            atomic_read(&nohz.load_balancer) == this_cpu) {
-               cpumask_t cpus = nohz.cpu_mask;
                struct rq *rq;
                int balance_cpu;
 
-               cpu_clear(this_cpu, cpus);
-               for_each_cpu_mask_nr(balance_cpu, cpus) {
+               for_each_cpu(balance_cpu, nohz.cpu_mask) {
+                       if (balance_cpu == this_cpu)
+                               continue;
+
                        /*
                         * If this cpu gets work to do, stop the load balancing
                         * work being done for other cpus. Next load
@@ -4006,7 +3994,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
                rq->in_nohz_recently = 0;
 
                if (atomic_read(&nohz.load_balancer) == cpu) {
-                       cpu_clear(cpu, nohz.cpu_mask);
+                       cpumask_clear_cpu(cpu, nohz.cpu_mask);
                        atomic_set(&nohz.load_balancer, -1);
                }
 
@@ -4019,7 +4007,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
                         * TBD: Traverse the sched domains and nominate
                         * the nearest cpu in the nohz.cpu_mask.
                         */
-                       int ilb = first_cpu(nohz.cpu_mask);
+                       int ilb = cpumask_first(nohz.cpu_mask);
 
                        if (ilb < nr_cpu_ids)
                                resched_cpu(ilb);
@@ -4031,7 +4019,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
         * cpus with ticks stopped, is it time for that to stop?
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-           cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+           cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                resched_cpu(cpu);
                return;
        }
@@ -4041,7 +4029,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
         * someone else, then no need raise the SCHED_SOFTIRQ
         */
        if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-           cpu_isset(cpu, nohz.cpu_mask))
+           cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
 #endif
        if (time_after_eq(jiffies, rq->next_balance))
@@ -4203,7 +4191,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 
        if (p == rq->idle) {
                p->stime = cputime_add(p->stime, steal);
-               account_group_system_time(p, steal);
                if (atomic_read(&rq->nr_iowait) > 0)
                        cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                else
@@ -4339,7 +4326,7 @@ void __kprobes sub_preempt_count(int val)
        /*
         * Underflow?
         */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+       if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
                return;
        /*
         * Is the spinlock portion underflowing?
@@ -5400,10 +5387,9 @@ out_unlock:
        return retval;
 }
 
-long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
-       cpumask_t cpus_allowed;
-       cpumask_t new_mask = *in_mask;
+       cpumask_var_t cpus_allowed, new_mask;
        struct task_struct *p;
        int retval;
 
@@ -5425,6 +5411,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
        get_task_struct(p);
        read_unlock(&tasklist_lock);
 
+       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_put_task;
+       }
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_free_cpus_allowed;
+       }
        retval = -EPERM;
        if ((current->euid != p->euid) && (current->euid != p->uid) &&
                        !capable(CAP_SYS_NICE))
@@ -5434,37 +5428,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
        if (retval)
                goto out_unlock;
 
-       cpuset_cpus_allowed(p, &cpus_allowed);
-       cpus_and(new_mask, new_mask, cpus_allowed);
+       cpuset_cpus_allowed(p, cpus_allowed);
+       cpumask_and(new_mask, in_mask, cpus_allowed);
  again:
-       retval = set_cpus_allowed_ptr(p, &new_mask);
+       retval = set_cpus_allowed_ptr(p, new_mask);
 
        if (!retval) {
-               cpuset_cpus_allowed(p, &cpus_allowed);
-               if (!cpus_subset(new_mask, cpus_allowed)) {
+               cpuset_cpus_allowed(p, cpus_allowed);
+               if (!cpumask_subset(new_mask, cpus_allowed)) {
                        /*
                         * We must have raced with a concurrent cpuset
                         * update. Just reset the cpus_allowed to the
                         * cpuset's cpus_allowed
                         */
-                       new_mask = cpus_allowed;
+                       cpumask_copy(new_mask, cpus_allowed);
                        goto again;
                }
        }
 out_unlock:
+       free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+       free_cpumask_var(cpus_allowed);
+out_put_task:
        put_task_struct(p);
        put_online_cpus();
        return retval;
 }
 
 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
-                            cpumask_t *new_mask)
+                            struct cpumask *new_mask)
 {
-       if (len < sizeof(cpumask_t)) {
-               memset(new_mask, 0, sizeof(cpumask_t));
-       } else if (len > sizeof(cpumask_t)) {
-               len = sizeof(cpumask_t);
-       }
+       if (len < cpumask_size())
+               cpumask_clear(new_mask);
+       else if (len > cpumask_size())
+               len = cpumask_size();
+
        return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
 }
 
@@ -5477,17 +5475,20 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
                                      unsigned long __user *user_mask_ptr)
 {
-       cpumask_t new_mask;
+       cpumask_var_t new_mask;
        int retval;
 
-       retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
-       if (retval)
-               return retval;
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+               return -ENOMEM;
 
-       return sched_setaffinity(pid, &new_mask);
+       retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+       if (retval == 0)
+               retval = sched_setaffinity(pid, new_mask);
+       free_cpumask_var(new_mask);
+       return retval;
 }
 
-long sched_getaffinity(pid_t pid, cpumask_t *mask)
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
 {
        struct task_struct *p;
        int retval;
@@ -5504,7 +5505,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
        if (retval)
                goto out_unlock;
 
-       cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+       cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
 
 out_unlock:
        read_unlock(&tasklist_lock);
@@ -5523,19 +5524,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
                                      unsigned long __user *user_mask_ptr)
 {
        int ret;
-       cpumask_t mask;
+       cpumask_var_t mask;
 
-       if (len < sizeof(cpumask_t))
+       if (len < cpumask_size())
                return -EINVAL;
 
-       ret = sched_getaffinity(pid, &mask);
-       if (ret < 0)
-               return ret;
+       if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+               return -ENOMEM;
 
-       if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
-               return -EFAULT;
+       ret = sched_getaffinity(pid, mask);
+       if (ret == 0) {
+               if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+                       ret = -EFAULT;
+               else
+                       ret = cpumask_size();
+       }
+       free_cpumask_var(mask);
 
-       return sizeof(cpumask_t);
+       return ret;
 }
 
 /**
@@ -5877,7 +5883,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
        idle->se.exec_start = sched_clock();
 
        idle->prio = idle->normal_prio = MAX_PRIO;
-       idle->cpus_allowed = cpumask_of_cpu(cpu);
+       cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
        __set_task_cpu(idle, cpu);
 
        rq->curr = rq->idle = idle;
@@ -5896,6 +5902,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         * The idle tasks have their own, simple scheduling class:
         */
        idle->sched_class = &idle_sched_class;
+       ftrace_graph_init_task(idle);
 }
 
 /*
@@ -5903,9 +5910,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
  * indicates which cpus entered this state. This is used
  * in the rcu update to wait only for active cpus. For system
  * which do not switch off the HZ timer nohz_cpu_mask should
- * always be CPU_MASK_NONE.
+ * always be CPU_BITS_NONE.
  */
-cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+cpumask_var_t nohz_cpu_mask;
 
 /*
  * Increase the granularity value when there are more CPUs,
@@ -5960,7 +5967,7 @@ static inline void sched_init_granularity(void)
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 {
        struct migration_req req;
        unsigned long flags;
@@ -5968,13 +5975,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
        int ret = 0;
 
        rq = task_rq_lock(p, &flags);
-       if (!cpus_intersects(*new_mask, cpu_online_map)) {
+       if (!cpumask_intersects(new_mask, cpu_online_mask)) {
                ret = -EINVAL;
                goto out;
        }
 
        if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
-                    !cpus_equal(p->cpus_allowed, *new_mask))) {
+                    !cpumask_equal(&p->cpus_allowed, new_mask))) {
                ret = -EINVAL;
                goto out;
        }
@@ -5982,15 +5989,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
        if (p->sched_class->set_cpus_allowed)
                p->sched_class->set_cpus_allowed(p, new_mask);
        else {
-               p->cpus_allowed = *new_mask;
-               p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
+               cpumask_copy(&p->cpus_allowed, new_mask);
+               p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
        }
 
        /* Can the task run on the task's current CPU? If so, we're done */
-       if (cpu_isset(task_cpu(p), *new_mask))
+       if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
 
-       if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
+       if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                /* Need help from migration thread: drop lock and wait. */
                task_rq_unlock(rq, &flags);
                wake_up_process(rq->migration_thread);
@@ -6032,7 +6039,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
        if (task_cpu(p) != src_cpu)
                goto done;
        /* Affinity changed (again). */
-       if (!cpu_isset(dest_cpu, p->cpus_allowed))
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                goto fail;
 
        on_rq = p->se.on_rq;
@@ -6126,54 +6133,46 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
-       unsigned long flags;
-       cpumask_t mask;
-       struct rq *rq;
        int dest_cpu;
+       /* FIXME: Use cpumask_of_node here. */
+       cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
+       const struct cpumask *nodemask = &_nodemask;
+
+again:
+       /* Look for allowed, online CPU in same node. */
+       for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+               if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+                       goto move;
+
+       /* Any allowed, online CPU? */
+       dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+       if (dest_cpu < nr_cpu_ids)
+               goto move;
+
+       /* No more Mr. Nice Guy. */
+       if (dest_cpu >= nr_cpu_ids) {
+               cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+               dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
 
-       do {
-               /* On same node? */
-               mask = node_to_cpumask(cpu_to_node(dead_cpu));
-               cpus_and(mask, mask, p->cpus_allowed);
-               dest_cpu = any_online_cpu(mask);
-
-               /* On any allowed CPU? */
-               if (dest_cpu >= nr_cpu_ids)
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
-
-               /* No more Mr. Nice Guy. */
-               if (dest_cpu >= nr_cpu_ids) {
-                       cpumask_t cpus_allowed;
-
-                       cpuset_cpus_allowed_locked(p, &cpus_allowed);
-                       /*
-                        * Try to stay on the same cpuset, where the
-                        * current cpuset may be a subset of all cpus.
-                        * The cpuset_cpus_allowed_locked() variant of
-                        * cpuset_cpus_allowed() will not block. It must be
-                        * called within calls to cpuset_lock/cpuset_unlock.
-                        */
-                       rq = task_rq_lock(p, &flags);
-                       p->cpus_allowed = cpus_allowed;
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
-                       task_rq_unlock(rq, &flags);
-
-                       /*
-                        * Don't tell them about moving exiting tasks or
-                        * kernel threads (both mm NULL), since they never
-                        * leave kernel.
-                        */
-                       if (p->mm && printk_ratelimit()) {
-                               printk(KERN_INFO "process %d (%s) no "
-                                      "longer affine to cpu%d\n",
-                                       task_pid_nr(p), p->comm, dead_cpu);
-                       }
+               /*
+                * Don't tell them about moving exiting tasks or
+                * kernel threads (both mm NULL), since they never
+                * leave kernel.
+                */
+               if (p->mm && printk_ratelimit()) {
+                       printk(KERN_INFO "process %d (%s) no "
+                              "longer affine to cpu%d\n",
+                              task_pid_nr(p), p->comm, dead_cpu);
                }
-       } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
+       }
+
+move:
+       /* It can have affinity changed while we were choosing. */
+       if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
+               goto again;
 }
 
 /*
@@ -6185,7 +6184,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  */
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
-       struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
+       struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
        unsigned long flags;
 
        local_irq_save(flags);
@@ -6475,7 +6474,7 @@ static void set_rq_online(struct rq *rq)
        if (!rq->online) {
                const struct sched_class *class;
 
-               cpu_set(rq->cpu, rq->rd->online);
+               cpumask_set_cpu(rq->cpu, rq->rd->online);
                rq->online = 1;
 
                for_each_class(class) {
@@ -6495,7 +6494,7 @@ static void set_rq_offline(struct rq *rq)
                                class->rq_offline(rq);
                }
 
-               cpu_clear(rq->cpu, rq->rd->online);
+               cpumask_clear_cpu(rq->cpu, rq->rd->online);
                rq->online = 0;
        }
 }
@@ -6536,7 +6535,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
                        set_rq_online(rq);
                }
@@ -6550,7 +6549,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                        break;
                /* Unbind it from offline cpu so it can run. Fall thru. */
                kthread_bind(cpu_rq(cpu)->migration_thread,
-                            any_online_cpu(cpu_online_map));
+                            cpumask_any(cpu_online_mask));
                kthread_stop(cpu_rq(cpu)->migration_thread);
                cpu_rq(cpu)->migration_thread = NULL;
                break;
@@ -6600,7 +6599,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
                if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                        set_rq_offline(rq);
                }
                spin_unlock_irqrestore(&rq->lock, flags);
@@ -6638,36 +6637,14 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
-       switch (lvl) {
-       case SD_LV_NONE:
-                       return "NONE";
-       case SD_LV_SIBLING:
-                       return "SIBLING";
-       case SD_LV_MC:
-                       return "MC";
-       case SD_LV_CPU:
-                       return "CPU";
-       case SD_LV_NODE:
-                       return "NODE";
-       case SD_LV_ALLNODES:
-                       return "ALLNODES";
-       case SD_LV_MAX:
-                       return "MAX";
-
-       }
-       return "MAX";
-}
-
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
-                                 cpumask_t *groupmask)
+                                 struct cpumask *groupmask)
 {
        struct sched_group *group = sd->groups;
        char str[256];
 
-       cpulist_scnprintf(str, sizeof(str), sd->span);
-       cpus_clear(*groupmask);
+       cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
+       cpumask_clear(groupmask);
 
        printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
 
@@ -6679,14 +6656,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                return -1;
        }
 
-       printk(KERN_CONT "span %s level %s\n",
-               str, sd_level_to_string(sd->level));
+       printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
-       if (!cpu_isset(cpu, sd->span)) {
+       if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
                                "CPU%d\n", cpu);
        }
-       if (!cpu_isset(cpu, group->cpumask)) {
+       if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
                printk(KERN_ERR "ERROR: domain->groups does not contain"
                                " CPU%d\n", cpu);
        }
@@ -6706,31 +6682,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                        break;
                }
 
-               if (!cpus_weight(group->cpumask)) {
+               if (!cpumask_weight(sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: empty group\n");
                        break;
                }
 
-               if (cpus_intersects(*groupmask, group->cpumask)) {
+               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
                        printk(KERN_CONT "\n");
                        printk(KERN_ERR "ERROR: repeated CPUs\n");
                        break;
                }
 
-               cpus_or(*groupmask, *groupmask, group->cpumask);
+               cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
-               cpulist_scnprintf(str, sizeof(str), group->cpumask);
+               cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
                printk(KERN_CONT " %s", str);
 
                group = group->next;
        } while (group != sd->groups);
        printk(KERN_CONT "\n");
 
-       if (!cpus_equal(sd->span, *groupmask))
+       if (!cpumask_equal(sched_domain_span(sd), groupmask))
                printk(KERN_ERR "ERROR: groups don't span domain->span\n");
 
-       if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
+       if (sd->parent &&
+           !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
                printk(KERN_ERR "ERROR: parent span is not a superset "
                        "of domain->span\n");
        return 0;
@@ -6738,7 +6715,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
 static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
-       cpumask_t *groupmask;
+       cpumask_var_t groupmask;
        int level = 0;
 
        if (!sd) {
@@ -6748,8 +6725,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 
        printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
 
-       groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
-       if (!groupmask) {
+       if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
                printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
                return;
        }
@@ -6762,7 +6738,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
                if (!sd)
                        break;
        }
-       kfree(groupmask);
+       free_cpumask_var(groupmask);
 }
 #else /* !CONFIG_SCHED_DEBUG */
 # define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6770,7 +6746,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 
 static int sd_degenerate(struct sched_domain *sd)
 {
-       if (cpus_weight(sd->span) == 1)
+       if (cpumask_weight(sched_domain_span(sd)) == 1)
                return 1;
 
        /* Following flags need at least 2 groups */
@@ -6801,7 +6777,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
        if (sd_degenerate(parent))
                return 1;
 
-       if (!cpus_equal(sd->span, parent->span))
+       if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
                return 0;
 
        /* Does parent contain flags not in child? */
@@ -6816,6 +6792,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                SD_BALANCE_EXEC |
                                SD_SHARE_CPUPOWER |
                                SD_SHARE_PKG_RESOURCES);
+               if (nr_node_ids == 1)
+                       pflags &= ~SD_SERIALIZE;
        }
        if (~cflags & pflags)
                return 0;
@@ -6823,6 +6801,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
        return 1;
 }
 
+static void free_rootdomain(struct root_domain *rd)
+{
+       cpupri_cleanup(&rd->cpupri);
+
+       free_cpumask_var(rd->rto_mask);
+       free_cpumask_var(rd->online);
+       free_cpumask_var(rd->span);
+       kfree(rd);
+}
+
 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 {
        unsigned long flags;
@@ -6832,38 +6820,63 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
        if (rq->rd) {
                struct root_domain *old_rd = rq->rd;
 
-               if (cpu_isset(rq->cpu, old_rd->online))
+               if (cpumask_test_cpu(rq->cpu, old_rd->online))
                        set_rq_offline(rq);
 
-               cpu_clear(rq->cpu, old_rd->span);
+               cpumask_clear_cpu(rq->cpu, old_rd->span);
 
                if (atomic_dec_and_test(&old_rd->refcount))
-                       kfree(old_rd);
+                       free_rootdomain(old_rd);
        }
 
        atomic_inc(&rd->refcount);
        rq->rd = rd;
 
-       cpu_set(rq->cpu, rd->span);
-       if (cpu_isset(rq->cpu, cpu_online_map))
+       cpumask_set_cpu(rq->cpu, rd->span);
+       if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
                set_rq_online(rq);
 
        spin_unlock_irqrestore(&rq->lock, flags);
 }
 
-static void init_rootdomain(struct root_domain *rd)
+static int init_rootdomain(struct root_domain *rd, bool bootmem)
 {
        memset(rd, 0, sizeof(*rd));
 
-       cpus_clear(rd->span);
-       cpus_clear(rd->online);
+       if (bootmem) {
+               alloc_bootmem_cpumask_var(&def_root_domain.span);
+               alloc_bootmem_cpumask_var(&def_root_domain.online);
+               alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
+               cpupri_init(&rd->cpupri, true);
+               return 0;
+       }
+
+       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+               goto free_rd;
+       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+               goto free_span;
+       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+               goto free_online;
+
+       if (cpupri_init(&rd->cpupri, false) != 0)
+               goto free_rto_mask;
+       return 0;
 
-       cpupri_init(&rd->cpupri);
+free_rto_mask:
+       free_cpumask_var(rd->rto_mask);
+free_online:
+       free_cpumask_var(rd->online);
+free_span:
+       free_cpumask_var(rd->span);
+free_rd:
+       kfree(rd);
+       return -ENOMEM;
 }
 
 static void init_defrootdomain(void)
 {
-       init_rootdomain(&def_root_domain);
+       init_rootdomain(&def_root_domain, true);
+
        atomic_set(&def_root_domain.refcount, 1);
 }
 
@@ -6875,7 +6888,10 @@ static struct root_domain *alloc_rootdomain(void)
        if (!rd)
                return NULL;
 
-       init_rootdomain(rd);
+       if (init_rootdomain(rd, false) != 0) {
+               kfree(rd);
+               return NULL;
+       }
 
        return rd;
 }
@@ -6917,19 +6933,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 }
 
 /* cpus with isolated domains */
-static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_var_t cpu_isolated_map;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
-       static int __initdata ints[NR_CPUS];
-       int i;
-
-       str = get_options(str, ARRAY_SIZE(ints), ints);
-       cpus_clear(cpu_isolated_map);
-       for (i = 1; i <= ints[0]; i++)
-               if (ints[i] < NR_CPUS)
-                       cpu_set(ints[i], cpu_isolated_map);
+       cpulist_parse(str, cpu_isolated_map);
        return 1;
 }
 
@@ -6938,42 +6947,43 @@ __setup("isolcpus=", isolated_cpu_setup);
 /*
  * init_sched_build_groups takes the cpumask we wish to span, and a pointer
  * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
- * (due to the fact that we keep track of groups covered with a cpumask_t).
+ * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
+ * (due to the fact that we keep track of groups covered with a struct cpumask).
  *
  * init_sched_build_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
 static void
-init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
-                       int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+init_sched_build_groups(const struct cpumask *span,
+                       const struct cpumask *cpu_map,
+                       int (*group_fn)(int cpu, const struct cpumask *cpu_map,
                                        struct sched_group **sg,
-                                       cpumask_t *tmpmask),
-                       cpumask_t *covered, cpumask_t *tmpmask)
+                                       struct cpumask *tmpmask),
+                       struct cpumask *covered, struct cpumask *tmpmask)
 {
        struct sched_group *first = NULL, *last = NULL;
        int i;
 
-       cpus_clear(*covered);
+       cpumask_clear(covered);
 
-       for_each_cpu_mask_nr(i, *span) {
+       for_each_cpu(i, span) {
                struct sched_group *sg;
                int group = group_fn(i, cpu_map, &sg, tmpmask);
                int j;
 
-               if (cpu_isset(i, *covered))
+               if (cpumask_test_cpu(i, covered))
                        continue;
 
-               cpus_clear(sg->cpumask);
+               cpumask_clear(sched_group_cpus(sg));
                sg->__cpu_power = 0;
 
-               for_each_cpu_mask_nr(j, *span) {
+               for_each_cpu(j, span) {
                        if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                                continue;
 
-                       cpu_set(j, *covered);
-                       cpu_set(j, sg->cpumask);
+                       cpumask_set_cpu(j, covered);
+                       cpumask_set_cpu(j, sched_group_cpus(sg));
                }
                if (!first)
                        first = sg;
@@ -7037,9 +7047,10 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
  * should be one that prevents unnecessary balancing, but also spreads tasks
  * out optimally.
  */
-static void sched_domain_node_span(int node, cpumask_t *span)
+static void sched_domain_node_span(int node, struct cpumask *span)
 {
        nodemask_t used_nodes;
+       /* FIXME: use cpumask_of_node() */
        node_to_cpumask_ptr(nodemask, node);
        int i;
 
@@ -7060,19 +7071,34 @@ static void sched_domain_node_span(int node, cpumask_t *span)
 
 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
+/*
+ * The cpus mask in sched_group and sched_domain hangs off the end.
+ * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
+ * for nr_cpu_ids < CONFIG_NR_CPUS.
+ */
+struct static_sched_group {
+       struct sched_group sg;
+       DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
+};
+
+struct static_sched_domain {
+       struct sched_domain sd;
+       DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+};
+
 /*
  * SMT sched-domains:
  */
 #ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
 
 static int
-cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                cpumask_t *unused)
+cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
+                struct sched_group **sg, struct cpumask *unused)
 {
        if (sg)
-               *sg = &per_cpu(sched_group_cpus, cpu);
+               *sg = &per_cpu(sched_group_cpus, cpu).sg;
        return cpu;
 }
 #endif /* CONFIG_SCHED_SMT */
@@ -7081,56 +7107,55 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
  * multi-core sched-domains:
  */
 #ifdef CONFIG_SCHED_MC
-static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
 #endif /* CONFIG_SCHED_MC */
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
-cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
 {
        int group;
 
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
        if (sg)
-               *sg = &per_cpu(sched_group_core, group);
+               *sg = &per_cpu(sched_group_core, group).sg;
        return group;
 }
 #elif defined(CONFIG_SCHED_MC)
 static int
-cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *unused)
+cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *unused)
 {
        if (sg)
-               *sg = &per_cpu(sched_group_core, cpu);
+               *sg = &per_cpu(sched_group_core, cpu).sg;
        return cpu;
 }
 #endif
 
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
 
 static int
-cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
 {
        int group;
 #ifdef CONFIG_SCHED_MC
+       /* FIXME: Use cpu_coregroup_mask. */
        *mask = cpu_coregroup_map(cpu);
        cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
 #else
        group = cpu;
 #endif
        if (sg)
-               *sg = &per_cpu(sched_group_phys, group);
+               *sg = &per_cpu(sched_group_phys, group).sg;
        return group;
 }
 
@@ -7144,19 +7169,21 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group ***sched_group_nodes_bycpu;
 
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
 
-static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
-                                struct sched_group **sg, cpumask_t *nodemask)
+static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+                                struct sched_group **sg,
+                                struct cpumask *nodemask)
 {
        int group;
+       /* FIXME: use cpumask_of_node */
+       node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
 
-       *nodemask = node_to_cpumask(cpu_to_node(cpu));
-       cpus_and(*nodemask, *nodemask, *cpu_map);
-       group = first_cpu(*nodemask);
+       cpumask_and(nodemask, pnodemask, cpu_map);
+       group = cpumask_first(nodemask);
 
        if (sg)
-               *sg = &per_cpu(sched_group_allnodes, group);
+               *sg = &per_cpu(sched_group_allnodes, group).sg;
        return group;
 }
 
@@ -7168,11 +7195,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
        if (!sg)
                return;
        do {
-               for_each_cpu_mask_nr(j, sg->cpumask) {
+               for_each_cpu(j, sched_group_cpus(sg)) {
                        struct sched_domain *sd;
 
-                       sd = &per_cpu(phys_domains, j);
-                       if (j != first_cpu(sd->groups->cpumask)) {
+                       sd = &per_cpu(phys_domains, j).sd;
+                       if (j != cpumask_first(sched_group_cpus(sd->groups))) {
                                /*
                                 * Only add "power" once for each
                                 * physical package.
@@ -7189,11 +7216,12 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 
 #ifdef CONFIG_NUMA
 /* Free memory allocated for various sched_group structures */
-static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
 {
        int cpu, i;
 
-       for_each_cpu_mask_nr(cpu, *cpu_map) {
+       for_each_cpu(cpu, cpu_map) {
                struct sched_group **sched_group_nodes
                        = sched_group_nodes_bycpu[cpu];
 
@@ -7202,10 +7230,11 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
 
                for (i = 0; i < nr_node_ids; i++) {
                        struct sched_group *oldsg, *sg = sched_group_nodes[i];
+                       /* FIXME: Use cpumask_of_node */
+                       node_to_cpumask_ptr(pnodemask, i);
 
-                       *nodemask = node_to_cpumask(i);
-                       cpus_and(*nodemask, *nodemask, *cpu_map);
-                       if (cpus_empty(*nodemask))
+                       cpus_and(*nodemask, *pnodemask, *cpu_map);
+                       if (cpumask_empty(nodemask))
                                continue;
 
                        if (sg == NULL)
@@ -7223,7 +7252,8 @@ next_sg:
        }
 }
 #else /* !CONFIG_NUMA */
-static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
 {
 }
 #endif /* CONFIG_NUMA */
@@ -7249,7 +7279,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
        WARN_ON(!sd || !sd->groups);
 
-       if (cpu != first_cpu(sd->groups->cpumask))
+       if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
                return;
 
        child = sd->child;
@@ -7314,40 +7344,6 @@ SD_INIT_FUNC(CPU)
  SD_INIT_FUNC(MC)
 #endif
 
-/*
- * To minimize stack usage kmalloc room for cpumasks and share the
- * space as the usage in build_sched_domains() dictates.  Used only
- * if the amount of space is significant.
- */
-struct allmasks {
-       cpumask_t tmpmask;                      /* make this one first */
-       union {
-               cpumask_t nodemask;
-               cpumask_t this_sibling_map;
-               cpumask_t this_core_map;
-       };
-       cpumask_t send_covered;
-
-#ifdef CONFIG_NUMA
-       cpumask_t domainspan;
-       cpumask_t covered;
-       cpumask_t notcovered;
-#endif
-};
-
-#if    NR_CPUS > 128
-#define        SCHED_CPUMASK_ALLOC             1
-#define        SCHED_CPUMASK_FREE(v)           kfree(v)
-#define        SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
-#else
-#define        SCHED_CPUMASK_ALLOC             0
-#define        SCHED_CPUMASK_FREE(v)
-#define        SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
-#endif
-
-#define        SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
-                       ((unsigned long)(a) + offsetof(struct allmasks, v))
-
 static int default_relax_domain_level = -1;
 
 static int __init setup_relax_domain_level(char *str)
@@ -7387,17 +7383,38 @@ static void set_domain_attribute(struct sched_domain *sd,
  * Build sched domains for a given set of cpus and attach the sched domains
  * to the individual cpus
  */
-static int __build_sched_domains(const cpumask_t *cpu_map,
+static int __build_sched_domains(const struct cpumask *cpu_map,
                                 struct sched_domain_attr *attr)
 {
-       int i;
+       int i, err = -ENOMEM;
        struct root_domain *rd;
-       SCHED_CPUMASK_DECLARE(allmasks);
-       cpumask_t *tmpmask;
+       cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
+               tmpmask;
 #ifdef CONFIG_NUMA
+       cpumask_var_t domainspan, covered, notcovered;
        struct sched_group **sched_group_nodes = NULL;
        int sd_allnodes = 0;
 
+       if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
+               goto out;
+       if (!alloc_cpumask_var(&covered, GFP_KERNEL))
+               goto free_domainspan;
+       if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
+               goto free_covered;
+#endif
+
+       if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
+               goto free_notcovered;
+       if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
+               goto free_nodemask;
+       if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
+               goto free_this_sibling_map;
+       if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
+               goto free_this_core_map;
+       if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               goto free_send_covered;
+
+#ifdef CONFIG_NUMA
        /*
         * Allocate the per-node list of sched groups
         */
@@ -7405,55 +7422,37 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                                    GFP_KERNEL);
        if (!sched_group_nodes) {
                printk(KERN_WARNING "Can not alloc sched group node list\n");
-               return -ENOMEM;
+               goto free_tmpmask;
        }
 #endif
 
        rd = alloc_rootdomain();
        if (!rd) {
                printk(KERN_WARNING "Cannot alloc root domain\n");
-#ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
-#endif
-               return -ENOMEM;
+               goto free_sched_groups;
        }
 
-#if SCHED_CPUMASK_ALLOC
-       /* get space for all scratch cpumask variables */
-       allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
-       if (!allmasks) {
-               printk(KERN_WARNING "Cannot alloc cpumask array\n");
-               kfree(rd);
 #ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
-#endif
-               return -ENOMEM;
-       }
-#endif
-       tmpmask = (cpumask_t *)allmasks;
-
-
-#ifdef CONFIG_NUMA
-       sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+       sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
 #endif
 
        /*
         * Set up domains for cpus specified by the cpu_map.
         */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                struct sched_domain *sd = NULL, *p;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
 
+               /* FIXME: use cpumask_of_node */
                *nodemask = node_to_cpumask(cpu_to_node(i));
                cpus_and(*nodemask, *nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
-               if (cpus_weight(*cpu_map) >
-                               SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
+               if (cpumask_weight(cpu_map) >
+                               SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
                        sd = &per_cpu(allnodes_domains, i);
                        SD_INIT(sd, ALLNODES);
                        set_domain_attribute(sd, attr);
-                       sd->span = *cpu_map;
+                       cpumask_copy(sched_domain_span(sd), cpu_map);
                        cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
                        p = sd;
                        sd_allnodes = 1;
@@ -7463,18 +7462,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                sd = &per_cpu(node_domains, i);
                SD_INIT(sd, NODE);
                set_domain_attribute(sd, attr);
-               sched_domain_node_span(cpu_to_node(i), &sd->span);
+               sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
                sd->parent = p;
                if (p)
                        p->child = sd;
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           sched_domain_span(sd), cpu_map);
 #endif
 
                p = sd;
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
                SD_INIT(sd, CPU);
                set_domain_attribute(sd, attr);
-               sd->span = *nodemask;
+               cpumask_copy(sched_domain_span(sd), nodemask);
                sd->parent = p;
                if (p)
                        p->child = sd;
@@ -7482,11 +7482,12 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_MC
                p = sd;
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
                SD_INIT(sd, MC);
                set_domain_attribute(sd, attr);
-               sd->span = cpu_coregroup_map(i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               *sched_domain_span(sd) = cpu_coregroup_map(i);
+               cpumask_and(sched_domain_span(sd),
+                           sched_domain_span(sd), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7494,11 +7495,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_SMT
                p = sd;
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
                SD_INIT(sd, SIBLING);
                set_domain_attribute(sd, attr);
-               sd->span = per_cpu(cpu_sibling_map, i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7507,13 +7508,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_SMT
        /* Set up CPU (sibling) groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-
-               *this_sibling_map = per_cpu(cpu_sibling_map, i);
-               cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
-               if (i != first_cpu(*this_sibling_map))
+       for_each_cpu(i, cpu_map) {
+               cpumask_and(this_sibling_map,
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
+               if (i != cpumask_first(this_sibling_map))
                        continue;
 
                init_sched_build_groups(this_sibling_map, cpu_map,
@@ -7524,13 +7522,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
 #ifdef CONFIG_SCHED_MC
        /* Set up multi-core groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_core_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-
+       for_each_cpu(i, cpu_map) {
+               /* FIXME: Use cpu_coregroup_mask */
                *this_core_map = cpu_coregroup_map(i);
                cpus_and(*this_core_map, *this_core_map, *cpu_map);
-               if (i != first_cpu(*this_core_map))
+               if (i != cpumask_first(this_core_map))
                        continue;
 
                init_sched_build_groups(this_core_map, cpu_map,
@@ -7541,12 +7537,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
        /* Set up physical groups */
        for (i = 0; i < nr_node_ids; i++) {
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-
+               /* FIXME: Use cpumask_of_node */
                *nodemask = node_to_cpumask(i);
                cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask))
+               if (cpumask_empty(nodemask))
                        continue;
 
                init_sched_build_groups(nodemask, cpu_map,
@@ -7557,8 +7551,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #ifdef CONFIG_NUMA
        /* Set up node groups */
        if (sd_allnodes) {
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
-
                init_sched_build_groups(cpu_map, cpu_map,
                                        &cpu_to_allnodes_group,
                                        send_covered, tmpmask);
@@ -7567,58 +7559,58 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
        for (i = 0; i < nr_node_ids; i++) {
                /* Set up node groups */
                struct sched_group *sg, *prev;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(domainspan, allmasks);
-               SCHED_CPUMASK_VAR(covered, allmasks);
                int j;
 
+               /* FIXME: Use cpumask_of_node */
                *nodemask = node_to_cpumask(i);
-               cpus_clear(*covered);
+               cpumask_clear(covered);
 
                cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask)) {
+               if (cpumask_empty(nodemask)) {
                        sched_group_nodes[i] = NULL;
                        continue;
                }
 
                sched_domain_node_span(i, domainspan);
-               cpus_and(*domainspan, *domainspan, *cpu_map);
+               cpumask_and(domainspan, domainspan, cpu_map);
 
-               sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+               sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                                 GFP_KERNEL, i);
                if (!sg) {
                        printk(KERN_WARNING "Can not alloc domain group for "
                                "node %d\n", i);
                        goto error;
                }
                sched_group_nodes[i] = sg;
-               for_each_cpu_mask_nr(j, *nodemask) {
+               for_each_cpu(j, nodemask) {
                        struct sched_domain *sd;
 
                        sd = &per_cpu(node_domains, j);
                        sd->groups = sg;
                }
                sg->__cpu_power = 0;
-               sg->cpumask = *nodemask;
+               cpumask_copy(sched_group_cpus(sg), nodemask);
                sg->next = sg;
-               cpus_or(*covered, *covered, *nodemask);
+               cpumask_or(covered, covered, nodemask);
                prev = sg;
 
                for (j = 0; j < nr_node_ids; j++) {
-                       SCHED_CPUMASK_VAR(notcovered, allmasks);
                        int n = (i + j) % nr_node_ids;
+                       /* FIXME: Use cpumask_of_node */
                        node_to_cpumask_ptr(pnodemask, n);
 
-                       cpus_complement(*notcovered, *covered);
-                       cpus_and(*tmpmask, *notcovered, *cpu_map);
-                       cpus_and(*tmpmask, *tmpmask, *domainspan);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_complement(notcovered, covered);
+                       cpumask_and(tmpmask, notcovered, cpu_map);
+                       cpumask_and(tmpmask, tmpmask, domainspan);
+                       if (cpumask_empty(tmpmask))
                                break;
 
-                       cpus_and(*tmpmask, *tmpmask, *pnodemask);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_and(tmpmask, tmpmask, pnodemask);
+                       if (cpumask_empty(tmpmask))
                                continue;
 
-                       sg = kmalloc_node(sizeof(struct sched_group),
+                       sg = kmalloc_node(sizeof(struct sched_group) +
+                                         cpumask_size(),
                                          GFP_KERNEL, i);
                        if (!sg) {
                                printk(KERN_WARNING
@@ -7626,9 +7618,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                                goto error;
                        }
                        sg->__cpu_power = 0;
-                       sg->cpumask = *tmpmask;
+                       cpumask_copy(sched_group_cpus(sg), tmpmask);
                        sg->next = prev->next;
-                       cpus_or(*covered, *covered, *tmpmask);
+                       cpumask_or(covered, covered, tmpmask);
                        prev->next = sg;
                        prev = sg;
                }
@@ -7637,22 +7629,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 
        /* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(cpu_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
 
                init_sched_groups_power(i, sd);
        }
 #endif
 #ifdef CONFIG_SCHED_MC
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(core_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(core_domains, i).sd;
 
                init_sched_groups_power(i, sd);
        }
 #endif
 
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(phys_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
 
                init_sched_groups_power(i, sd);
        }
@@ -7664,56 +7656,87 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
        if (sd_allnodes) {
                struct sched_group *sg;
 
-               cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
+               cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
                                                                tmpmask);
                init_numa_sched_groups_power(sg);
        }
 #endif
 
        /* Attach the domains */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
 #else
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
 #endif
                cpu_attach_domain(sd, rd, i);
        }
 
-       SCHED_CPUMASK_FREE((void *)allmasks);
-       return 0;
+       err = 0;
+
+free_tmpmask:
+       free_cpumask_var(tmpmask);
+free_send_covered:
+       free_cpumask_var(send_covered);
+free_this_core_map:
+       free_cpumask_var(this_core_map);
+free_this_sibling_map:
+       free_cpumask_var(this_sibling_map);
+free_nodemask:
+       free_cpumask_var(nodemask);
+free_notcovered:
+#ifdef CONFIG_NUMA
+       free_cpumask_var(notcovered);
+free_covered:
+       free_cpumask_var(covered);
+free_domainspan:
+       free_cpumask_var(domainspan);
+out:
+#endif
+       return err;
+
+free_sched_groups:
+#ifdef CONFIG_NUMA
+       kfree(sched_group_nodes);
+#endif
+       goto free_tmpmask;
 
 #ifdef CONFIG_NUMA
 error:
        free_sched_groups(cpu_map, tmpmask);
-       SCHED_CPUMASK_FREE((void *)allmasks);
-       kfree(rd);
-       return -ENOMEM;
+       free_rootdomain(rd);
+       goto free_tmpmask;
 #endif
 }
 
-static int build_sched_domains(const cpumask_t *cpu_map)
+static int build_sched_domains(const struct cpumask *cpu_map)
 {
        return __build_sched_domains(cpu_map, NULL);
 }
 
-static cpumask_t *doms_cur;    /* current sched domains */
+static struct cpumask *doms_cur;       /* current sched domains */
 static int ndoms_cur;          /* number of sched domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
                                /* attribues of custom domains in 'doms_cur' */
 
 /*
  * Special case: If a kmalloc of a doms_cur partition (array of
- * cpumask_t) fails, then fallback to a single sched domain,
- * as determined by the single cpumask_t fallback_doms.
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
  */
-static cpumask_t fallback_doms;
+static cpumask_var_t fallback_doms;
 
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
 {
+       return 0;
 }
 
 /*
@@ -7721,16 +7744,16 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
  * For now this just excludes isolated cpus, but could be used to
  * exclude other special cases in the future.
  */
-static int arch_init_sched_domains(const cpumask_t *cpu_map)
+static int arch_init_sched_domains(const struct cpumask *cpu_map)
 {
        int err;
 
        arch_update_cpu_topology();
        ndoms_cur = 1;
-       doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+       doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
        if (!doms_cur)
-               doms_cur = &fallback_doms;
-       cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+               doms_cur = fallback_doms;
+       cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
        dattr_cur = NULL;
        err = build_sched_domains(doms_cur);
        register_sched_domain_sysctl();
@@ -7738,8 +7761,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
        return err;
 }
 
-static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
-                                      cpumask_t *tmpmask)
+static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
+                                      struct cpumask *tmpmask)
 {
        free_sched_groups(cpu_map, tmpmask);
 }
@@ -7748,17 +7771,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
  * Detach sched domains from a group of cpus specified in cpu_map
  * These cpus will now be attached to the NULL domain
  */
-static void detach_destroy_domains(const cpumask_t *cpu_map)
+static void detach_destroy_domains(const struct cpumask *cpu_map)
 {
-       cpumask_t tmpmask;
+       /* Save because hotplug lock held. */
+       static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
        int i;
 
-       unregister_sched_domain_sysctl();
-
-       for_each_cpu_mask_nr(i, *cpu_map)
+       for_each_cpu(i, cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
-       arch_destroy_sched_domains(cpu_map, &tmpmask);
+       arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
 }
 
 /* handle null as "default" */
@@ -7783,7 +7805,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * doms_new[] to the current sched domain partitioning, doms_cur[].
  * It destroys each deleted domain and builds each new domain.
  *
- * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+ * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
  * The masks don't intersect (don't overlap.) We should setup one
  * sched domain for each mask. CPUs not in any of the cpumasks will
  * not be load balanced. If the same cpumask appears both in the
@@ -7797,28 +7819,33 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * the single partition 'fallback_doms', it also forces the domains
  * to be rebuilt.
  *
- * If doms_new == NULL it will be replaced with cpu_online_map.
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
  * ndoms_new == 0 is a special case for destroying existing domains,
  * and it will not create the default domain.
  *
  * Call with hotplug lock held
  */
-void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+/* FIXME: Change to struct cpumask *doms_new[] */
+void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                             struct sched_domain_attr *dattr_new)
 {
        int i, j, n;
+       int new_topology;
 
        mutex_lock(&sched_domains_mutex);
 
        /* always unregister in case we don't destroy any domains */
        unregister_sched_domain_sysctl();
 
+       /* Let architecture update cpu core mappings. */
+       new_topology = arch_update_cpu_topology();
+
        n = doms_new ? ndoms_new : 0;
 
        /* Destroy deleted domains */
        for (i = 0; i < ndoms_cur; i++) {
-               for (j = 0; j < n; j++) {
-                       if (cpus_equal(doms_cur[i], doms_new[j])
+               for (j = 0; j < n && !new_topology; j++) {
+                       if (cpumask_equal(&doms_cur[i], &doms_new[j])
                            && dattrs_equal(dattr_cur, i, dattr_new, j))
                                goto match1;
                }
@@ -7830,15 +7857,15 @@ match1:
 
        if (doms_new == NULL) {
                ndoms_cur = 0;
-               doms_new = &fallback_doms;
-               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-               dattr_new = NULL;
+               doms_new = fallback_doms;
+               cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
+               WARN_ON_ONCE(dattr_new);
        }
 
        /* Build new domains */
        for (i = 0; i < ndoms_new; i++) {
-               for (j = 0; j < ndoms_cur; j++) {
-                       if (cpus_equal(doms_new[i], doms_cur[j])
+               for (j = 0; j < ndoms_cur && !new_topology; j++) {
+                       if (cpumask_equal(&doms_new[i], &doms_cur[j])
                            && dattrs_equal(dattr_new, i, dattr_cur, j))
                                goto match2;
                }
@@ -7850,7 +7877,7 @@ match2:
        }
 
        /* Remember the new sched domains */
-       if (doms_cur != &fallback_doms)
+       if (doms_cur != fallback_doms)
                kfree(doms_cur);
        kfree(dattr_cur);       /* kfree(NULL) is safe */
        doms_cur = doms_new;
@@ -7990,7 +8017,9 @@ static int update_runtime(struct notifier_block *nfb,
 
 void __init sched_init_smp(void)
 {
-       cpumask_t non_isolated_cpus;
+       cpumask_var_t non_isolated_cpus;
+
+       alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 
 #if defined(CONFIG_NUMA)
        sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -7999,10 +8028,10 @@ void __init sched_init_smp(void)
 #endif
        get_online_cpus();
        mutex_lock(&sched_domains_mutex);
-       arch_init_sched_domains(&cpu_online_map);
-       cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
-       if (cpus_empty(non_isolated_cpus))
-               cpu_set(smp_processor_id(), non_isolated_cpus);
+       arch_init_sched_domains(cpu_online_mask);
+       cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+       if (cpumask_empty(non_isolated_cpus))
+               cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
        mutex_unlock(&sched_domains_mutex);
        put_online_cpus();
 
@@ -8017,9 +8046,13 @@ void __init sched_init_smp(void)
        init_hrtick();
 
        /* Move init over to a non-isolated CPU */
-       if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
+       if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
                BUG();
        sched_init_granularity();
+       free_cpumask_var(non_isolated_cpus);
+
+       alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+       init_sched_rt_class();
 }
 #else
 void __init sched_init_smp(void)
@@ -8334,6 +8367,15 @@ void __init sched_init(void)
         */
        current->sched_class = &fair_sched_class;
 
+       /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
+       alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ
+       alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+#endif
+       alloc_bootmem_cpumask_var(&cpu_isolated_map);
+#endif /* SMP */
+
        scheduler_running = 1;
 }
 
@@ -8492,7 +8534,7 @@ static
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
        struct cfs_rq *cfs_rq;
-       struct sched_entity *se, *parent_se;
+       struct sched_entity *se;
        struct rq *rq;
        int i;
 
@@ -8508,18 +8550,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
 
-               cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+                                     GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
                        goto err;
 
-               se = kmalloc_node(sizeof(struct sched_entity),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               se = kzalloc_node(sizeof(struct sched_entity),
+                                 GFP_KERNEL, cpu_to_node(i));
                if (!se)
                        goto err;
 
-               parent_se = parent ? parent->se[i] : NULL;
-               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
        }
 
        return 1;
@@ -8580,7 +8621,7 @@ static
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
        struct rt_rq *rt_rq;
-       struct sched_rt_entity *rt_se, *parent_se;
+       struct sched_rt_entity *rt_se;
        struct rq *rq;
        int i;
 
@@ -8597,18 +8638,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
 
-               rt_rq = kmalloc_node(sizeof(struct rt_rq),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               rt_rq = kzalloc_node(sizeof(struct rt_rq),
+                                    GFP_KERNEL, cpu_to_node(i));
                if (!rt_rq)
                        goto err;
 
-               rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+                                    GFP_KERNEL, cpu_to_node(i));
                if (!rt_se)
                        goto err;
 
-               parent_se = parent ? parent->rt_se[i] : NULL;
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
        }
 
        return 1;
@@ -9251,11 +9291,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
        struct cgroup_subsys_state css;
        /* cpuusage holds pointer to a u64-type object on every cpu */
        u64 *cpuusage;
+       struct cpuacct *parent;
 };
 
 struct cgroup_subsys cpuacct_subsys;
@@ -9289,6 +9330,9 @@ static struct cgroup_subsys_state *cpuacct_create(
                return ERR_PTR(-ENOMEM);
        }
 
+       if (cgrp->parent)
+               ca->parent = cgroup_ca(cgrp->parent);
+
        return &ca->css;
 }
 
@@ -9368,14 +9412,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
        struct cpuacct *ca;
+       int cpu;
 
        if (!cpuacct_subsys.active)
                return;
 
+       cpu = task_cpu(tsk);
        ca = task_ca(tsk);
-       if (ca) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
 
+       for (; ca; ca = ca->parent) {
+               u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
 }
index 52154fefab7eef9c8302a7da96741bf59d13c6b6..018b7be1db2e57a219066def4b6efb48043605b9 100644 (file)
@@ -67,24 +67,21 @@ static int convert_prio(int prio)
  * Returns: (int)bool - CPUs were found
  */
 int cpupri_find(struct cpupri *cp, struct task_struct *p,
-               cpumask_t *lowest_mask)
+               struct cpumask *lowest_mask)
 {
        int                  idx      = 0;
        int                  task_pri = convert_prio(p->prio);
 
        for_each_cpupri_active(cp->pri_active, idx) {
                struct cpupri_vec *vec  = &cp->pri_to_cpu[idx];
-               cpumask_t mask;
 
                if (idx >= task_pri)
                        break;
 
-               cpus_and(mask, p->cpus_allowed, vec->mask);
-
-               if (cpus_empty(mask))
+               if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
                        continue;
 
-               *lowest_mask = mask;
+               cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
                return 1;
        }
 
@@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
                vec->count--;
                if (!vec->count)
                        clear_bit(oldpri, cp->pri_active);
-               cpu_clear(cpu, vec->mask);
+               cpumask_clear_cpu(cpu, vec->mask);
 
                spin_unlock_irqrestore(&vec->lock, flags);
        }
@@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 
                spin_lock_irqsave(&vec->lock, flags);
 
-               cpu_set(cpu, vec->mask);
+               cpumask_set_cpu(cpu, vec->mask);
                vec->count++;
                if (vec->count == 1)
                        set_bit(newpri, cp->pri_active);
@@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
 /**
  * cpupri_init - initialize the cpupri structure
  * @cp: The cpupri context
+ * @bootmem: true if allocations need to use bootmem
  *
- * Returns: (void)
+ * Returns: -ENOMEM if memory fails.
  */
-void cpupri_init(struct cpupri *cp)
+int cpupri_init(struct cpupri *cp, bool bootmem)
 {
        int i;
 
@@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp)
 
                spin_lock_init(&vec->lock);
                vec->count = 0;
-               cpus_clear(vec->mask);
+               if (bootmem)
+                       alloc_bootmem_cpumask_var(&vec->mask);
+               else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
+                       goto cleanup;
        }
 
        for_each_possible_cpu(i)
                cp->cpu_to_pri[i] = CPUPRI_INVALID;
+       return 0;
+
+cleanup:
+       for (i--; i >= 0; i--)
+               free_cpumask_var(cp->pri_to_cpu[i].mask);
+       return -ENOMEM;
 }
 
+/**
+ * cpupri_cleanup - clean up the cpupri structure
+ * @cp: The cpupri context
+ */
+void cpupri_cleanup(struct cpupri *cp)
+{
+       int i;
 
+       for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
+               free_cpumask_var(cp->pri_to_cpu[i].mask);
+}
index f25811b0f931fffb7276a10562cb1e366f12dfdb..642a94ef8a0a7fb39f9b341eb292dfad9ab38762 100644 (file)
@@ -14,7 +14,7 @@
 struct cpupri_vec {
        spinlock_t lock;
        int        count;
-       cpumask_ mask;
+       cpumask_var_t mask;
 };
 
 struct cpupri {
@@ -27,7 +27,8 @@ struct cpupri {
 int  cpupri_find(struct cpupri *cp,
                 struct task_struct *p, cpumask_t *lowest_mask);
 void cpupri_set(struct cpupri *cp, int cpu, int pri);
-void cpupri_init(struct cpupri *cp);
+int cpupri_init(struct cpupri *cp, bool bootmem);
+void cpupri_cleanup(struct cpupri *cp);
 #else
 #define cpupri_set(cp, cpu, pri) do { } while (0)
 #define cpupri_init() do { } while (0)
index 26ed8e3d1c1536a542b96b9262aa6dada54d9985..4293cfa9681d743f67a08298c5e3fccf7ae7b496 100644 (file)
@@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec)
 
 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void print_cfs_group_stats(struct seq_file *m, int cpu,
+               struct task_group *tg)
+{
+       struct sched_entity *se = tg->se[cpu];
+       if (!se)
+               return;
+
+#define P(F) \
+       SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
+#define PN(F) \
+       SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+
+       PN(se->exec_start);
+       PN(se->vruntime);
+       PN(se->sum_exec_runtime);
+#ifdef CONFIG_SCHEDSTATS
+       PN(se->wait_start);
+       PN(se->sleep_start);
+       PN(se->block_start);
+       PN(se->sleep_max);
+       PN(se->block_max);
+       PN(se->exec_max);
+       PN(se->slice_max);
+       PN(se->wait_max);
+       PN(se->wait_sum);
+       P(se->wait_count);
+#endif
+       P(se->load.weight);
+#undef PN
+#undef P
+}
+#endif
+
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
@@ -121,20 +155,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
        char path[128] = "";
-       struct cgroup *cgroup = NULL;
        struct task_group *tg = cfs_rq->tg;
 
-       if (tg)
-               cgroup = tg->css.cgroup;
-
-       if (cgroup)
-               cgroup_path(cgroup, path, sizeof(path));
+       cgroup_path(tg->css.cgroup, path, sizeof(path));
 
        SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
+#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
+       {
+               uid_t uid = cfs_rq->tg->uid;
+               SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
+       }
 #else
        SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
 #endif
-
        SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
                        SPLIT_NS(cfs_rq->exec_clock));
 
@@ -168,6 +201,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #ifdef CONFIG_SMP
        SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
 #endif
+       print_cfs_group_stats(m, cpu, cfs_rq->tg);
 #endif
 }
 
@@ -175,14 +209,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
        char path[128] = "";
-       struct cgroup *cgroup = NULL;
        struct task_group *tg = rt_rq->tg;
 
-       if (tg)
-               cgroup = tg->css.cgroup;
-
-       if (cgroup)
-               cgroup_path(cgroup, path, sizeof(path));
+       cgroup_path(tg->css.cgroup, path, sizeof(path));
 
        SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
 #else
@@ -272,7 +301,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
        u64 now = ktime_to_ns(ktime_get());
        int cpu;
 
-       SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
+       SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
                init_utsname()->release,
                (int)strcspn(init_utsname()->version, " "),
                init_utsname()->version);
index 98345e45b059ac665e13ce534a6fbb8a3460c5cf..08ffffd4a410cde090f50dac7a60a6e839666e5c 100644 (file)
@@ -1017,14 +1017,13 @@ static void yield_task_fair(struct rq *rq)
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
  * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (cpu_active_map)
+ * hence we need to mask them out (cpu_active_mask)
  *
  * Returns the CPU we should wake onto.
  */
 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
 static int wake_idle(int cpu, struct task_struct *p)
 {
-       cpumask_t tmp;
        struct sched_domain *sd;
        int i;
 
@@ -1044,10 +1043,9 @@ static int wake_idle(int cpu, struct task_struct *p)
                if ((sd->flags & SD_WAKE_IDLE)
                    || ((sd->flags & SD_WAKE_IDLE_FAR)
                        && !task_hot(p, task_rq(p)->clock, sd))) {
-                       cpus_and(tmp, sd->span, p->cpus_allowed);
-                       cpus_and(tmp, tmp, cpu_active_map);
-                       for_each_cpu_mask_nr(i, tmp) {
-                               if (idle_cpu(i)) {
+                       for_each_cpu_and(i, sched_domain_span(sd),
+                                        &p->cpus_allowed) {
+                               if (cpu_active(i) && idle_cpu(i)) {
                                        if (i != task_cpu(p)) {
                                                schedstat_inc(p,
                                                       se.nr_wakeups_idle);
@@ -1240,13 +1238,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
         * this_cpu and prev_cpu are present in:
         */
        for_each_domain(this_cpu, sd) {
-               if (cpu_isset(prev_cpu, sd->span)) {
+               if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
                        this_sd = sd;
                        break;
                }
        }
 
-       if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+       if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
                goto out;
 
        /*
index d9ba9d5f99d6af74b7c1db5ff5814c15900b68b0..1bbd99014011894766334eee96da13d6fbc4a8de 100644 (file)
@@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq)
        if (!rq->online)
                return;
 
-       cpu_set(rq->cpu, rq->rd->rto_mask);
+       cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
        /*
         * Make sure the mask is visible before we set
         * the overload count. That is checked to determine
@@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq)
 
        /* the order here really doesn't matter */
        atomic_dec(&rq->rd->rto_count);
-       cpu_clear(rq->cpu, rq->rd->rto_mask);
+       cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 }
 
 static void update_rt_migration(struct rq *rq)
@@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
 }
 
 #ifdef CONFIG_SMP
-static inline cpumask_t sched_rt_period_mask(void)
+static inline const struct cpumask *sched_rt_period_mask(void)
 {
        return cpu_rq(smp_processor_id())->rd->span;
 }
 #else
-static inline cpumask_t sched_rt_period_mask(void)
+static inline const struct cpumask *sched_rt_period_mask(void)
 {
-       return cpu_online_map;
+       return cpu_online_mask;
 }
 #endif
 
@@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
        return rt_rq->rt_throttled;
 }
 
-static inline cpumask_t sched_rt_period_mask(void)
+static inline const struct cpumask *sched_rt_period_mask(void)
 {
-       return cpu_online_map;
+       return cpu_online_mask;
 }
 
 static inline
@@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
        int i, weight, more = 0;
        u64 rt_period;
 
-       weight = cpus_weight(rd->span);
+       weight = cpumask_weight(rd->span);
 
        spin_lock(&rt_b->rt_runtime_lock);
        rt_period = ktime_to_ns(rt_b->rt_period);
-       for_each_cpu_mask_nr(i, rd->span) {
+       for_each_cpu(i, rd->span) {
                struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
                s64 diff;
 
@@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq)
                /*
                 * Greedy reclaim, take back as much as we can.
                 */
-               for_each_cpu_mask(i, rd->span) {
+               for_each_cpu(i, rd->span) {
                        struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
                        s64 diff;
 
@@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 {
        int i, idle = 1;
-       cpumask_t span;
+       const struct cpumask *span;
 
        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return 1;
 
        span = sched_rt_period_mask();
-       for_each_cpu_mask(i, span) {
+       for_each_cpu(i, span) {
                int enqueue = 0;
                struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
                struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq)
        for_each_sched_rt_entity(rt_se) {
                rt_rq = rt_rq_of_se(rt_se);
 
-               spin_lock(&rt_rq->rt_runtime_lock);
                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+                       spin_lock(&rt_rq->rt_runtime_lock);
                        rt_rq->rt_time += delta_exec;
                        if (sched_rt_runtime_exceeded(rt_rq))
                                resched_task(curr);
+                       spin_unlock(&rt_rq->rt_runtime_lock);
                }
-               spin_unlock(&rt_rq->rt_runtime_lock);
        }
 }
 
@@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
-       cpumask_t mask;
+       cpumask_var_t mask;
 
        if (rq->curr->rt.nr_cpus_allowed == 1)
                return;
 
-       if (p->rt.nr_cpus_allowed != 1
-           && cpupri_find(&rq->rd->cpupri, p, &mask))
+       if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
                return;
 
-       if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
-               return;
+       if (p->rt.nr_cpus_allowed != 1
+           && cpupri_find(&rq->rd->cpupri, p, mask))
+               goto free;
+
+       if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
+               goto free;
 
        /*
         * There appears to be other cpus that can accept
@@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
         */
        requeue_task_rt(rq, p, 1);
        resched_task(rq->curr);
+free:
+       free_cpumask_var(mask);
 }
 
 #endif /* CONFIG_SMP */
@@ -909,15 +914,12 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 /* Only try algorithms three times */
 #define RT_MAX_TRIES 3
 
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
-
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
        if (!task_running(rq, p) &&
-           (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
+           (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
            (p->rt.nr_cpus_allowed > 1))
                return 1;
        return 0;
@@ -956,7 +958,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
        return next;
 }
 
-static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 {
@@ -976,7 +978,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 static int find_lowest_rq(struct task_struct *task)
 {
        struct sched_domain *sd;
-       cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
+       struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu      = task_cpu(task);
 
@@ -991,7 +993,7 @@ static int find_lowest_rq(struct task_struct *task)
         * I guess we might want to change cpupri_find() to ignore those
         * in the first place.
         */
-       cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+       cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
 
        /*
         * At this point we have built a mask of cpus representing the
@@ -1001,7 +1003,7 @@ static int find_lowest_rq(struct task_struct *task)
         * We prioritize the last cpu that the task executed on since
         * it is most likely cache-hot in that location.
         */
-       if (cpu_isset(cpu, *lowest_mask))
+       if (cpumask_test_cpu(cpu, lowest_mask))
                return cpu;
 
        /*
@@ -1016,7 +1018,8 @@ static int find_lowest_rq(struct task_struct *task)
                        cpumask_t domain_mask;
                        int       best_cpu;
 
-                       cpus_and(domain_mask, sd->span, *lowest_mask);
+                       cpumask_and(&domain_mask, sched_domain_span(sd),
+                                   lowest_mask);
 
                        best_cpu = pick_optimal_cpu(this_cpu,
                                                    &domain_mask);
@@ -1057,8 +1060,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
                         * Also make sure that it wasn't scheduled on its rq.
                         */
                        if (unlikely(task_rq(task) != rq ||
-                                    !cpu_isset(lowest_rq->cpu,
-                                               task->cpus_allowed) ||
+                                    !cpumask_test_cpu(lowest_rq->cpu,
+                                                      &task->cpus_allowed) ||
                                     task_running(rq, task) ||
                                     !task->se.on_rq)) {
 
@@ -1179,7 +1182,7 @@ static int pull_rt_task(struct rq *this_rq)
 
        next = pick_next_task_rt(this_rq);
 
-       for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
+       for_each_cpu(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
 
@@ -1308,9 +1311,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 
 static void set_cpus_allowed_rt(struct task_struct *p,
-                               const cpumask_t *new_mask)
+                               const struct cpumask *new_mask)
 {
-       int weight = cpus_weight(*new_mask);
+       int weight = cpumask_weight(new_mask);
 
        BUG_ON(!rt_task(p));
 
@@ -1331,7 +1334,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
                update_rt_migration(rq);
        }
 
-       p->cpus_allowed    = *new_mask;
+       cpumask_copy(&p->cpus_allowed, new_mask);
        p->rt.nr_cpus_allowed = weight;
 }
 
@@ -1374,6 +1377,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
        if (!rq->rt.rt_nr_running)
                pull_rt_task(rq);
 }
+
+static inline void init_sched_rt_class(void)
+{
+       unsigned int i;
+
+       for_each_possible_cpu(i)
+               alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
+}
 #endif /* CONFIG_SMP */
 
 /*
@@ -1544,3 +1555,4 @@ static void print_rt_stats(struct seq_file *m, int cpu)
        rcu_read_unlock();
 }
 #endif /* CONFIG_SCHED_DEBUG */
+
index 7dbf72a2b02cf16f256af7655eb9b28c7f3ef856..5fcf0e18458670e8e56a259cefb096ac38a7c95c 100644 (file)
@@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
                for_each_domain(cpu, sd) {
                        enum cpu_idle_type itype;
 
-                       cpumask_scnprintf(mask_str, mask_len, sd->span);
+                       cpumask_scnprintf(mask_str, mask_len,
+                                         sched_domain_span(sd));
                        seq_printf(seq, "domain%d %s", dcount++, mask_str);
                        for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
                                        itype++) {
index 4530fc65445518272ae851fa44e90378cfd908e1..e9afe63da24b524cbd036692945a8dfe634809d2 100644 (file)
@@ -41,6 +41,8 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
+DEFINE_TRACE(sched_signal_send);
+
 static void __user *sig_handler(struct task_struct *t, int sig)
 {
        return t->sighand->action[sig - 1].sa.sa_handler;
index dc0b3be6b7d52cb98bd84d599bfbb7e7d85b1e85..1ab790c67b174592401712b093bdc56a8aa3d537 100644 (file)
@@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
 /*
  * Zero means infinite timeout - no checking done:
  */
-unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
 
 unsigned long __read_mostly sysctl_hung_task_warnings = 10;
 
index 31deba8f7d160c19bf262b1be13272e2ede1050d..5fc3a0cfb9946e01efc3907f06523964f39d59c1 100644 (file)
@@ -858,8 +858,8 @@ void do_sys_times(struct tms *tms)
        struct task_cputime cputime;
        cputime_t cutime, cstime;
 
-       spin_lock_irq(&current->sighand->siglock);
        thread_group_cputime(current, &cputime);
+       spin_lock_irq(&current->sighand->siglock);
        cutime = current->signal->cutime;
        cstime = current->signal->cstime;
        spin_unlock_irq(&current->sighand->siglock);
index 3d56fe7570daede300fe01f2e6571416107b081b..c83f566e940abaac4cdef8b7744746104aaabd38 100644 (file)
@@ -487,6 +487,16 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = &ftrace_enable_sysctl,
        },
 #endif
+#ifdef CONFIG_TRACING
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "ftrace_dump_on_oops",
+               .data           = &ftrace_dump_on_oops,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+#endif
 #ifdef CONFIG_MODULES
        {
                .ctl_name       = KERN_MODPROBE,
index bd6be76303cf23b4c52b757a93bf1966bdbb4f0a..6d7dc4ec4aa5735dbfd02ae68593c98ae5bf315d 100644 (file)
@@ -352,7 +352,7 @@ static int parse(struct nlattr *na, cpumask_t *mask)
        if (!data)
                return -ENOMEM;
        nla_strlcpy(data, na, len);
-       ret = cpulist_parse(data, *mask);
+       ret = cpulist_parse(data, mask);
        kfree(data);
        return ret;
 }
index f8d968063cea38e348f63590ce73d409385cc5e9..ea2f48af83cff4dde64764da811744f74cddb745 100644 (file)
@@ -166,6 +166,8 @@ static void clockevents_notify_released(void)
 void clockevents_register_device(struct clock_event_device *dev)
 {
        BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+       BUG_ON(!dev->cpumask);
+
        /*
         * A nsec2cyc multiplicator of 0 is invalid and we'd crash
         * on it, so fix it up and emit a warning:
index f98a1b7b16e942018ffe9e5998756405695da9a4..9590af2327be382cc030ef79acdf6d1d26bd9599 100644 (file)
@@ -150,7 +150,7 @@ static void tick_do_broadcast(cpumask_t mask)
                 */
                cpu = first_cpu(mask);
                td = &per_cpu(tick_cpu_device, cpu);
-               td->evtdev->broadcast(mask);
+               td->evtdev->broadcast(&mask);
        }
 }
 
index df12434b43ca09ec5ea9b0c48d679b842feddc9e..f8372be74122aa3f10b3dfc6642ffde2dc297920 100644 (file)
@@ -136,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
  */
 static void tick_setup_device(struct tick_device *td,
                              struct clock_event_device *newdev, int cpu,
-                             const cpumask_t *cpumask)
+                             const struct cpumask *cpumask)
 {
        ktime_t next_event;
        void (*handler)(struct clock_event_device *) = NULL;
@@ -171,8 +171,8 @@ static void tick_setup_device(struct tick_device *td,
         * When the device is not per cpu, pin the interrupt to the
         * current cpu:
         */
-       if (!cpus_equal(newdev->cpumask, *cpumask))
-               irq_set_affinity(newdev->irq, *cpumask);
+       if (!cpumask_equal(newdev->cpumask, cpumask))
+               irq_set_affinity(newdev->irq, cpumask);
 
        /*
         * When global broadcasting is active, check if the current
@@ -202,14 +202,14 @@ static int tick_check_new_device(struct clock_event_device *newdev)
        spin_lock_irqsave(&tick_device_lock, flags);
 
        cpu = smp_processor_id();
-       if (!cpu_isset(cpu, newdev->cpumask))
+       if (!cpumask_test_cpu(cpu, newdev->cpumask))
                goto out_bc;
 
        td = &per_cpu(tick_cpu_device, cpu);
        curdev = td->evtdev;
 
        /* cpu local device ? */
-       if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) {
+       if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {
 
                /*
                 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
                 * If we have a cpu local device already, do not replace it
                 * by a non cpu local device
                 */
-               if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu)))
+               if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
                        goto out_bc;
        }
 
index 342fc9ccab46d5132aae1c58b1fbce0d02a2feaf..70f872c71f4e47dbaef945cc99990fc9ce01d1db 100644 (file)
@@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void)
        if (!ts->tick_stopped)
                return;
 
-       cpu_clear(cpu, nohz_cpu_mask);
+       cpumask_clear_cpu(cpu, nohz_cpu_mask);
        now = ktime_get();
        ts->idle_waketime = now;
 
@@ -283,7 +283,7 @@ void tick_nohz_stop_sched_tick(int inidle)
        if ((long)delta_jiffies >= 1) {
 
                if (delta_jiffies > 1)
-                       cpu_set(cpu, nohz_cpu_mask);
+                       cpumask_set_cpu(cpu, nohz_cpu_mask);
                /*
                 * nohz_stop_sched_tick can be called several times before
                 * the nohz_restart_sched_tick is called. This happens when
@@ -296,7 +296,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                                /*
                                 * sched tick not stopped!
                                 */
-                               cpu_clear(cpu, nohz_cpu_mask);
+                               cpumask_clear_cpu(cpu, nohz_cpu_mask);
                                goto out;
                        }
 
@@ -354,7 +354,7 @@ void tick_nohz_stop_sched_tick(int inidle)
                 * softirq.
                 */
                tick_do_update_jiffies64(ktime_get());
-               cpu_clear(cpu, nohz_cpu_mask);
+               cpumask_clear_cpu(cpu, nohz_cpu_mask);
        }
        raise_softirq_irqoff(TIMER_SOFTIRQ);
 out:
@@ -432,7 +432,7 @@ void tick_nohz_restart_sched_tick(void)
        select_nohz_load_balancer(0);
        now = ktime_get();
        tick_do_update_jiffies64(now);
-       cpu_clear(cpu, nohz_cpu_mask);
+       cpumask_clear_cpu(cpu, nohz_cpu_mask);
 
        /*
         * We stopped the tick in idle. Update process times would miss the
index 33dbefd471e88f9571f299f92b433188dd6697de..bde6f03512d50ea9322226f2c99c80fac77a22ad 100644 (file)
@@ -3,18 +3,34 @@
 #  select HAVE_FUNCTION_TRACER:
 #
 
+config USER_STACKTRACE_SUPPORT
+       bool
+
 config NOP_TRACER
        bool
 
 config HAVE_FUNCTION_TRACER
        bool
 
+config HAVE_FUNCTION_GRAPH_TRACER
+       bool
+
+config HAVE_FUNCTION_TRACE_MCOUNT_TEST
+       bool
+       help
+        This gets selected when the arch tests the function_trace_stop
+        variable at the mcount call site. Otherwise, this variable
+        is tested by the called function.
+
 config HAVE_DYNAMIC_FTRACE
        bool
 
 config HAVE_FTRACE_MCOUNT_RECORD
        bool
 
+config HAVE_HW_BRANCH_TRACER
+       bool
+
 config TRACER_MAX_TRACE
        bool
 
@@ -47,6 +63,20 @@ config FUNCTION_TRACER
          (the bootup default), then the overhead of the instructions is very
          small and not measurable even in micro-benchmarks.
 
+config FUNCTION_GRAPH_TRACER
+       bool "Kernel Function Graph Tracer"
+       depends on HAVE_FUNCTION_GRAPH_TRACER
+       depends on FUNCTION_TRACER
+       default y
+       help
+         Enable the kernel to trace a function at both its return
+         and its entry.
+         It's first purpose is to trace the duration of functions and
+         draw a call graph for each thread with some informations like
+         the return value.
+         This is done by setting the current return address on the current
+         task structure into a stack of calls.
+
 config IRQSOFF_TRACER
        bool "Interrupts-off Latency Tracer"
        default n
@@ -138,6 +168,70 @@ config BOOT_TRACER
            selected, because the self-tests are an initcall as well and that
            would invalidate the boot trace. )
 
+config TRACE_BRANCH_PROFILING
+       bool "Trace likely/unlikely profiler"
+       depends on DEBUG_KERNEL
+       select TRACING
+       help
+         This tracer profiles all the the likely and unlikely macros
+         in the kernel. It will display the results in:
+
+         /debugfs/tracing/profile_annotated_branch
+
+         Note: this will add a significant overhead, only turn this
+         on if you need to profile the system's use of these macros.
+
+         Say N if unsure.
+
+config PROFILE_ALL_BRANCHES
+       bool "Profile all if conditionals"
+       depends on TRACE_BRANCH_PROFILING
+       help
+         This tracer profiles all branch conditions. Every if ()
+         taken in the kernel is recorded whether it hit or miss.
+         The results will be displayed in:
+
+         /debugfs/tracing/profile_branch
+
+         This configuration, when enabled, will impose a great overhead
+         on the system. This should only be enabled when the system
+         is to be analyzed
+
+         Say N if unsure.
+
+config TRACING_BRANCHES
+       bool
+       help
+         Selected by tracers that will trace the likely and unlikely
+         conditions. This prevents the tracers themselves from being
+         profiled. Profiling the tracing infrastructure can only happen
+         when the likelys and unlikelys are not being traced.
+
+config BRANCH_TRACER
+       bool "Trace likely/unlikely instances"
+       depends on TRACE_BRANCH_PROFILING
+       select TRACING_BRANCHES
+       help
+         This traces the events of likely and unlikely condition
+         calls in the kernel.  The difference between this and the
+         "Trace likely/unlikely profiler" is that this is not a
+         histogram of the callers, but actually places the calling
+         events into a running trace buffer to see when and where the
+         events happened, as well as their results.
+
+         Say N if unsure.
+
+config POWER_TRACER
+       bool "Trace power consumption behavior"
+       depends on DEBUG_KERNEL
+       depends on X86
+       select TRACING
+       help
+         This tracer helps developers to analyze and optimize the kernels
+         power management decisions, specifically the C-state and P-state
+         behavior.
+
+
 config STACK_TRACER
        bool "Trace max stack"
        depends on HAVE_FUNCTION_TRACER
@@ -157,6 +251,14 @@ config STACK_TRACER
 
          Say N if unsure.
 
+config BTS_TRACER
+       depends on HAVE_HW_BRANCH_TRACER
+       bool "Trace branches"
+       select TRACING
+       help
+         This tracer records all branches on the system in a circular
+         buffer giving access to the last N branches for each cpu.
+
 config DYNAMIC_FTRACE
        bool "enable/disable ftrace tracepoints dynamically"
        depends on FUNCTION_TRACER
index c8228b1a49e924386d3b8af6c02186938edf3571..62dc561b6676ab57ced3b65f5804c0e50f31730a 100644 (file)
@@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
 obj-y += trace_selftest_dynamic.o
 endif
 
+# If unlikely tracing is enabled, do not trace these files
+ifdef CONFIG_TRACING_BRANCHES
+KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
+endif
+
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
@@ -24,5 +29,9 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
+obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
+obj-$(CONFIG_BTS_TRACER) += trace_bts.o
+obj-$(CONFIG_POWER_TRACER) += trace_power.o
 
 libftrace-y := ftrace.o
index 78db083390f07baa7993c48d9fc0f7a635c769d0..a12f80efceaa875c5ef3e1cc34890b40117dcba4 100644 (file)
 int ftrace_enabled __read_mostly;
 static int last_ftrace_enabled;
 
+/* set when tracing only a pid */
+struct pid *ftrace_pid_trace;
+static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+
+/* Quick disabling of function tracer. */
+int function_trace_stop;
+
 /*
  * ftrace_disabled is set when an anomaly is discovered.
  * ftrace_disabled is much stronger than ftrace_enabled.
@@ -55,6 +62,7 @@ static int ftrace_disabled __read_mostly;
 
 static DEFINE_SPINLOCK(ftrace_lock);
 static DEFINE_MUTEX(ftrace_sysctl_lock);
+static DEFINE_MUTEX(ftrace_start_lock);
 
 static struct ftrace_ops ftrace_list_end __read_mostly =
 {
@@ -63,6 +71,8 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
 
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
 ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
+ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
 
 static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 {
@@ -79,6 +89,21 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
        };
 }
 
+static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
+{
+       if (!test_tsk_trace_trace(current))
+               return;
+
+       ftrace_pid_function(ip, parent_ip);
+}
+
+static void set_ftrace_pid_function(ftrace_func_t func)
+{
+       /* do not set ftrace_pid_function to itself! */
+       if (func != ftrace_pid_func)
+               ftrace_pid_function = func;
+}
+
 /**
  * clear_ftrace_function - reset the ftrace function
  *
@@ -88,7 +113,23 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
 void clear_ftrace_function(void)
 {
        ftrace_trace_function = ftrace_stub;
+       __ftrace_trace_function = ftrace_stub;
+       ftrace_pid_function = ftrace_stub;
+}
+
+#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+/*
+ * For those archs that do not test ftrace_trace_stop in their
+ * mcount call site, we need to do it from C.
+ */
+static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
+{
+       if (function_trace_stop)
+               return;
+
+       __ftrace_trace_function(ip, parent_ip);
 }
+#endif
 
 static int __register_ftrace_function(struct ftrace_ops *ops)
 {
@@ -106,14 +147,28 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
        ftrace_list = ops;
 
        if (ftrace_enabled) {
+               ftrace_func_t func;
+
+               if (ops->next == &ftrace_list_end)
+                       func = ops->func;
+               else
+                       func = ftrace_list_func;
+
+               if (ftrace_pid_trace) {
+                       set_ftrace_pid_function(func);
+                       func = ftrace_pid_func;
+               }
+
                /*
                 * For one func, simply call it directly.
                 * For more than one func, call the chain.
                 */
-               if (ops->next == &ftrace_list_end)
-                       ftrace_trace_function = ops->func;
-               else
-                       ftrace_trace_function = ftrace_list_func;
+#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+               ftrace_trace_function = func;
+#else
+               __ftrace_trace_function = func;
+               ftrace_trace_function = ftrace_test_stop_func;
+#endif
        }
 
        spin_unlock(&ftrace_lock);
@@ -152,9 +207,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 
        if (ftrace_enabled) {
                /* If we only have one func left, then call that directly */
-               if (ftrace_list == &ftrace_list_end ||
-                   ftrace_list->next == &ftrace_list_end)
-                       ftrace_trace_function = ftrace_list->func;
+               if (ftrace_list->next == &ftrace_list_end) {
+                       ftrace_func_t func = ftrace_list->func;
+
+                       if (ftrace_pid_trace) {
+                               set_ftrace_pid_function(func);
+                               func = ftrace_pid_func;
+                       }
+#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+                       ftrace_trace_function = func;
+#else
+                       __ftrace_trace_function = func;
+#endif
+               }
        }
 
  out:
@@ -163,6 +228,36 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
        return ret;
 }
 
+static void ftrace_update_pid_func(void)
+{
+       ftrace_func_t func;
+
+       /* should not be called from interrupt context */
+       spin_lock(&ftrace_lock);
+
+       if (ftrace_trace_function == ftrace_stub)
+               goto out;
+
+       func = ftrace_trace_function;
+
+       if (ftrace_pid_trace) {
+               set_ftrace_pid_function(func);
+               func = ftrace_pid_func;
+       } else {
+               if (func == ftrace_pid_func)
+                       func = ftrace_pid_function;
+       }
+
+#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+       ftrace_trace_function = func;
+#else
+       __ftrace_trace_function = func;
+#endif
+
+ out:
+       spin_unlock(&ftrace_lock);
+}
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 #ifndef CONFIG_FTRACE_MCOUNT_RECORD
 # error Dynamic ftrace depends on MCOUNT_RECORD
@@ -182,6 +277,8 @@ enum {
        FTRACE_UPDATE_TRACE_FUNC        = (1 << 2),
        FTRACE_ENABLE_MCOUNT            = (1 << 3),
        FTRACE_DISABLE_MCOUNT           = (1 << 4),
+       FTRACE_START_FUNC_RET           = (1 << 5),
+       FTRACE_STOP_FUNC_RET            = (1 << 6),
 };
 
 static int ftrace_filtered;
@@ -308,7 +405,7 @@ ftrace_record_ip(unsigned long ip)
 {
        struct dyn_ftrace *rec;
 
-       if (!ftrace_enabled || ftrace_disabled)
+       if (ftrace_disabled)
                return NULL;
 
        rec = ftrace_alloc_dyn_node(ip);
@@ -322,14 +419,51 @@ ftrace_record_ip(unsigned long ip)
        return rec;
 }
 
-#define FTRACE_ADDR ((long)(ftrace_caller))
+static void print_ip_ins(const char *fmt, unsigned char *p)
+{
+       int i;
+
+       printk(KERN_CONT "%s", fmt);
+
+       for (i = 0; i < MCOUNT_INSN_SIZE; i++)
+               printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
+}
+
+static void ftrace_bug(int failed, unsigned long ip)
+{
+       switch (failed) {
+       case -EFAULT:
+               FTRACE_WARN_ON_ONCE(1);
+               pr_info("ftrace faulted on modifying ");
+               print_ip_sym(ip);
+               break;
+       case -EINVAL:
+               FTRACE_WARN_ON_ONCE(1);
+               pr_info("ftrace failed to modify ");
+               print_ip_sym(ip);
+               print_ip_ins(" actual: ", (unsigned char *)ip);
+               printk(KERN_CONT "\n");
+               break;
+       case -EPERM:
+               FTRACE_WARN_ON_ONCE(1);
+               pr_info("ftrace faulted on writing ");
+               print_ip_sym(ip);
+               break;
+       default:
+               FTRACE_WARN_ON_ONCE(1);
+               pr_info("ftrace faulted on unknown error ");
+               print_ip_sym(ip);
+       }
+}
+
 
 static int
-__ftrace_replace_code(struct dyn_ftrace *rec,
-                     unsigned char *nop, int enable)
+__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 {
        unsigned long ip, fl;
-       unsigned char *call, *old, *new;
+       unsigned long ftrace_addr;
+
+       ftrace_addr = (unsigned long)ftrace_caller;
 
        ip = rec->ip;
 
@@ -388,34 +522,28 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
                }
        }
 
-       call = ftrace_call_replace(ip, FTRACE_ADDR);
-
-       if (rec->flags & FTRACE_FL_ENABLED) {
-               old = nop;
-               new = call;
-       } else {
-               old = call;
-               new = nop;
-       }
-
-       return ftrace_modify_code(ip, old, new);
+       if (rec->flags & FTRACE_FL_ENABLED)
+               return ftrace_make_call(rec, ftrace_addr);
+       else
+               return ftrace_make_nop(NULL, rec, ftrace_addr);
 }
 
 static void ftrace_replace_code(int enable)
 {
        int i, failed;
-       unsigned char *nop = NULL;
        struct dyn_ftrace *rec;
        struct ftrace_page *pg;
 
-       nop = ftrace_nop_replace();
-
        for (pg = ftrace_pages_start; pg; pg = pg->next) {
                for (i = 0; i < pg->index; i++) {
                        rec = &pg->records[i];
 
-                       /* don't modify code that has already faulted */
-                       if (rec->flags & FTRACE_FL_FAILED)
+                       /*
+                        * Skip over free records and records that have
+                        * failed.
+                        */
+                       if (rec->flags & FTRACE_FL_FREE ||
+                           rec->flags & FTRACE_FL_FAILED)
                                continue;
 
                        /* ignore updates to this record's mcount site */
@@ -426,68 +554,30 @@ static void ftrace_replace_code(int enable)
                                unfreeze_record(rec);
                        }
 
-                       failed = __ftrace_replace_code(rec, nop, enable);
+                       failed = __ftrace_replace_code(rec, enable);
                        if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
                                rec->flags |= FTRACE_FL_FAILED;
                                if ((system_state == SYSTEM_BOOTING) ||
                                    !core_kernel_text(rec->ip)) {
                                        ftrace_free_rec(rec);
-                               }
+                               } else
+                                       ftrace_bug(failed, rec->ip);
                        }
                }
        }
 }
 
-static void print_ip_ins(const char *fmt, unsigned char *p)
-{
-       int i;
-
-       printk(KERN_CONT "%s", fmt);
-
-       for (i = 0; i < MCOUNT_INSN_SIZE; i++)
-               printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
-}
-
 static int
-ftrace_code_disable(struct dyn_ftrace *rec)
+ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
 {
        unsigned long ip;
-       unsigned char *nop, *call;
        int ret;
 
        ip = rec->ip;
 
-       nop = ftrace_nop_replace();
-       call = ftrace_call_replace(ip, mcount_addr);
-
-       ret = ftrace_modify_code(ip, call, nop);
+       ret = ftrace_make_nop(mod, rec, mcount_addr);
        if (ret) {
-               switch (ret) {
-               case -EFAULT:
-                       FTRACE_WARN_ON_ONCE(1);
-                       pr_info("ftrace faulted on modifying ");
-                       print_ip_sym(ip);
-                       break;
-               case -EINVAL:
-                       FTRACE_WARN_ON_ONCE(1);
-                       pr_info("ftrace failed to modify ");
-                       print_ip_sym(ip);
-                       print_ip_ins(" expected: ", call);
-                       print_ip_ins(" actual: ", (unsigned char *)ip);
-                       print_ip_ins(" replace: ", nop);
-                       printk(KERN_CONT "\n");
-                       break;
-               case -EPERM:
-                       FTRACE_WARN_ON_ONCE(1);
-                       pr_info("ftrace faulted on writing ");
-                       print_ip_sym(ip);
-                       break;
-               default:
-                       FTRACE_WARN_ON_ONCE(1);
-                       pr_info("ftrace faulted on unknown error ");
-                       print_ip_sym(ip);
-               }
-
+               ftrace_bug(ret, ip);
                rec->flags |= FTRACE_FL_FAILED;
                return 0;
        }
@@ -506,6 +596,11 @@ static int __ftrace_modify_code(void *data)
        if (*command & FTRACE_UPDATE_TRACE_FUNC)
                ftrace_update_ftrace_func(ftrace_trace_function);
 
+       if (*command & FTRACE_START_FUNC_RET)
+               ftrace_enable_ftrace_graph_caller();
+       else if (*command & FTRACE_STOP_FUNC_RET)
+               ftrace_disable_ftrace_graph_caller();
+
        return 0;
 }
 
@@ -515,43 +610,43 @@ static void ftrace_run_update_code(int command)
 }
 
 static ftrace_func_t saved_ftrace_func;
-static int ftrace_start;
-static DEFINE_MUTEX(ftrace_start_lock);
+static int ftrace_start_up;
 
-static void ftrace_startup(void)
+static void ftrace_startup_enable(int command)
 {
-       int command = 0;
-
-       if (unlikely(ftrace_disabled))
-               return;
-
-       mutex_lock(&ftrace_start_lock);
-       ftrace_start++;
-       command |= FTRACE_ENABLE_CALLS;
-
        if (saved_ftrace_func != ftrace_trace_function) {
                saved_ftrace_func = ftrace_trace_function;
                command |= FTRACE_UPDATE_TRACE_FUNC;
        }
 
        if (!command || !ftrace_enabled)
-               goto out;
+               return;
 
        ftrace_run_update_code(command);
- out:
-       mutex_unlock(&ftrace_start_lock);
 }
 
-static void ftrace_shutdown(void)
+static void ftrace_startup(int command)
 {
-       int command = 0;
+       if (unlikely(ftrace_disabled))
+               return;
 
+       mutex_lock(&ftrace_start_lock);
+       ftrace_start_up++;
+       command |= FTRACE_ENABLE_CALLS;
+
+       ftrace_startup_enable(command);
+
+       mutex_unlock(&ftrace_start_lock);
+}
+
+static void ftrace_shutdown(int command)
+{
        if (unlikely(ftrace_disabled))
                return;
 
        mutex_lock(&ftrace_start_lock);
-       ftrace_start--;
-       if (!ftrace_start)
+       ftrace_start_up--;
+       if (!ftrace_start_up)
                command |= FTRACE_DISABLE_CALLS;
 
        if (saved_ftrace_func != ftrace_trace_function) {
@@ -577,8 +672,8 @@ static void ftrace_startup_sysctl(void)
        mutex_lock(&ftrace_start_lock);
        /* Force update next time */
        saved_ftrace_func = NULL;
-       /* ftrace_start is true if we want ftrace running */
-       if (ftrace_start)
+       /* ftrace_start_up is true if we want ftrace running */
+       if (ftrace_start_up)
                command |= FTRACE_ENABLE_CALLS;
 
        ftrace_run_update_code(command);
@@ -593,8 +688,8 @@ static void ftrace_shutdown_sysctl(void)
                return;
 
        mutex_lock(&ftrace_start_lock);
-       /* ftrace_start is true if ftrace is running */
-       if (ftrace_start)
+       /* ftrace_start_up is true if ftrace is running */
+       if (ftrace_start_up)
                command |= FTRACE_DISABLE_CALLS;
 
        ftrace_run_update_code(command);
@@ -605,7 +700,7 @@ static cycle_t              ftrace_update_time;
 static unsigned long   ftrace_update_cnt;
 unsigned long          ftrace_update_tot_cnt;
 
-static int ftrace_update_code(void)
+static int ftrace_update_code(struct module *mod)
 {
        struct dyn_ftrace *p, *t;
        cycle_t start, stop;
@@ -622,7 +717,7 @@ static int ftrace_update_code(void)
                list_del_init(&p->list);
 
                /* convert record (i.e, patch mcount-call with NOP) */
-               if (ftrace_code_disable(p)) {
+               if (ftrace_code_disable(mod, p)) {
                        p->flags |= FTRACE_FL_CONVERTED;
                        ftrace_update_cnt++;
                } else
@@ -690,7 +785,6 @@ enum {
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
 
 struct ftrace_iterator {
-       loff_t                  pos;
        struct ftrace_page      *pg;
        unsigned                idx;
        unsigned                flags;
@@ -715,6 +809,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
                        iter->pg = iter->pg->next;
                        iter->idx = 0;
                        goto retry;
+               } else {
+                       iter->idx = -1;
                }
        } else {
                rec = &iter->pg->records[iter->idx++];
@@ -737,8 +833,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
        }
        spin_unlock(&ftrace_lock);
 
-       iter->pos = *pos;
-
        return rec;
 }
 
@@ -746,13 +840,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 {
        struct ftrace_iterator *iter = m->private;
        void *p = NULL;
-       loff_t l = -1;
 
-       if (*pos > iter->pos)
-               *pos = iter->pos;
+       if (*pos > 0) {
+               if (iter->idx < 0)
+                       return p;
+               (*pos)--;
+               iter->idx--;
+       }
 
-       l = *pos;
-       p = t_next(m, p, &l);
+       p = t_next(m, p, pos);
 
        return p;
 }
@@ -763,21 +859,15 @@ static void t_stop(struct seq_file *m, void *p)
 
 static int t_show(struct seq_file *m, void *v)
 {
-       struct ftrace_iterator *iter = m->private;
        struct dyn_ftrace *rec = v;
        char str[KSYM_SYMBOL_LEN];
-       int ret = 0;
 
        if (!rec)
                return 0;
 
        kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
 
-       ret = seq_printf(m, "%s\n", str);
-       if (ret < 0) {
-               iter->pos--;
-               iter->idx--;
-       }
+       seq_printf(m, "%s\n", str);
 
        return 0;
 }
@@ -803,7 +893,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
                return -ENOMEM;
 
        iter->pg = ftrace_pages_start;
-       iter->pos = 0;
 
        ret = seq_open(file, &show_ftrace_seq_ops);
        if (!ret) {
@@ -890,7 +979,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
 
        if (file->f_mode & FMODE_READ) {
                iter->pg = ftrace_pages_start;
-               iter->pos = 0;
                iter->flags = enable ? FTRACE_ITER_FILTER :
                        FTRACE_ITER_NOTRACE;
 
@@ -1181,7 +1269,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 
        mutex_lock(&ftrace_sysctl_lock);
        mutex_lock(&ftrace_start_lock);
-       if (ftrace_start && ftrace_enabled)
+       if (ftrace_start_up && ftrace_enabled)
                ftrace_run_update_code(FTRACE_ENABLE_CALLS);
        mutex_unlock(&ftrace_start_lock);
        mutex_unlock(&ftrace_sysctl_lock);
@@ -1233,12 +1321,233 @@ static struct file_operations ftrace_notrace_fops = {
        .release = ftrace_notrace_release,
 };
 
-static __init int ftrace_init_debugfs(void)
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+static DEFINE_MUTEX(graph_lock);
+
+int ftrace_graph_count;
+unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
+
+static void *
+g_next(struct seq_file *m, void *v, loff_t *pos)
 {
-       struct dentry *d_tracer;
-       struct dentry *entry;
+       unsigned long *array = m->private;
+       int index = *pos;
 
-       d_tracer = tracing_init_dentry();
+       (*pos)++;
+
+       if (index >= ftrace_graph_count)
+               return NULL;
+
+       return &array[index];
+}
+
+static void *g_start(struct seq_file *m, loff_t *pos)
+{
+       void *p = NULL;
+
+       mutex_lock(&graph_lock);
+
+       p = g_next(m, p, pos);
+
+       return p;
+}
+
+static void g_stop(struct seq_file *m, void *p)
+{
+       mutex_unlock(&graph_lock);
+}
+
+static int g_show(struct seq_file *m, void *v)
+{
+       unsigned long *ptr = v;
+       char str[KSYM_SYMBOL_LEN];
+
+       if (!ptr)
+               return 0;
+
+       kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
+
+       seq_printf(m, "%s\n", str);
+
+       return 0;
+}
+
+static struct seq_operations ftrace_graph_seq_ops = {
+       .start = g_start,
+       .next = g_next,
+       .stop = g_stop,
+       .show = g_show,
+};
+
+static int
+ftrace_graph_open(struct inode *inode, struct file *file)
+{
+       int ret = 0;
+
+       if (unlikely(ftrace_disabled))
+               return -ENODEV;
+
+       mutex_lock(&graph_lock);
+       if ((file->f_mode & FMODE_WRITE) &&
+           !(file->f_flags & O_APPEND)) {
+               ftrace_graph_count = 0;
+               memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
+       }
+
+       if (file->f_mode & FMODE_READ) {
+               ret = seq_open(file, &ftrace_graph_seq_ops);
+               if (!ret) {
+                       struct seq_file *m = file->private_data;
+                       m->private = ftrace_graph_funcs;
+               }
+       } else
+               file->private_data = ftrace_graph_funcs;
+       mutex_unlock(&graph_lock);
+
+       return ret;
+}
+
+static ssize_t
+ftrace_graph_read(struct file *file, char __user *ubuf,
+                      size_t cnt, loff_t *ppos)
+{
+       if (file->f_mode & FMODE_READ)
+               return seq_read(file, ubuf, cnt, ppos);
+       else
+               return -EPERM;
+}
+
+static int
+ftrace_set_func(unsigned long *array, int idx, char *buffer)
+{
+       char str[KSYM_SYMBOL_LEN];
+       struct dyn_ftrace *rec;
+       struct ftrace_page *pg;
+       int found = 0;
+       int i, j;
+
+       if (ftrace_disabled)
+               return -ENODEV;
+
+       /* should not be called from interrupt context */
+       spin_lock(&ftrace_lock);
+
+       for (pg = ftrace_pages_start; pg; pg = pg->next) {
+               for (i = 0; i < pg->index; i++) {
+                       rec = &pg->records[i];
+
+                       if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
+                               continue;
+
+                       kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+                       if (strcmp(str, buffer) == 0) {
+                               found = 1;
+                               for (j = 0; j < idx; j++)
+                                       if (array[j] == rec->ip) {
+                                               found = 0;
+                                               break;
+                                       }
+                               if (found)
+                                       array[idx] = rec->ip;
+                               break;
+                       }
+               }
+       }
+       spin_unlock(&ftrace_lock);
+
+       return found ? 0 : -EINVAL;
+}
+
+static ssize_t
+ftrace_graph_write(struct file *file, const char __user *ubuf,
+                  size_t cnt, loff_t *ppos)
+{
+       unsigned char buffer[FTRACE_BUFF_MAX+1];
+       unsigned long *array;
+       size_t read = 0;
+       ssize_t ret;
+       int index = 0;
+       char ch;
+
+       if (!cnt || cnt < 0)
+               return 0;
+
+       mutex_lock(&graph_lock);
+
+       if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       if (file->f_mode & FMODE_READ) {
+               struct seq_file *m = file->private_data;
+               array = m->private;
+       } else
+               array = file->private_data;
+
+       ret = get_user(ch, ubuf++);
+       if (ret)
+               goto out;
+       read++;
+       cnt--;
+
+       /* skip white space */
+       while (cnt && isspace(ch)) {
+               ret = get_user(ch, ubuf++);
+               if (ret)
+                       goto out;
+               read++;
+               cnt--;
+       }
+
+       if (isspace(ch)) {
+               *ppos += read;
+               ret = read;
+               goto out;
+       }
+
+       while (cnt && !isspace(ch)) {
+               if (index < FTRACE_BUFF_MAX)
+                       buffer[index++] = ch;
+               else {
+                       ret = -EINVAL;
+                       goto out;
+               }
+               ret = get_user(ch, ubuf++);
+               if (ret)
+                       goto out;
+               read++;
+               cnt--;
+       }
+       buffer[index] = 0;
+
+       /* we allow only one at a time */
+       ret = ftrace_set_func(array, ftrace_graph_count, buffer);
+       if (ret)
+               goto out;
+
+       ftrace_graph_count++;
+
+       file->f_pos += read;
+
+       ret = read;
+ out:
+       mutex_unlock(&graph_lock);
+
+       return ret;
+}
+
+static const struct file_operations ftrace_graph_fops = {
+       .open = ftrace_graph_open,
+       .read = ftrace_graph_read,
+       .write = ftrace_graph_write,
+};
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
+{
+       struct dentry *entry;
 
        entry = debugfs_create_file("available_filter_functions", 0444,
                                    d_tracer, NULL, &ftrace_avail_fops);
@@ -1263,12 +1572,20 @@ static __init int ftrace_init_debugfs(void)
                pr_warning("Could not create debugfs "
                           "'set_ftrace_notrace' entry\n");
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
+                                   NULL,
+                                   &ftrace_graph_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'set_graph_function' entry\n");
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
        return 0;
 }
 
-fs_initcall(ftrace_init_debugfs);
-
-static int ftrace_convert_nops(unsigned long *start,
+static int ftrace_convert_nops(struct module *mod,
+                              unsigned long *start,
                               unsigned long *end)
 {
        unsigned long *p;
@@ -1279,23 +1596,32 @@ static int ftrace_convert_nops(unsigned long *start,
        p = start;
        while (p < end) {
                addr = ftrace_call_adjust(*p++);
+               /*
+                * Some architecture linkers will pad between
+                * the different mcount_loc sections of different
+                * object files to satisfy alignments.
+                * Skip any NULL pointers.
+                */
+               if (!addr)
+                       continue;
                ftrace_record_ip(addr);
        }
 
        /* disable interrupts to prevent kstop machine */
        local_irq_save(flags);
-       ftrace_update_code();
+       ftrace_update_code(mod);
        local_irq_restore(flags);
        mutex_unlock(&ftrace_start_lock);
 
        return 0;
 }
 
-void ftrace_init_module(unsigned long *start, unsigned long *end)
+void ftrace_init_module(struct module *mod,
+                       unsigned long *start, unsigned long *end)
 {
        if (ftrace_disabled || start == end)
                return;
-       ftrace_convert_nops(start, end);
+       ftrace_convert_nops(mod, start, end);
 }
 
 extern unsigned long __start_mcount_loc[];
@@ -1325,7 +1651,8 @@ void __init ftrace_init(void)
 
        last_ftrace_enabled = ftrace_enabled = 1;
 
-       ret = ftrace_convert_nops(__start_mcount_loc,
+       ret = ftrace_convert_nops(NULL,
+                                 __start_mcount_loc,
                                  __stop_mcount_loc);
 
        return;
@@ -1342,12 +1669,186 @@ static int __init ftrace_nodyn_init(void)
 }
 device_initcall(ftrace_nodyn_init);
 
-# define ftrace_startup()              do { } while (0)
-# define ftrace_shutdown()             do { } while (0)
+static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
+static inline void ftrace_startup_enable(int command) { }
+/* Keep as macros so we do not need to define the commands */
+# define ftrace_startup(command)       do { } while (0)
+# define ftrace_shutdown(command)      do { } while (0)
 # define ftrace_startup_sysctl()       do { } while (0)
 # define ftrace_shutdown_sysctl()      do { } while (0)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
+static ssize_t
+ftrace_pid_read(struct file *file, char __user *ubuf,
+                      size_t cnt, loff_t *ppos)
+{
+       char buf[64];
+       int r;
+
+       if (ftrace_pid_trace == ftrace_swapper_pid)
+               r = sprintf(buf, "swapper tasks\n");
+       else if (ftrace_pid_trace)
+               r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace));
+       else
+               r = sprintf(buf, "no pid\n");
+
+       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static void clear_ftrace_swapper(void)
+{
+       struct task_struct *p;
+       int cpu;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               p = idle_task(cpu);
+               clear_tsk_trace_trace(p);
+       }
+       put_online_cpus();
+}
+
+static void set_ftrace_swapper(void)
+{
+       struct task_struct *p;
+       int cpu;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               p = idle_task(cpu);
+               set_tsk_trace_trace(p);
+       }
+       put_online_cpus();
+}
+
+static void clear_ftrace_pid(struct pid *pid)
+{
+       struct task_struct *p;
+
+       do_each_pid_task(pid, PIDTYPE_PID, p) {
+               clear_tsk_trace_trace(p);
+       } while_each_pid_task(pid, PIDTYPE_PID, p);
+       put_pid(pid);
+}
+
+static void set_ftrace_pid(struct pid *pid)
+{
+       struct task_struct *p;
+
+       do_each_pid_task(pid, PIDTYPE_PID, p) {
+               set_tsk_trace_trace(p);
+       } while_each_pid_task(pid, PIDTYPE_PID, p);
+}
+
+static void clear_ftrace_pid_task(struct pid **pid)
+{
+       if (*pid == ftrace_swapper_pid)
+               clear_ftrace_swapper();
+       else
+               clear_ftrace_pid(*pid);
+
+       *pid = NULL;
+}
+
+static void set_ftrace_pid_task(struct pid *pid)
+{
+       if (pid == ftrace_swapper_pid)
+               set_ftrace_swapper();
+       else
+               set_ftrace_pid(pid);
+}
+
+static ssize_t
+ftrace_pid_write(struct file *filp, const char __user *ubuf,
+                  size_t cnt, loff_t *ppos)
+{
+       struct pid *pid;
+       char buf[64];
+       long val;
+       int ret;
+
+       if (cnt >= sizeof(buf))
+               return -EINVAL;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       ret = strict_strtol(buf, 10, &val);
+       if (ret < 0)
+               return ret;
+
+       mutex_lock(&ftrace_start_lock);
+       if (val < 0) {
+               /* disable pid tracing */
+               if (!ftrace_pid_trace)
+                       goto out;
+
+               clear_ftrace_pid_task(&ftrace_pid_trace);
+
+       } else {
+               /* swapper task is special */
+               if (!val) {
+                       pid = ftrace_swapper_pid;
+                       if (pid == ftrace_pid_trace)
+                               goto out;
+               } else {
+                       pid = find_get_pid(val);
+
+                       if (pid == ftrace_pid_trace) {
+                               put_pid(pid);
+                               goto out;
+                       }
+               }
+
+               if (ftrace_pid_trace)
+                       clear_ftrace_pid_task(&ftrace_pid_trace);
+
+               if (!pid)
+                       goto out;
+
+               ftrace_pid_trace = pid;
+
+               set_ftrace_pid_task(ftrace_pid_trace);
+       }
+
+       /* update the function call */
+       ftrace_update_pid_func();
+       ftrace_startup_enable(0);
+
+ out:
+       mutex_unlock(&ftrace_start_lock);
+
+       return cnt;
+}
+
+static struct file_operations ftrace_pid_fops = {
+       .read = ftrace_pid_read,
+       .write = ftrace_pid_write,
+};
+
+static __init int ftrace_init_debugfs(void)
+{
+       struct dentry *d_tracer;
+       struct dentry *entry;
+
+       d_tracer = tracing_init_dentry();
+       if (!d_tracer)
+               return 0;
+
+       ftrace_init_dyn_debugfs(d_tracer);
+
+       entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
+                                   NULL, &ftrace_pid_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'set_ftrace_pid' entry\n");
+       return 0;
+}
+
+fs_initcall(ftrace_init_debugfs);
+
 /**
  * ftrace_kill - kill ftrace
  *
@@ -1381,10 +1882,11 @@ int register_ftrace_function(struct ftrace_ops *ops)
                return -1;
 
        mutex_lock(&ftrace_sysctl_lock);
+
        ret = __register_ftrace_function(ops);
-       ftrace_startup();
-       mutex_unlock(&ftrace_sysctl_lock);
+       ftrace_startup(0);
 
+       mutex_unlock(&ftrace_sysctl_lock);
        return ret;
 }
 
@@ -1400,7 +1902,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 
        mutex_lock(&ftrace_sysctl_lock);
        ret = __unregister_ftrace_function(ops);
-       ftrace_shutdown();
+       ftrace_shutdown(0);
        mutex_unlock(&ftrace_sysctl_lock);
 
        return ret;
@@ -1449,3 +1951,153 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
        return ret;
 }
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+static atomic_t ftrace_graph_active;
+
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+{
+       return 0;
+}
+
+/* The callbacks that hook a function */
+trace_func_graph_ret_t ftrace_graph_return =
+                       (trace_func_graph_ret_t)ftrace_stub;
+trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
+
+/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
+static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
+{
+       int i;
+       int ret = 0;
+       unsigned long flags;
+       int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
+       struct task_struct *g, *t;
+
+       for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
+               ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
+                                       * sizeof(struct ftrace_ret_stack),
+                                       GFP_KERNEL);
+               if (!ret_stack_list[i]) {
+                       start = 0;
+                       end = i;
+                       ret = -ENOMEM;
+                       goto free;
+               }
+       }
+
+       read_lock_irqsave(&tasklist_lock, flags);
+       do_each_thread(g, t) {
+               if (start == end) {
+                       ret = -EAGAIN;
+                       goto unlock;
+               }
+
+               if (t->ret_stack == NULL) {
+                       t->curr_ret_stack = -1;
+                       /* Make sure IRQs see the -1 first: */
+                       barrier();
+                       t->ret_stack = ret_stack_list[start++];
+                       atomic_set(&t->tracing_graph_pause, 0);
+                       atomic_set(&t->trace_overrun, 0);
+               }
+       } while_each_thread(g, t);
+
+unlock:
+       read_unlock_irqrestore(&tasklist_lock, flags);
+free:
+       for (i = start; i < end; i++)
+               kfree(ret_stack_list[i]);
+       return ret;
+}
+
+/* Allocate a return stack for each task */
+static int start_graph_tracing(void)
+{
+       struct ftrace_ret_stack **ret_stack_list;
+       int ret;
+
+       ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
+                               sizeof(struct ftrace_ret_stack *),
+                               GFP_KERNEL);
+
+       if (!ret_stack_list)
+               return -ENOMEM;
+
+       do {
+               ret = alloc_retstack_tasklist(ret_stack_list);
+       } while (ret == -EAGAIN);
+
+       kfree(ret_stack_list);
+       return ret;
+}
+
+int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+                       trace_func_graph_ent_t entryfunc)
+{
+       int ret = 0;
+
+       mutex_lock(&ftrace_sysctl_lock);
+
+       atomic_inc(&ftrace_graph_active);
+       ret = start_graph_tracing();
+       if (ret) {
+               atomic_dec(&ftrace_graph_active);
+               goto out;
+       }
+
+       ftrace_graph_return = retfunc;
+       ftrace_graph_entry = entryfunc;
+
+       ftrace_startup(FTRACE_START_FUNC_RET);
+
+out:
+       mutex_unlock(&ftrace_sysctl_lock);
+       return ret;
+}
+
+void unregister_ftrace_graph(void)
+{
+       mutex_lock(&ftrace_sysctl_lock);
+
+       atomic_dec(&ftrace_graph_active);
+       ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
+       ftrace_graph_entry = ftrace_graph_entry_stub;
+       ftrace_shutdown(FTRACE_STOP_FUNC_RET);
+
+       mutex_unlock(&ftrace_sysctl_lock);
+}
+
+/* Allocate a return stack for newly created task */
+void ftrace_graph_init_task(struct task_struct *t)
+{
+       if (atomic_read(&ftrace_graph_active)) {
+               t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+                               * sizeof(struct ftrace_ret_stack),
+                               GFP_KERNEL);
+               if (!t->ret_stack)
+                       return;
+               t->curr_ret_stack = -1;
+               atomic_set(&t->tracing_graph_pause, 0);
+               atomic_set(&t->trace_overrun, 0);
+       } else
+               t->ret_stack = NULL;
+}
+
+void ftrace_graph_exit_task(struct task_struct *t)
+{
+       struct ftrace_ret_stack *ret_stack = t->ret_stack;
+
+       t->ret_stack = NULL;
+       /* NULL must become visible to IRQs before we free it: */
+       barrier();
+
+       kfree(ret_stack);
+}
+
+void ftrace_graph_stop(void)
+{
+       ftrace_stop();
+}
+#endif
+
index 668bbb5ef2bd154dc8084fe4480bfd251e484f41..7f69cfeaadf76a1bc002df343d2006ed20204a5e 100644 (file)
 
 #include "trace.h"
 
-/* Global flag to disable all recording to ring buffers */
-static int ring_buffers_off __read_mostly;
+/*
+ * A fast way to enable or disable all ring buffers is to
+ * call tracing_on or tracing_off. Turning off the ring buffers
+ * prevents all ring buffers from being recorded to.
+ * Turning this switch on, makes it OK to write to the
+ * ring buffer, if the ring buffer is enabled itself.
+ *
+ * There's three layers that must be on in order to write
+ * to the ring buffer.
+ *
+ * 1) This global flag must be set.
+ * 2) The ring buffer must be enabled for recording.
+ * 3) The per cpu buffer must be enabled for recording.
+ *
+ * In case of an anomaly, this global flag has a bit set that
+ * will permantly disable all ring buffers.
+ */
+
+/*
+ * Global flag to disable all recording to ring buffers
+ *  This has two bits: ON, DISABLED
+ *
+ *  ON   DISABLED
+ * ---- ----------
+ *   0      0        : ring buffers are off
+ *   1      0        : ring buffers are on
+ *   X      1        : ring buffers are permanently disabled
+ */
+
+enum {
+       RB_BUFFERS_ON_BIT       = 0,
+       RB_BUFFERS_DISABLED_BIT = 1,
+};
+
+enum {
+       RB_BUFFERS_ON           = 1 << RB_BUFFERS_ON_BIT,
+       RB_BUFFERS_DISABLED     = 1 << RB_BUFFERS_DISABLED_BIT,
+};
+
+static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
 
 /**
  * tracing_on - enable all tracing buffers
@@ -29,7 +67,7 @@ static int ring_buffers_off __read_mostly;
  */
 void tracing_on(void)
 {
-       ring_buffers_off = 0;
+       set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
 }
 
 /**
@@ -42,9 +80,22 @@ void tracing_on(void)
  */
 void tracing_off(void)
 {
-       ring_buffers_off = 1;
+       clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
 }
 
+/**
+ * tracing_off_permanent - permanently disable ring buffers
+ *
+ * This function, once called, will disable all ring buffers
+ * permanenty.
+ */
+void tracing_off_permanent(void)
+{
+       set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
+}
+
+#include "trace.h"
+
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 
@@ -144,20 +195,24 @@ void *ring_buffer_event_data(struct ring_buffer_event *event)
 #define TS_MASK                ((1ULL << TS_SHIFT) - 1)
 #define TS_DELTA_TEST  (~TS_MASK)
 
-/*
- * This hack stolen from mm/slob.c.
- * We can store per page timing information in the page frame of the page.
- * Thanks to Peter Zijlstra for suggesting this idea.
- */
-struct buffer_page {
+struct buffer_data_page {
        u64              time_stamp;    /* page time stamp */
-       local_t          write;         /* index for next write */
        local_t          commit;        /* write commited index */
+       unsigned char    data[];        /* data of buffer page */
+};
+
+struct buffer_page {
+       local_t          write;         /* index for next write */
        unsigned         read;          /* index for next read */
        struct list_head list;          /* list of free pages */
-       void *page;                     /* Actual data page */
+       struct buffer_data_page *page;  /* Actual data page */
 };
 
+static void rb_init_page(struct buffer_data_page *bpage)
+{
+       local_set(&bpage->commit, 0);
+}
+
 /*
  * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
  * this issue out.
@@ -179,7 +234,7 @@ static inline int test_time_stamp(u64 delta)
        return 0;
 }
 
-#define BUF_PAGE_SIZE PAGE_SIZE
+#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page))
 
 /*
  * head_page == tail_page && head == tail then buffer is empty.
@@ -187,7 +242,8 @@ static inline int test_time_stamp(u64 delta)
 struct ring_buffer_per_cpu {
        int                             cpu;
        struct ring_buffer              *buffer;
-       spinlock_t                      lock;
+       spinlock_t                      reader_lock; /* serialize readers */
+       raw_spinlock_t                  lock;
        struct lock_class_key           lock_key;
        struct list_head                pages;
        struct buffer_page              *head_page;     /* read from head */
@@ -221,32 +277,16 @@ struct ring_buffer_iter {
        u64                             read_stamp;
 };
 
+/* buffer may be either ring_buffer or ring_buffer_per_cpu */
 #define RB_WARN_ON(buffer, cond)                               \
-       do {                                                    \
-               if (unlikely(cond)) {                           \
-                       atomic_inc(&buffer->record_disabled);   \
-                       WARN_ON(1);                             \
-               }                                               \
-       } while (0)
-
-#define RB_WARN_ON_RET(buffer, cond)                           \
-       do {                                                    \
-               if (unlikely(cond)) {                           \
-                       atomic_inc(&buffer->record_disabled);   \
-                       WARN_ON(1);                             \
-                       return -1;                              \
-               }                                               \
-       } while (0)
-
-#define RB_WARN_ON_ONCE(buffer, cond)                          \
-       do {                                                    \
-               static int once;                                \
-               if (unlikely(cond) && !once) {                  \
-                       once++;                                 \
+       ({                                                      \
+               int _____ret = unlikely(cond);                  \
+               if (_____ret) {                                 \
                        atomic_inc(&buffer->record_disabled);   \
                        WARN_ON(1);                             \
                }                                               \
-       } while (0)
+               _____ret;                                       \
+       })
 
 /**
  * check_pages - integrity check of buffer pages
@@ -258,16 +298,20 @@ struct ring_buffer_iter {
 static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
 {
        struct list_head *head = &cpu_buffer->pages;
-       struct buffer_page *page, *tmp;
+       struct buffer_page *bpage, *tmp;
 
-       RB_WARN_ON_RET(cpu_buffer, head->next->prev != head);
-       RB_WARN_ON_RET(cpu_buffer, head->prev->next != head);
+       if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
+               return -1;
+       if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
+               return -1;
 
-       list_for_each_entry_safe(page, tmp, head, list) {
-               RB_WARN_ON_RET(cpu_buffer,
-                              page->list.next->prev != &page->list);
-               RB_WARN_ON_RET(cpu_buffer,
-                              page->list.prev->next != &page->list);
+       list_for_each_entry_safe(bpage, tmp, head, list) {
+               if (RB_WARN_ON(cpu_buffer,
+                              bpage->list.next->prev != &bpage->list))
+                       return -1;
+               if (RB_WARN_ON(cpu_buffer,
+                              bpage->list.prev->next != &bpage->list))
+                       return -1;
        }
 
        return 0;
@@ -277,22 +321,23 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
                             unsigned nr_pages)
 {
        struct list_head *head = &cpu_buffer->pages;
-       struct buffer_page *page, *tmp;
+       struct buffer_page *bpage, *tmp;
        unsigned long addr;
        LIST_HEAD(pages);
        unsigned i;
 
        for (i = 0; i < nr_pages; i++) {
-               page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+               bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
                                    GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
-               if (!page)
+               if (!bpage)
                        goto free_pages;
-               list_add(&page->list, &pages);
+               list_add(&bpage->list, &pages);
 
                addr = __get_free_page(GFP_KERNEL);
                if (!addr)
                        goto free_pages;
-               page->page = (void *)addr;
+               bpage->page = (void *)addr;
+               rb_init_page(bpage->page);
        }
 
        list_splice(&pages, head);
@@ -302,9 +347,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
        return 0;
 
  free_pages:
-       list_for_each_entry_safe(page, tmp, &pages, list) {
-               list_del_init(&page->list);
-               free_buffer_page(page);
+       list_for_each_entry_safe(bpage, tmp, &pages, list) {
+               list_del_init(&bpage->list);
+               free_buffer_page(bpage);
        }
        return -ENOMEM;
 }
@@ -313,7 +358,7 @@ static struct ring_buffer_per_cpu *
 rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 {
        struct ring_buffer_per_cpu *cpu_buffer;
-       struct buffer_page *page;
+       struct buffer_page *bpage;
        unsigned long addr;
        int ret;
 
@@ -324,19 +369,21 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 
        cpu_buffer->cpu = cpu;
        cpu_buffer->buffer = buffer;
-       spin_lock_init(&cpu_buffer->lock);
+       spin_lock_init(&cpu_buffer->reader_lock);
+       cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
        INIT_LIST_HEAD(&cpu_buffer->pages);
 
-       page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+       bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
                            GFP_KERNEL, cpu_to_node(cpu));
-       if (!page)
+       if (!bpage)
                goto fail_free_buffer;
 
-       cpu_buffer->reader_page = page;
+       cpu_buffer->reader_page = bpage;
        addr = __get_free_page(GFP_KERNEL);
        if (!addr)
                goto fail_free_reader;
-       page->page = (void *)addr;
+       bpage->page = (void *)addr;
+       rb_init_page(bpage->page);
 
        INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
 
@@ -361,14 +408,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 {
        struct list_head *head = &cpu_buffer->pages;
-       struct buffer_page *page, *tmp;
+       struct buffer_page *bpage, *tmp;
 
        list_del_init(&cpu_buffer->reader_page->list);
        free_buffer_page(cpu_buffer->reader_page);
 
-       list_for_each_entry_safe(page, tmp, head, list) {
-               list_del_init(&page->list);
-               free_buffer_page(page);
+       list_for_each_entry_safe(bpage, tmp, head, list) {
+               list_del_init(&bpage->list);
+               free_buffer_page(bpage);
        }
        kfree(cpu_buffer);
 }
@@ -465,7 +512,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
 static void
 rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
 {
-       struct buffer_page *page;
+       struct buffer_page *bpage;
        struct list_head *p;
        unsigned i;
 
@@ -473,13 +520,15 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
        synchronize_sched();
 
        for (i = 0; i < nr_pages; i++) {
-               BUG_ON(list_empty(&cpu_buffer->pages));
+               if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
+                       return;
                p = cpu_buffer->pages.next;
-               page = list_entry(p, struct buffer_page, list);
-               list_del_init(&page->list);
-               free_buffer_page(page);
+               bpage = list_entry(p, struct buffer_page, list);
+               list_del_init(&bpage->list);
+               free_buffer_page(bpage);
        }
-       BUG_ON(list_empty(&cpu_buffer->pages));
+       if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
+               return;
 
        rb_reset_cpu(cpu_buffer);
 
@@ -493,7 +542,7 @@ static void
 rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
                struct list_head *pages, unsigned nr_pages)
 {
-       struct buffer_page *page;
+       struct buffer_page *bpage;
        struct list_head *p;
        unsigned i;
 
@@ -501,11 +550,12 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
        synchronize_sched();
 
        for (i = 0; i < nr_pages; i++) {
-               BUG_ON(list_empty(pages));
+               if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
+                       return;
                p = pages->next;
-               page = list_entry(p, struct buffer_page, list);
-               list_del_init(&page->list);
-               list_add_tail(&page->list, &cpu_buffer->pages);
+               bpage = list_entry(p, struct buffer_page, list);
+               list_del_init(&bpage->list);
+               list_add_tail(&bpage->list, &cpu_buffer->pages);
        }
        rb_reset_cpu(cpu_buffer);
 
@@ -532,7 +582,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 {
        struct ring_buffer_per_cpu *cpu_buffer;
        unsigned nr_pages, rm_pages, new_pages;
-       struct buffer_page *page, *tmp;
+       struct buffer_page *bpage, *tmp;
        unsigned long buffer_size;
        unsigned long addr;
        LIST_HEAD(pages);
@@ -562,7 +612,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
        if (size < buffer_size) {
 
                /* easy case, just free pages */
-               BUG_ON(nr_pages >= buffer->pages);
+               if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) {
+                       mutex_unlock(&buffer->mutex);
+                       return -1;
+               }
 
                rm_pages = buffer->pages - nr_pages;
 
@@ -581,21 +634,26 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
         * add these pages to the cpu_buffers. Otherwise we just free
         * them all and return -ENOMEM;
         */
-       BUG_ON(nr_pages <= buffer->pages);
+       if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) {
+               mutex_unlock(&buffer->mutex);
+               return -1;
+       }
+
        new_pages = nr_pages - buffer->pages;
 
        for_each_buffer_cpu(buffer, cpu) {
                for (i = 0; i < new_pages; i++) {
-                       page = kzalloc_node(ALIGN(sizeof(*page),
+                       bpage = kzalloc_node(ALIGN(sizeof(*bpage),
                                                  cache_line_size()),
                                            GFP_KERNEL, cpu_to_node(cpu));
-                       if (!page)
+                       if (!bpage)
                                goto free_pages;
-                       list_add(&page->list, &pages);
+                       list_add(&bpage->list, &pages);
                        addr = __get_free_page(GFP_KERNEL);
                        if (!addr)
                                goto free_pages;
-                       page->page = (void *)addr;
+                       bpage->page = (void *)addr;
+                       rb_init_page(bpage->page);
                }
        }
 
@@ -604,7 +662,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
                rb_insert_pages(cpu_buffer, &pages, new_pages);
        }
 
-       BUG_ON(!list_empty(&pages));
+       if (RB_WARN_ON(buffer, !list_empty(&pages))) {
+               mutex_unlock(&buffer->mutex);
+               return -1;
+       }
 
  out:
        buffer->pages = nr_pages;
@@ -613,9 +674,9 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
        return size;
 
  free_pages:
-       list_for_each_entry_safe(page, tmp, &pages, list) {
-               list_del_init(&page->list);
-               free_buffer_page(page);
+       list_for_each_entry_safe(bpage, tmp, &pages, list) {
+               list_del_init(&bpage->list);
+               free_buffer_page(bpage);
        }
        mutex_unlock(&buffer->mutex);
        return -ENOMEM;
@@ -626,9 +687,15 @@ static inline int rb_null_event(struct ring_buffer_event *event)
        return event->type == RINGBUF_TYPE_PADDING;
 }
 
-static inline void *__rb_page_index(struct buffer_page *page, unsigned index)
+static inline void *
+__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
+{
+       return bpage->data + index;
+}
+
+static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
 {
-       return page->page + index;
+       return bpage->page->data + index;
 }
 
 static inline struct ring_buffer_event *
@@ -658,7 +725,7 @@ static inline unsigned rb_page_write(struct buffer_page *bpage)
 
 static inline unsigned rb_page_commit(struct buffer_page *bpage)
 {
-       return local_read(&bpage->commit);
+       return local_read(&bpage->page->commit);
 }
 
 /* Size is determined by what has been commited */
@@ -693,7 +760,8 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
             head += rb_event_length(event)) {
 
                event = __rb_page_index(cpu_buffer->head_page, head);
-               BUG_ON(rb_null_event(event));
+               if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
+                       return;
                /* Only count data entries */
                if (event->type != RINGBUF_TYPE_DATA)
                        continue;
@@ -703,14 +771,14 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
 }
 
 static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
-                              struct buffer_page **page)
+                              struct buffer_page **bpage)
 {
-       struct list_head *p = (*page)->list.next;
+       struct list_head *p = (*bpage)->list.next;
 
        if (p == &cpu_buffer->pages)
                p = p->next;
 
-       *page = list_entry(p, struct buffer_page, list);
+       *bpage = list_entry(p, struct buffer_page, list);
 }
 
 static inline unsigned
@@ -746,16 +814,18 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
        addr &= PAGE_MASK;
 
        while (cpu_buffer->commit_page->page != (void *)addr) {
-               RB_WARN_ON(cpu_buffer,
-                          cpu_buffer->commit_page == cpu_buffer->tail_page);
-               cpu_buffer->commit_page->commit =
+               if (RB_WARN_ON(cpu_buffer,
+                         cpu_buffer->commit_page == cpu_buffer->tail_page))
+                       return;
+               cpu_buffer->commit_page->page->commit =
                        cpu_buffer->commit_page->write;
                rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
-               cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
+               cpu_buffer->write_stamp =
+                       cpu_buffer->commit_page->page->time_stamp;
        }
 
        /* Now set the commit to the event's index */
-       local_set(&cpu_buffer->commit_page->commit, index);
+       local_set(&cpu_buffer->commit_page->page->commit, index);
 }
 
 static inline void
@@ -770,16 +840,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
         * assign the commit to the tail.
         */
        while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
-               cpu_buffer->commit_page->commit =
+               cpu_buffer->commit_page->page->commit =
                        cpu_buffer->commit_page->write;
                rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
-               cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
+               cpu_buffer->write_stamp =
+                       cpu_buffer->commit_page->page->time_stamp;
                /* add barrier to keep gcc from optimizing too much */
                barrier();
        }
        while (rb_commit_index(cpu_buffer) !=
               rb_page_write(cpu_buffer->commit_page)) {
-               cpu_buffer->commit_page->commit =
+               cpu_buffer->commit_page->page->commit =
                        cpu_buffer->commit_page->write;
                barrier();
        }
@@ -787,7 +858,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 
 static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
-       cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp;
+       cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
        cpu_buffer->reader_page->read = 0;
 }
 
@@ -806,7 +877,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
        else
                rb_inc_page(cpu_buffer, &iter->head_page);
 
-       iter->read_stamp = iter->head_page->time_stamp;
+       iter->read_stamp = iter->head_page->page->time_stamp;
        iter->head = 0;
 }
 
@@ -894,7 +965,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
        if (write > BUF_PAGE_SIZE) {
                struct buffer_page *next_page = tail_page;
 
-               spin_lock_irqsave(&cpu_buffer->lock, flags);
+               local_irq_save(flags);
+               __raw_spin_lock(&cpu_buffer->lock);
 
                rb_inc_page(cpu_buffer, &next_page);
 
@@ -902,7 +974,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                reader_page = cpu_buffer->reader_page;
 
                /* we grabbed the lock before incrementing */
-               RB_WARN_ON(cpu_buffer, next_page == reader_page);
+               if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
+                       goto out_unlock;
 
                /*
                 * If for some reason, we had an interrupt storm that made
@@ -940,12 +1013,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                 */
                if (tail_page == cpu_buffer->tail_page) {
                        local_set(&next_page->write, 0);
-                       local_set(&next_page->commit, 0);
+                       local_set(&next_page->page->commit, 0);
                        cpu_buffer->tail_page = next_page;
 
                        /* reread the time stamp */
                        *ts = ring_buffer_time_stamp(cpu_buffer->cpu);
-                       cpu_buffer->tail_page->time_stamp = *ts;
+                       cpu_buffer->tail_page->page->time_stamp = *ts;
                }
 
                /*
@@ -970,7 +1043,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                        rb_set_commit_to_write(cpu_buffer);
                }
 
-               spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+               __raw_spin_unlock(&cpu_buffer->lock);
+               local_irq_restore(flags);
 
                /* fail and let the caller try again */
                return ERR_PTR(-EAGAIN);
@@ -978,7 +1052,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
        /* We reserved something on the buffer */
 
-       BUG_ON(write > BUF_PAGE_SIZE);
+       if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
+               return NULL;
 
        event = __rb_page_index(tail_page, tail);
        rb_update_event(event, type, length);
@@ -988,12 +1063,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
         * this page's time stamp.
         */
        if (!tail && rb_is_commit(cpu_buffer, event))
-               cpu_buffer->commit_page->time_stamp = *ts;
+               cpu_buffer->commit_page->page->time_stamp = *ts;
 
        return event;
 
  out_unlock:
-       spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+       __raw_spin_unlock(&cpu_buffer->lock);
+       local_irq_restore(flags);
        return NULL;
 }
 
@@ -1038,7 +1114,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
                        event->time_delta = *delta & TS_MASK;
                        event->array[0] = *delta >> TS_SHIFT;
                } else {
-                       cpu_buffer->commit_page->time_stamp = *ts;
+                       cpu_buffer->commit_page->page->time_stamp = *ts;
                        event->time_delta = 0;
                        event->array[0] = 0;
                }
@@ -1076,10 +1152,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
         * storm or we have something buggy.
         * Bail!
         */
-       if (unlikely(++nr_loops > 1000)) {
-               RB_WARN_ON(cpu_buffer, 1);
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
                return NULL;
-       }
 
        ts = ring_buffer_time_stamp(cpu_buffer->cpu);
 
@@ -1175,15 +1249,14 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
        struct ring_buffer_event *event;
        int cpu, resched;
 
-       if (ring_buffers_off)
+       if (ring_buffer_flags != RB_BUFFERS_ON)
                return NULL;
 
        if (atomic_read(&buffer->record_disabled))
                return NULL;
 
        /* If we are tracing schedule, we don't want to recurse */
-       resched = need_resched();
-       preempt_disable_notrace();
+       resched = ftrace_preempt_disable();
 
        cpu = raw_smp_processor_id();
 
@@ -1214,10 +1287,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
        return event;
 
  out:
-       if (resched)
-               preempt_enable_no_resched_notrace();
-       else
-               preempt_enable_notrace();
+       ftrace_preempt_enable(resched);
        return NULL;
 }
 
@@ -1259,12 +1329,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
        /*
         * Only the last preempt count needs to restore preemption.
         */
-       if (preempt_count() == 1) {
-               if (per_cpu(rb_need_resched, cpu))
-                       preempt_enable_no_resched_notrace();
-               else
-                       preempt_enable_notrace();
-       } else
+       if (preempt_count() == 1)
+               ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
+       else
                preempt_enable_no_resched_notrace();
 
        return 0;
@@ -1294,14 +1361,13 @@ int ring_buffer_write(struct ring_buffer *buffer,
        int ret = -EBUSY;
        int cpu, resched;
 
-       if (ring_buffers_off)
+       if (ring_buffer_flags != RB_BUFFERS_ON)
                return -EBUSY;
 
        if (atomic_read(&buffer->record_disabled))
                return -EBUSY;
 
-       resched = need_resched();
-       preempt_disable_notrace();
+       resched = ftrace_preempt_disable();
 
        cpu = raw_smp_processor_id();
 
@@ -1327,10 +1393,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 
        ret = 0;
  out:
-       if (resched)
-               preempt_enable_no_resched_notrace();
-       else
-               preempt_enable_notrace();
+       ftrace_preempt_enable(resched);
 
        return ret;
 }
@@ -1489,14 +1552,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
        return overruns;
 }
 
-/**
- * ring_buffer_iter_reset - reset an iterator
- * @iter: The iterator to reset
- *
- * Resets the iterator, so that it will start from the beginning
- * again.
- */
-void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+static void rb_iter_reset(struct ring_buffer_iter *iter)
 {
        struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
 
@@ -1511,7 +1567,24 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
        if (iter->head)
                iter->read_stamp = cpu_buffer->read_stamp;
        else
-               iter->read_stamp = iter->head_page->time_stamp;
+               iter->read_stamp = iter->head_page->page->time_stamp;
+}
+
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+       struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+       unsigned long flags;
+
+       spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+       rb_iter_reset(iter);
+       spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 }
 
 /**
@@ -1597,7 +1670,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
        unsigned long flags;
        int nr_loops = 0;
 
-       spin_lock_irqsave(&cpu_buffer->lock, flags);
+       local_irq_save(flags);
+       __raw_spin_lock(&cpu_buffer->lock);
 
  again:
        /*
@@ -1606,8 +1680,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
         * a case where we will loop three times. There should be no
         * reason to loop four times (that I know of).
         */
-       if (unlikely(++nr_loops > 3)) {
-               RB_WARN_ON(cpu_buffer, 1);
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
                reader = NULL;
                goto out;
        }
@@ -1619,8 +1692,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
                goto out;
 
        /* Never should we have an index greater than the size */
-       RB_WARN_ON(cpu_buffer,
-                  cpu_buffer->reader_page->read > rb_page_size(reader));
+       if (RB_WARN_ON(cpu_buffer,
+                      cpu_buffer->reader_page->read > rb_page_size(reader)))
+               goto out;
 
        /* check if we caught up to the tail */
        reader = NULL;
@@ -1637,7 +1711,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
        cpu_buffer->reader_page->list.prev = reader->list.prev;
 
        local_set(&cpu_buffer->reader_page->write, 0);
-       local_set(&cpu_buffer->reader_page->commit, 0);
+       local_set(&cpu_buffer->reader_page->page->commit, 0);
 
        /* Make the reader page now replace the head */
        reader->list.prev->next = &cpu_buffer->reader_page->list;
@@ -1659,7 +1733,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
        goto again;
 
  out:
-       spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+       __raw_spin_unlock(&cpu_buffer->lock);
+       local_irq_restore(flags);
 
        return reader;
 }
@@ -1673,7 +1748,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
        reader = rb_get_reader_page(cpu_buffer);
 
        /* This function should not be called when buffer is empty */
-       BUG_ON(!reader);
+       if (RB_WARN_ON(cpu_buffer, !reader))
+               return;
 
        event = rb_reader_event(cpu_buffer);
 
@@ -1700,7 +1776,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
         * Check if we are at the end of the buffer.
         */
        if (iter->head >= rb_page_size(iter->head_page)) {
-               BUG_ON(iter->head_page == cpu_buffer->commit_page);
+               if (RB_WARN_ON(buffer,
+                              iter->head_page == cpu_buffer->commit_page))
+                       return;
                rb_inc_iter(iter);
                return;
        }
@@ -1713,8 +1791,10 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
         * This should not be called to advance the header if we are
         * at the tail of the buffer.
         */
-       BUG_ON((iter->head_page == cpu_buffer->commit_page) &&
-              (iter->head + length > rb_commit_index(cpu_buffer)));
+       if (RB_WARN_ON(cpu_buffer,
+                      (iter->head_page == cpu_buffer->commit_page) &&
+                      (iter->head + length > rb_commit_index(cpu_buffer))))
+               return;
 
        rb_update_iter_read_stamp(iter, event);
 
@@ -1726,17 +1806,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
                rb_advance_iter(iter);
 }
 
-/**
- * ring_buffer_peek - peek at the next event to be read
- * @buffer: The ring buffer to read
- * @cpu: The cpu to peak at
- * @ts: The timestamp counter of this event.
- *
- * This will return the event that will be read next, but does
- * not consume the data.
- */
-struct ring_buffer_event *
-ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+static struct ring_buffer_event *
+rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 {
        struct ring_buffer_per_cpu *cpu_buffer;
        struct ring_buffer_event *event;
@@ -1757,10 +1828,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
         * can have.  Nesting 10 deep of interrupts is clearly
         * an anomaly.
         */
-       if (unlikely(++nr_loops > 10)) {
-               RB_WARN_ON(cpu_buffer, 1);
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
                return NULL;
-       }
 
        reader = rb_get_reader_page(cpu_buffer);
        if (!reader)
@@ -1798,16 +1867,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
        return NULL;
 }
 
-/**
- * ring_buffer_iter_peek - peek at the next event to be read
- * @iter: The ring buffer iterator
- * @ts: The timestamp counter of this event.
- *
- * This will return the event that will be read next, but does
- * not increment the iterator.
- */
-struct ring_buffer_event *
-ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+static struct ring_buffer_event *
+rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 {
        struct ring_buffer *buffer;
        struct ring_buffer_per_cpu *cpu_buffer;
@@ -1829,10 +1890,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
         * can have. Nesting 10 deep of interrupts is clearly
         * an anomaly.
         */
-       if (unlikely(++nr_loops > 10)) {
-               RB_WARN_ON(cpu_buffer, 1);
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
                return NULL;
-       }
 
        if (rb_per_cpu_empty(cpu_buffer))
                return NULL;
@@ -1868,6 +1927,51 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
        return NULL;
 }
 
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peak at
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+       struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+       struct ring_buffer_event *event;
+       unsigned long flags;
+
+       spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+       event = rb_buffer_peek(buffer, cpu, ts);
+       spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+       return event;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+       struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+       struct ring_buffer_event *event;
+       unsigned long flags;
+
+       spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+       event = rb_iter_peek(iter, ts);
+       spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+       return event;
+}
+
 /**
  * ring_buffer_consume - return an event and consume it
  * @buffer: The ring buffer to get the next event from
@@ -1879,19 +1983,24 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 struct ring_buffer_event *
 ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 {
-       struct ring_buffer_per_cpu *cpu_buffer;
+       struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
        struct ring_buffer_event *event;
+       unsigned long flags;
 
        if (!cpu_isset(cpu, buffer->cpumask))
                return NULL;
 
-       event = ring_buffer_peek(buffer, cpu, ts);
+       spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+       event = rb_buffer_peek(buffer, cpu, ts);
        if (!event)
-               return NULL;
+               goto out;
 
-       cpu_buffer = buffer->buffers[cpu];
        rb_advance_reader(cpu_buffer);
 
+ out:
+       spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
        return event;
 }
 
@@ -1928,9 +2037,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
        atomic_inc(&cpu_buffer->record_disabled);
        synchronize_sched();
 
-       spin_lock_irqsave(&cpu_buffer->lock, flags);
-       ring_buffer_iter_reset(iter);
-       spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+       spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+       __raw_spin_lock(&cpu_buffer->lock);
+       rb_iter_reset(iter);
+       __raw_spin_unlock(&cpu_buffer->lock);
+       spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
        return iter;
 }
@@ -1962,12 +2073,17 @@ struct ring_buffer_event *
 ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
 {
        struct ring_buffer_event *event;
+       struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+       unsigned long flags;
 
-       event = ring_buffer_iter_peek(iter, ts);
+       spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+       event = rb_iter_peek(iter, ts);
        if (!event)
-               return NULL;
+               goto out;
 
        rb_advance_iter(iter);
+ out:
+       spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
        return event;
 }
@@ -1987,7 +2103,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
        cpu_buffer->head_page
                = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
        local_set(&cpu_buffer->head_page->write, 0);
-       local_set(&cpu_buffer->head_page->commit, 0);
+       local_set(&cpu_buffer->head_page->page->commit, 0);
 
        cpu_buffer->head_page->read = 0;
 
@@ -1996,7 +2112,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 
        INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
        local_set(&cpu_buffer->reader_page->write, 0);
-       local_set(&cpu_buffer->reader_page->commit, 0);
+       local_set(&cpu_buffer->reader_page->page->commit, 0);
        cpu_buffer->reader_page->read = 0;
 
        cpu_buffer->overrun = 0;
@@ -2016,11 +2132,15 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
        if (!cpu_isset(cpu, buffer->cpumask))
                return;
 
-       spin_lock_irqsave(&cpu_buffer->lock, flags);
+       spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+       __raw_spin_lock(&cpu_buffer->lock);
 
        rb_reset_cpu(cpu_buffer);
 
-       spin_unlock_irqrestore(&cpu_buffer->lock, flags);
+       __raw_spin_unlock(&cpu_buffer->lock);
+
+       spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 }
 
 /**
@@ -2118,16 +2238,178 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
        return 0;
 }
 
+static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
+                             struct buffer_data_page *bpage)
+{
+       struct ring_buffer_event *event;
+       unsigned long head;
+
+       __raw_spin_lock(&cpu_buffer->lock);
+       for (head = 0; head < local_read(&bpage->commit);
+            head += rb_event_length(event)) {
+
+               event = __rb_data_page_index(bpage, head);
+               if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
+                       return;
+               /* Only count data entries */
+               if (event->type != RINGBUF_TYPE_DATA)
+                       continue;
+               cpu_buffer->entries--;
+       }
+       __raw_spin_unlock(&cpu_buffer->lock);
+}
+
+/**
+ * ring_buffer_alloc_read_page - allocate a page to read from buffer
+ * @buffer: the buffer to allocate for.
+ *
+ * This function is used in conjunction with ring_buffer_read_page.
+ * When reading a full page from the ring buffer, these functions
+ * can be used to speed up the process. The calling function should
+ * allocate a few pages first with this function. Then when it
+ * needs to get pages from the ring buffer, it passes the result
+ * of this function into ring_buffer_read_page, which will swap
+ * the page that was allocated, with the read page of the buffer.
+ *
+ * Returns:
+ *  The page allocated, or NULL on error.
+ */
+void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
+{
+       unsigned long addr;
+       struct buffer_data_page *bpage;
+
+       addr = __get_free_page(GFP_KERNEL);
+       if (!addr)
+               return NULL;
+
+       bpage = (void *)addr;
+
+       return bpage;
+}
+
+/**
+ * ring_buffer_free_read_page - free an allocated read page
+ * @buffer: the buffer the page was allocate for
+ * @data: the page to free
+ *
+ * Free a page allocated from ring_buffer_alloc_read_page.
+ */
+void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
+{
+       free_page((unsigned long)data);
+}
+
+/**
+ * ring_buffer_read_page - extract a page from the ring buffer
+ * @buffer: buffer to extract from
+ * @data_page: the page to use allocated from ring_buffer_alloc_read_page
+ * @cpu: the cpu of the buffer to extract
+ * @full: should the extraction only happen when the page is full.
+ *
+ * This function will pull out a page from the ring buffer and consume it.
+ * @data_page must be the address of the variable that was returned
+ * from ring_buffer_alloc_read_page. This is because the page might be used
+ * to swap with a page in the ring buffer.
+ *
+ * for example:
+ *     rpage = ring_buffer_alloc_page(buffer);
+ *     if (!rpage)
+ *             return error;
+ *     ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
+ *     if (ret)
+ *             process_page(rpage);
+ *
+ * When @full is set, the function will not return true unless
+ * the writer is off the reader page.
+ *
+ * Note: it is up to the calling functions to handle sleeps and wakeups.
+ *  The ring buffer can be used anywhere in the kernel and can not
+ *  blindly call wake_up. The layer that uses the ring buffer must be
+ *  responsible for that.
+ *
+ * Returns:
+ *  1 if data has been transferred
+ *  0 if no data has been transferred.
+ */
+int ring_buffer_read_page(struct ring_buffer *buffer,
+                           void **data_page, int cpu, int full)
+{
+       struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+       struct ring_buffer_event *event;
+       struct buffer_data_page *bpage;
+       unsigned long flags;
+       int ret = 0;
+
+       if (!data_page)
+               return 0;
+
+       bpage = *data_page;
+       if (!bpage)
+               return 0;
+
+       spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+       /*
+        * rb_buffer_peek will get the next ring buffer if
+        * the current reader page is empty.
+        */
+       event = rb_buffer_peek(buffer, cpu, NULL);
+       if (!event)
+               goto out;
+
+       /* check for data */
+       if (!local_read(&cpu_buffer->reader_page->page->commit))
+               goto out;
+       /*
+        * If the writer is already off of the read page, then simply
+        * switch the read page with the given page. Otherwise
+        * we need to copy the data from the reader to the writer.
+        */
+       if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
+               unsigned int read = cpu_buffer->reader_page->read;
+
+               if (full)
+                       goto out;
+               /* The writer is still on the reader page, we must copy */
+               bpage = cpu_buffer->reader_page->page;
+               memcpy(bpage->data,
+                      cpu_buffer->reader_page->page->data + read,
+                      local_read(&bpage->commit) - read);
+
+               /* consume what was read */
+               cpu_buffer->reader_page += read;
+
+       } else {
+               /* swap the pages */
+               rb_init_page(bpage);
+               bpage = cpu_buffer->reader_page->page;
+               cpu_buffer->reader_page->page = *data_page;
+               cpu_buffer->reader_page->read = 0;
+               *data_page = bpage;
+       }
+       ret = 1;
+
+       /* update the entry counter */
+       rb_remove_entries(cpu_buffer, bpage);
+ out:
+       spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+       return ret;
+}
+
 static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
               size_t cnt, loff_t *ppos)
 {
-       int *p = filp->private_data;
+       long *p = filp->private_data;
        char buf[64];
        int r;
 
-       /* !ring_buffers_off == tracing_on */
-       r = sprintf(buf, "%d\n", !*p);
+       if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
+               r = sprintf(buf, "permanently disabled\n");
+       else
+               r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
 
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
@@ -2136,7 +2418,7 @@ static ssize_t
 rb_simple_write(struct file *filp, const char __user *ubuf,
                size_t cnt, loff_t *ppos)
 {
-       int *p = filp->private_data;
+       long *p = filp->private_data;
        char buf[64];
        long val;
        int ret;
@@ -2153,8 +2435,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
        if (ret < 0)
                return ret;
 
-       /* !ring_buffers_off == tracing_on */
-       *p = !val;
+       if (val)
+               set_bit(RB_BUFFERS_ON_BIT, p);
+       else
+               clear_bit(RB_BUFFERS_ON_BIT, p);
 
        (*ppos)++;
 
@@ -2176,7 +2460,7 @@ static __init int rb_init_debugfs(void)
        d_tracer = tracing_init_dentry();
 
        entry = debugfs_create_file("tracing_on", 0644, d_tracer,
-                                   &ring_buffers_off, &rb_simple_fops);
+                                   &ring_buffer_flags, &rb_simple_fops);
        if (!entry)
                pr_warning("Could not create debugfs 'tracing_on' entry\n");
 
index d86e3252f3000024cfaf31c63ffbee765dafac53..6adf660fc8163c83cafb2fde2bea12fcf80a7bd5 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/kprobes.h>
+#include <linux/seq_file.h>
 #include <linux/writeback.h>
 
 #include <linux/stacktrace.h>
 unsigned long __read_mostly    tracing_max_latency = (cycle_t)ULONG_MAX;
 unsigned long __read_mostly    tracing_thresh;
 
+/*
+ * We need to change this state when a selftest is running.
+ * A selftest will lurk into the ring-buffer to count the
+ * entries inserted during the selftest although some concurrent
+ * insertions into the ring-buffer such as ftrace_printk could occurred
+ * at the same time, giving false positive or negative results.
+ */
+static bool __read_mostly tracing_selftest_running;
+
+/* For tracers that don't implement custom flags */
+static struct tracer_opt dummy_tracer_opt[] = {
+       { }
+};
+
+static struct tracer_flags dummy_tracer_flags = {
+       .val = 0,
+       .opts = dummy_tracer_opt
+};
+
+static int dummy_set_flag(u32 old_flags, u32 bit, int set)
+{
+       return 0;
+}
+
+/*
+ * Kill all tracing for good (never come back).
+ * It is initialized to 1 but will turn to zero if the initialization
+ * of the tracer is successful. But that is the only place that sets
+ * this back to zero.
+ */
+int tracing_disabled = 1;
+
 static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
 
 static inline void ftrace_disable_cpu(void)
@@ -62,7 +95,36 @@ static cpumask_t __read_mostly               tracing_buffer_mask;
 #define for_each_tracing_cpu(cpu)      \
        for_each_cpu_mask(cpu, tracing_buffer_mask)
 
-static int tracing_disabled = 1;
+/*
+ * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
+ *
+ * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
+ * is set, then ftrace_dump is called. This will output the contents
+ * of the ftrace buffers to the console.  This is very useful for
+ * capturing traces that lead to crashes and outputing it to a
+ * serial console.
+ *
+ * It is default off, but you can enable it with either specifying
+ * "ftrace_dump_on_oops" in the kernel command line, or setting
+ * /proc/sys/kernel/ftrace_dump_on_oops to true.
+ */
+int ftrace_dump_on_oops;
+
+static int tracing_set_tracer(char *buf);
+
+static int __init set_ftrace(char *str)
+{
+       tracing_set_tracer(str);
+       return 1;
+}
+__setup("ftrace", set_ftrace);
+
+static int __init set_ftrace_dump_on_oops(char *str)
+{
+       ftrace_dump_on_oops = 1;
+       return 1;
+}
+__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
 
 long
 ns2usecs(cycle_t nsec)
@@ -112,6 +174,19 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
 /* tracer_enabled is used to toggle activation of a tracer */
 static int                     tracer_enabled = 1;
 
+/**
+ * tracing_is_enabled - return tracer_enabled status
+ *
+ * This function is used by other tracers to know the status
+ * of the tracer_enabled flag.  Tracers may use this function
+ * to know if it should enable their features when starting
+ * up. See irqsoff tracer for an example (start_irqsoff_tracer).
+ */
+int tracing_is_enabled(void)
+{
+       return tracer_enabled;
+}
+
 /* function tracing enabled */
 int                            ftrace_function_enabled;
 
@@ -153,8 +228,9 @@ static DEFINE_MUTEX(trace_types_lock);
 /* trace_wait is a waitqueue for tasks blocked on trace_poll */
 static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
-/* trace_flags holds iter_ctrl options */
-unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
+/* trace_flags holds trace_options default values */
+unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
+       TRACE_ITER_ANNOTATE;
 
 /**
  * trace_wake_up - wake up tasks waiting for trace input
@@ -193,13 +269,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
        return nsecs / 1000;
 }
 
-/*
- * TRACE_ITER_SYM_MASK masks the options in trace_flags that
- * control the output of kernel symbols.
- */
-#define TRACE_ITER_SYM_MASK \
-       (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
-
 /* These must match the bit postions in trace_iterator_flags */
 static const char *trace_options[] = {
        "print-parent",
@@ -213,6 +282,11 @@ static const char *trace_options[] = {
        "stacktrace",
        "sched-tree",
        "ftrace_printk",
+       "ftrace_preempt",
+       "branch",
+       "annotate",
+       "userstacktrace",
+       "sym-userobj",
        NULL
 };
 
@@ -359,6 +433,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
        return trace_seq_putmem(s, hex, j);
 }
 
+static int
+trace_seq_path(struct trace_seq *s, struct path *path)
+{
+       unsigned char *p;
+
+       if (s->len >= (PAGE_SIZE - 1))
+               return 0;
+       p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+       if (!IS_ERR(p)) {
+               p = mangle_path(s->buffer + s->len, p, "\n");
+               if (p) {
+                       s->len = p - s->buffer;
+                       return 1;
+               }
+       } else {
+               s->buffer[s->len++] = '?';
+               return 1;
+       }
+
+       return 0;
+}
+
 static void
 trace_seq_reset(struct trace_seq *s)
 {
@@ -470,7 +566,17 @@ int register_tracer(struct tracer *type)
                return -1;
        }
 
+       /*
+        * When this gets called we hold the BKL which means that
+        * preemption is disabled. Various trace selftests however
+        * need to disable and enable preemption for successful tests.
+        * So we drop the BKL here and grab it after the tests again.
+        */
+       unlock_kernel();
        mutex_lock(&trace_types_lock);
+
+       tracing_selftest_running = true;
+
        for (t = trace_types; t; t = t->next) {
                if (strcmp(type->name, t->name) == 0) {
                        /* already found */
@@ -481,12 +587,20 @@ int register_tracer(struct tracer *type)
                }
        }
 
+       if (!type->set_flag)
+               type->set_flag = &dummy_set_flag;
+       if (!type->flags)
+               type->flags = &dummy_tracer_flags;
+       else
+               if (!type->flags->opts)
+                       type->flags->opts = dummy_tracer_opt;
+
 #ifdef CONFIG_FTRACE_STARTUP_TEST
        if (type->selftest) {
                struct tracer *saved_tracer = current_trace;
                struct trace_array *tr = &global_trace;
-               int saved_ctrl = tr->ctrl;
                int i;
+
                /*
                 * Run a selftest on this tracer.
                 * Here we reset the trace buffer, and set the current
@@ -494,25 +608,23 @@ int register_tracer(struct tracer *type)
                 * internal tracing to verify that everything is in order.
                 * If we fail, we do not register this tracer.
                 */
-               for_each_tracing_cpu(i) {
+               for_each_tracing_cpu(i)
                        tracing_reset(tr, i);
-               }
+
                current_trace = type;
-               tr->ctrl = 0;
                /* the test is responsible for initializing and enabling */
                pr_info("Testing tracer %s: ", type->name);
                ret = type->selftest(type, tr);
                /* the test is responsible for resetting too */
                current_trace = saved_tracer;
-               tr->ctrl = saved_ctrl;
                if (ret) {
                        printk(KERN_CONT "FAILED!\n");
                        goto out;
                }
                /* Only reset on passing, to avoid touching corrupted buffers */
-               for_each_tracing_cpu(i) {
+               for_each_tracing_cpu(i)
                        tracing_reset(tr, i);
-               }
+
                printk(KERN_CONT "PASSED\n");
        }
 #endif
@@ -524,7 +636,9 @@ int register_tracer(struct tracer *type)
                max_tracer_type_len = len;
 
  out:
+       tracing_selftest_running = false;
        mutex_unlock(&trace_types_lock);
+       lock_kernel();
 
        return ret;
 }
@@ -581,6 +695,91 @@ static void trace_init_cmdlines(void)
        cmdline_idx = 0;
 }
 
+static int trace_stop_count;
+static DEFINE_SPINLOCK(tracing_start_lock);
+
+/**
+ * ftrace_off_permanent - disable all ftrace code permanently
+ *
+ * This should only be called when a serious anomally has
+ * been detected.  This will turn off the function tracing,
+ * ring buffers, and other tracing utilites. It takes no
+ * locks and can be called from any context.
+ */
+void ftrace_off_permanent(void)
+{
+       tracing_disabled = 1;
+       ftrace_stop();
+       tracing_off_permanent();
+}
+
+/**
+ * tracing_start - quick start of the tracer
+ *
+ * If tracing is enabled but was stopped by tracing_stop,
+ * this will start the tracer back up.
+ */
+void tracing_start(void)
+{
+       struct ring_buffer *buffer;
+       unsigned long flags;
+
+       if (tracing_disabled)
+               return;
+
+       spin_lock_irqsave(&tracing_start_lock, flags);
+       if (--trace_stop_count)
+               goto out;
+
+       if (trace_stop_count < 0) {
+               /* Someone screwed up their debugging */
+               WARN_ON_ONCE(1);
+               trace_stop_count = 0;
+               goto out;
+       }
+
+
+       buffer = global_trace.buffer;
+       if (buffer)
+               ring_buffer_record_enable(buffer);
+
+       buffer = max_tr.buffer;
+       if (buffer)
+               ring_buffer_record_enable(buffer);
+
+       ftrace_start();
+ out:
+       spin_unlock_irqrestore(&tracing_start_lock, flags);
+}
+
+/**
+ * tracing_stop - quick stop of the tracer
+ *
+ * Light weight way to stop tracing. Use in conjunction with
+ * tracing_start.
+ */
+void tracing_stop(void)
+{
+       struct ring_buffer *buffer;
+       unsigned long flags;
+
+       ftrace_stop();
+       spin_lock_irqsave(&tracing_start_lock, flags);
+       if (trace_stop_count++)
+               goto out;
+
+       buffer = global_trace.buffer;
+       if (buffer)
+               ring_buffer_record_disable(buffer);
+
+       buffer = max_tr.buffer;
+       if (buffer)
+               ring_buffer_record_disable(buffer);
+
+ out:
+       spin_unlock_irqrestore(&tracing_start_lock, flags);
+}
+
 void trace_stop_cmdline_recording(void);
 
 static void trace_save_cmdline(struct task_struct *tsk)
@@ -618,7 +817,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
        spin_unlock(&trace_cmdline_lock);
 }
 
-static char *trace_find_cmdline(int pid)
+char *trace_find_cmdline(int pid)
 {
        char *cmdline = "<...>";
        unsigned map;
@@ -655,6 +854,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 
        entry->preempt_count            = pc & 0xff;
        entry->pid                      = (tsk) ? tsk->pid : 0;
+       entry->tgid                     = (tsk) ? tsk->tgid : 0;
        entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
                (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -691,6 +891,56 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 }
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void __trace_graph_entry(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct ftrace_graph_ent *trace,
+                               unsigned long flags,
+                               int pc)
+{
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ent_entry *entry;
+       unsigned long irq_flags;
+
+       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+               return;
+
+       event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type                 = TRACE_GRAPH_ENT;
+       entry->graph_ent                        = *trace;
+       ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+}
+
+static void __trace_graph_return(struct trace_array *tr,
+                               struct trace_array_cpu *data,
+                               struct ftrace_graph_ret *trace,
+                               unsigned long flags,
+                               int pc)
+{
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ret_entry *entry;
+       unsigned long irq_flags;
+
+       if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
+               return;
+
+       event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type                 = TRACE_GRAPH_RET;
+       entry->ret                              = *trace;
+       ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+}
+#endif
+
 void
 ftrace(struct trace_array *tr, struct trace_array_cpu *data,
        unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -742,6 +992,46 @@ void __trace_stack(struct trace_array *tr,
        ftrace_trace_stack(tr, data, flags, skip, preempt_count());
 }
 
+static void ftrace_trace_userstack(struct trace_array *tr,
+                  struct trace_array_cpu *data,
+                  unsigned long flags, int pc)
+{
+#ifdef CONFIG_STACKTRACE
+       struct ring_buffer_event *event;
+       struct userstack_entry *entry;
+       struct stack_trace trace;
+       unsigned long irq_flags;
+
+       if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
+               return;
+
+       event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type         = TRACE_USER_STACK;
+
+       memset(&entry->caller, 0, sizeof(entry->caller));
+
+       trace.nr_entries        = 0;
+       trace.max_entries       = FTRACE_STACK_ENTRIES;
+       trace.skip              = 0;
+       trace.entries           = entry->caller;
+
+       save_stack_trace_user(&trace);
+       ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+#endif
+}
+
+void __trace_userstack(struct trace_array *tr,
+                  struct trace_array_cpu *data,
+                  unsigned long flags)
+{
+       ftrace_trace_userstack(tr, data, flags, preempt_count());
+}
+
 static void
 ftrace_trace_special(void *__tr, void *__data,
                     unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -765,6 +1055,7 @@ ftrace_trace_special(void *__tr, void *__data,
        entry->arg3                     = arg3;
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
        ftrace_trace_stack(tr, data, irq_flags, 4, pc);
+       ftrace_trace_userstack(tr, data, irq_flags, pc);
 
        trace_wake_up();
 }
@@ -803,6 +1094,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
        entry->next_cpu = task_cpu(next);
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
        ftrace_trace_stack(tr, data, flags, 5, pc);
+       ftrace_trace_userstack(tr, data, flags, pc);
 }
 
 void
@@ -832,6 +1124,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
        entry->next_cpu                 = task_cpu(wakee);
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
        ftrace_trace_stack(tr, data, flags, 6, pc);
+       ftrace_trace_userstack(tr, data, flags, pc);
 
        trace_wake_up();
 }
@@ -841,26 +1134,28 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
 {
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
+       unsigned long flags;
        int cpu;
        int pc;
 
-       if (tracing_disabled || !tr->ctrl)
+       if (tracing_disabled)
                return;
 
        pc = preempt_count();
-       preempt_disable_notrace();
+       local_irq_save(flags);
        cpu = raw_smp_processor_id();
        data = tr->data[cpu];
 
-       if (likely(!atomic_read(&data->disabled)))
+       if (likely(atomic_inc_return(&data->disabled) == 1))
                ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
 
-       preempt_enable_notrace();
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
 }
 
 #ifdef CONFIG_FUNCTION_TRACER
 static void
-function_trace_call(unsigned long ip, unsigned long parent_ip)
+function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
 {
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
@@ -873,8 +1168,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
                return;
 
        pc = preempt_count();
-       resched = need_resched();
-       preempt_disable_notrace();
+       resched = ftrace_preempt_disable();
        local_save_flags(flags);
        cpu = raw_smp_processor_id();
        data = tr->data[cpu];
@@ -884,11 +1178,96 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
                trace_function(tr, data, ip, parent_ip, flags, pc);
 
        atomic_dec(&data->disabled);
-       if (resched)
-               preempt_enable_no_resched_notrace();
-       else
-               preempt_enable_notrace();
+       ftrace_preempt_enable(resched);
+}
+
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+
+       if (unlikely(!ftrace_function_enabled))
+               return;
+
+       /*
+        * Need to use raw, since this must be called before the
+        * recursive protection is performed.
+        */
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               trace_function(tr, data, ip, parent_ip, flags, pc);
+       }
+
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+int trace_graph_entry(struct ftrace_graph_ent *trace)
+{
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+
+       if (!ftrace_trace_task(current))
+               return 0;
+
+       if (!ftrace_graph_addr(trace->func))
+               return 0;
+
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               __trace_graph_entry(tr, data, trace, flags, pc);
+       }
+       /* Only do the atomic if it is not already set */
+       if (!test_tsk_trace_graph(current))
+               set_tsk_trace_graph(current);
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
+
+       return 1;
+}
+
+void trace_graph_return(struct ftrace_graph_ret *trace)
+{
+       struct trace_array *tr = &global_trace;
+       struct trace_array_cpu *data;
+       unsigned long flags;
+       long disabled;
+       int cpu;
+       int pc;
+
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       data = tr->data[cpu];
+       disabled = atomic_inc_return(&data->disabled);
+       if (likely(disabled == 1)) {
+               pc = preempt_count();
+               __trace_graph_return(tr, data, trace, flags, pc);
+       }
+       if (!trace->depth)
+               clear_tsk_trace_graph(current);
+       atomic_dec(&data->disabled);
+       local_irq_restore(flags);
 }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 static struct ftrace_ops trace_ops __read_mostly =
 {
@@ -898,9 +1277,14 @@ static struct ftrace_ops trace_ops __read_mostly =
 void tracing_start_function_trace(void)
 {
        ftrace_function_enabled = 0;
+
+       if (trace_flags & TRACE_ITER_PREEMPTONLY)
+               trace_ops.func = function_trace_call_preempt_only;
+       else
+               trace_ops.func = function_trace_call;
+
        register_ftrace_function(&trace_ops);
-       if (tracer_enabled)
-               ftrace_function_enabled = 1;
+       ftrace_function_enabled = 1;
 }
 
 void tracing_stop_function_trace(void)
@@ -912,6 +1296,7 @@ void tracing_stop_function_trace(void)
 
 enum trace_file_type {
        TRACE_FILE_LAT_FMT      = 1,
+       TRACE_FILE_ANNOTATE     = 2,
 };
 
 static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
@@ -1047,10 +1432,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 
        atomic_inc(&trace_record_cmdline_disabled);
 
-       /* let the tracer grab locks here if needed */
-       if (current_trace->start)
-               current_trace->start(iter);
-
        if (*pos != iter->pos) {
                iter->ent = NULL;
                iter->cpu = 0;
@@ -1077,14 +1458,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 
 static void s_stop(struct seq_file *m, void *p)
 {
-       struct trace_iterator *iter = m->private;
-
        atomic_dec(&trace_record_cmdline_disabled);
-
-       /* let the tracer release locks here if needed */
-       if (current_trace && current_trace == iter->trace && iter->trace->stop)
-               iter->trace->stop(iter);
-
        mutex_unlock(&trace_types_lock);
 }
 
@@ -1143,7 +1517,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
 # define IP_FMT "%016lx"
 #endif
 
-static int
+int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 {
        int ret;
@@ -1164,6 +1538,78 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
        return ret;
 }
 
+static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+                                   unsigned long ip, unsigned long sym_flags)
+{
+       struct file *file = NULL;
+       unsigned long vmstart = 0;
+       int ret = 1;
+
+       if (mm) {
+               const struct vm_area_struct *vma;
+
+               down_read(&mm->mmap_sem);
+               vma = find_vma(mm, ip);
+               if (vma) {
+                       file = vma->vm_file;
+                       vmstart = vma->vm_start;
+               }
+               if (file) {
+                       ret = trace_seq_path(s, &file->f_path);
+                       if (ret)
+                               ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
+               }
+               up_read(&mm->mmap_sem);
+       }
+       if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+               ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+       return ret;
+}
+
+static int
+seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+                     unsigned long sym_flags)
+{
+       struct mm_struct *mm = NULL;
+       int ret = 1;
+       unsigned int i;
+
+       if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
+               struct task_struct *task;
+               /*
+                * we do the lookup on the thread group leader,
+                * since individual threads might have already quit!
+                */
+               rcu_read_lock();
+               task = find_task_by_vpid(entry->ent.tgid);
+               if (task)
+                       mm = get_task_mm(task);
+               rcu_read_unlock();
+       }
+
+       for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+               unsigned long ip = entry->caller[i];
+
+               if (ip == ULONG_MAX || !ret)
+                       break;
+               if (i && ret)
+                       ret = trace_seq_puts(s, " <- ");
+               if (!ip) {
+                       if (ret)
+                               ret = trace_seq_puts(s, "??");
+                       continue;
+               }
+               if (!ret)
+                       break;
+               if (ret)
+                       ret = seq_print_user_ip(s, mm, ip, sym_flags);
+       }
+
+       if (mm)
+               mmput(mm);
+       return ret;
+}
+
 static void print_lat_help_header(struct seq_file *m)
 {
        seq_puts(m, "#                  _------=> CPU#            \n");
@@ -1338,6 +1784,23 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
                trace_seq_putc(s, '\n');
 }
 
+static void test_cpu_buff_start(struct trace_iterator *iter)
+{
+       struct trace_seq *s = &iter->seq;
+
+       if (!(trace_flags & TRACE_ITER_ANNOTATE))
+               return;
+
+       if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
+               return;
+
+       if (cpu_isset(iter->cpu, iter->started))
+               return;
+
+       cpu_set(iter->cpu, iter->started);
+       trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
+}
+
 static enum print_line_t
 print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 {
@@ -1357,6 +1820,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
        if (entry->type == TRACE_CONT)
                return TRACE_TYPE_HANDLED;
 
+       test_cpu_buff_start(iter);
+
        next_entry = find_next_entry(iter, NULL, &next_ts);
        if (!next_entry)
                next_ts = iter->ts;
@@ -1448,6 +1913,27 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
                        trace_seq_print_cont(s, iter);
                break;
        }
+       case TRACE_BRANCH: {
+               struct trace_branch *field;
+
+               trace_assign_type(field, entry);
+
+               trace_seq_printf(s, "[%s] %s:%s:%d\n",
+                                field->correct ? "  ok  " : " MISS ",
+                                field->func,
+                                field->file,
+                                field->line);
+               break;
+       }
+       case TRACE_USER_STACK: {
+               struct userstack_entry *field;
+
+               trace_assign_type(field, entry);
+
+               seq_print_userip_objs(field, s, sym_flags);
+               trace_seq_putc(s, '\n');
+               break;
+       }
        default:
                trace_seq_printf(s, "Unknown type %d\n", entry->type);
        }
@@ -1472,6 +1958,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
        if (entry->type == TRACE_CONT)
                return TRACE_TYPE_HANDLED;
 
+       test_cpu_buff_start(iter);
+
        comm = trace_find_cmdline(iter->ent->pid);
 
        t = ns2usecs(iter->ts);
@@ -1581,6 +2069,37 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
                        trace_seq_print_cont(s, iter);
                break;
        }
+       case TRACE_GRAPH_RET: {
+               return print_graph_function(iter);
+       }
+       case TRACE_GRAPH_ENT: {
+               return print_graph_function(iter);
+       }
+       case TRACE_BRANCH: {
+               struct trace_branch *field;
+
+               trace_assign_type(field, entry);
+
+               trace_seq_printf(s, "[%s] %s:%s:%d\n",
+                                field->correct ? "  ok  " : " MISS ",
+                                field->func,
+                                field->file,
+                                field->line);
+               break;
+       }
+       case TRACE_USER_STACK: {
+               struct userstack_entry *field;
+
+               trace_assign_type(field, entry);
+
+               ret = seq_print_userip_objs(field, s, sym_flags);
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+               ret = trace_seq_putc(s, '\n');
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+               break;
+       }
        }
        return TRACE_TYPE_HANDLED;
 }
@@ -1640,6 +2159,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
                break;
        }
        case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
        case TRACE_STACK: {
                struct special_entry *field;
 
@@ -1728,6 +2248,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
                break;
        }
        case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
        case TRACE_STACK: {
                struct special_entry *field;
 
@@ -1782,6 +2303,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
                break;
        }
        case TRACE_SPECIAL:
+       case TRACE_USER_STACK:
        case TRACE_STACK: {
                struct special_entry *field;
 
@@ -1847,7 +2369,9 @@ static int s_show(struct seq_file *m, void *v)
                        seq_printf(m, "# tracer: %s\n", iter->trace->name);
                        seq_puts(m, "#\n");
                }
-               if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+               if (iter->trace && iter->trace->print_header)
+                       iter->trace->print_header(m);
+               else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
                        /* print nothing if the buffers are empty */
                        if (trace_empty(iter))
                                return 0;
@@ -1899,6 +2423,15 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
        iter->trace = current_trace;
        iter->pos = -1;
 
+       /* Notify the tracer early; before we stop tracing. */
+       if (iter->trace && iter->trace->open)
+                       iter->trace->open(iter);
+
+       /* Annotate start of buffers if we had overruns */
+       if (ring_buffer_overruns(iter->tr->buffer))
+               iter->iter_flags |= TRACE_FILE_ANNOTATE;
+
+
        for_each_tracing_cpu(cpu) {
 
                iter->buffer_iter[cpu] =
@@ -1917,13 +2450,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
        m->private = iter;
 
        /* stop the trace while dumping */
-       if (iter->tr->ctrl) {
-               tracer_enabled = 0;
-               ftrace_function_enabled = 0;
-       }
-
-       if (iter->trace && iter->trace->open)
-                       iter->trace->open(iter);
+       tracing_stop();
 
        mutex_unlock(&trace_types_lock);
 
@@ -1966,14 +2493,7 @@ int tracing_release(struct inode *inode, struct file *file)
                iter->trace->close(iter);
 
        /* reenable tracing if it was previously enabled */
-       if (iter->tr->ctrl) {
-               tracer_enabled = 1;
-               /*
-                * It is safe to enable function tracing even if it
-                * isn't used
-                */
-               ftrace_function_enabled = 1;
-       }
+       tracing_start();
        mutex_unlock(&trace_types_lock);
 
        seq_release(inode, file);
@@ -2126,7 +2646,7 @@ tracing_cpumask_read(struct file *filp, char __user *ubuf,
 
        mutex_lock(&tracing_cpumask_update_lock);
 
-       len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
+       len = cpumask_scnprintf(mask_str, count, &tracing_cpumask);
        if (count - len < 2) {
                count = -EINVAL;
                goto out_err;
@@ -2147,11 +2667,11 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
        int err, cpu;
 
        mutex_lock(&tracing_cpumask_update_lock);
-       err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+       err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new);
        if (err)
                goto err_unlock;
 
-       raw_local_irq_disable();
+       local_irq_disable();
        __raw_spin_lock(&ftrace_max_lock);
        for_each_tracing_cpu(cpu) {
                /*
@@ -2168,7 +2688,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
                }
        }
        __raw_spin_unlock(&ftrace_max_lock);
-       raw_local_irq_enable();
+       local_irq_enable();
 
        tracing_cpumask = tracing_cpumask_new;
 
@@ -2189,13 +2709,16 @@ static struct file_operations tracing_cpumask_fops = {
 };
 
 static ssize_t
-tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
+tracing_trace_options_read(struct file *filp, char __user *ubuf,
                       size_t cnt, loff_t *ppos)
 {
+       int i;
        char *buf;
        int r = 0;
        int len = 0;
-       int i;
+       u32 tracer_flags = current_trace->flags->val;
+       struct tracer_opt *trace_opts = current_trace->flags->opts;
+
 
        /* calulate max size */
        for (i = 0; trace_options[i]; i++) {
@@ -2203,6 +2726,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
                len += 3; /* "no" and space */
        }
 
+       /*
+        * Increase the size with names of options specific
+        * of the current tracer.
+        */
+       for (i = 0; trace_opts[i].name; i++) {
+               len += strlen(trace_opts[i].name);
+               len += 3; /* "no" and space */
+       }
+
        /* +2 for \n and \0 */
        buf = kmalloc(len + 2, GFP_KERNEL);
        if (!buf)
@@ -2215,6 +2747,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
                        r += sprintf(buf + r, "no%s ", trace_options[i]);
        }
 
+       for (i = 0; trace_opts[i].name; i++) {
+               if (tracer_flags & trace_opts[i].bit)
+                       r += sprintf(buf + r, "%s ",
+                               trace_opts[i].name);
+               else
+                       r += sprintf(buf + r, "no%s ",
+                               trace_opts[i].name);
+       }
+
        r += sprintf(buf + r, "\n");
        WARN_ON(r >= len + 2);
 
@@ -2225,13 +2766,48 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
        return r;
 }
 
+/* Try to assign a tracer specific option */
+static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
+{
+       struct tracer_flags *trace_flags = trace->flags;
+       struct tracer_opt *opts = NULL;
+       int ret = 0, i = 0;
+       int len;
+
+       for (i = 0; trace_flags->opts[i].name; i++) {
+               opts = &trace_flags->opts[i];
+               len = strlen(opts->name);
+
+               if (strncmp(cmp, opts->name, len) == 0) {
+                       ret = trace->set_flag(trace_flags->val,
+                               opts->bit, !neg);
+                       break;
+               }
+       }
+       /* Not found */
+       if (!trace_flags->opts[i].name)
+               return -EINVAL;
+
+       /* Refused to handle */
+       if (ret)
+               return ret;
+
+       if (neg)
+               trace_flags->val &= ~opts->bit;
+       else
+               trace_flags->val |= opts->bit;
+
+       return 0;
+}
+
 static ssize_t
-tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
+tracing_trace_options_write(struct file *filp, const char __user *ubuf,
                        size_t cnt, loff_t *ppos)
 {
        char buf[64];
        char *cmp = buf;
        int neg = 0;
+       int ret;
        int i;
 
        if (cnt >= sizeof(buf))
@@ -2258,11 +2834,13 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
                        break;
                }
        }
-       /*
-        * If no option could be set, return an error:
-        */
-       if (!trace_options[i])
-               return -EINVAL;
+
+       /* If no option could be set, test the specific tracer options */
+       if (!trace_options[i]) {
+               ret = set_tracer_option(current_trace, cmp, neg);
+               if (ret)
+                       return ret;
+       }
 
        filp->f_pos += cnt;
 
@@ -2271,8 +2849,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
 
 static struct file_operations tracing_iter_fops = {
        .open           = tracing_open_generic,
-       .read           = tracing_iter_ctrl_read,
-       .write          = tracing_iter_ctrl_write,
+       .read           = tracing_trace_options_read,
+       .write          = tracing_trace_options_write,
 };
 
 static const char readme_msg[] =
@@ -2286,9 +2864,9 @@ static const char readme_msg[] =
        "# echo sched_switch > /debug/tracing/current_tracer\n"
        "# cat /debug/tracing/current_tracer\n"
        "sched_switch\n"
-       "# cat /debug/tracing/iter_ctrl\n"
+       "# cat /debug/tracing/trace_options\n"
        "noprint-parent nosym-offset nosym-addr noverbose\n"
-       "# echo print-parent > /debug/tracing/iter_ctrl\n"
+       "# echo print-parent > /debug/tracing/trace_options\n"
        "# echo 1 > /debug/tracing/tracing_enabled\n"
        "# cat /debug/tracing/trace > /tmp/trace.txt\n"
        "echo 0 > /debug/tracing/tracing_enabled\n"
@@ -2311,11 +2889,10 @@ static ssize_t
 tracing_ctrl_read(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
 {
-       struct trace_array *tr = filp->private_data;
        char buf[64];
        int r;
 
-       r = sprintf(buf, "%ld\n", tr->ctrl);
+       r = sprintf(buf, "%u\n", tracer_enabled);
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
@@ -2343,16 +2920,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
        val = !!val;
 
        mutex_lock(&trace_types_lock);
-       if (tr->ctrl ^ val) {
-               if (val)
+       if (tracer_enabled ^ val) {
+               if (val) {
                        tracer_enabled = 1;
-               else
+                       if (current_trace->start)
+                               current_trace->start(tr);
+                       tracing_start();
+               } else {
                        tracer_enabled = 0;
-
-               tr->ctrl = val;
-
-               if (current_trace && current_trace->ctrl_update)
-                       current_trace->ctrl_update(tr);
+                       tracing_stop();
+                       if (current_trace->stop)
+                               current_trace->stop(tr);
+               }
        }
        mutex_unlock(&trace_types_lock);
 
@@ -2378,29 +2957,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
-static ssize_t
-tracing_set_trace_write(struct file *filp, const char __user *ubuf,
-                       size_t cnt, loff_t *ppos)
+static int tracing_set_tracer(char *buf)
 {
        struct trace_array *tr = &global_trace;
        struct tracer *t;
-       char buf[max_tracer_type_len+1];
-       int i;
-       size_t ret;
-
-       ret = cnt;
-
-       if (cnt > max_tracer_type_len)
-               cnt = max_tracer_type_len;
-
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-
-       buf[cnt] = 0;
-
-       /* strip ending whitespace. */
-       for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
-               buf[i] = 0;
+       int ret = 0;
 
        mutex_lock(&trace_types_lock);
        for (t = trace_types; t; t = t->next) {
@@ -2414,18 +2975,52 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
        if (t == current_trace)
                goto out;
 
+       trace_branch_disable();
        if (current_trace && current_trace->reset)
                current_trace->reset(tr);
 
        current_trace = t;
-       if (t->init)
-               t->init(tr);
+       if (t->init) {
+               ret = t->init(tr);
+               if (ret)
+                       goto out;
+       }
 
+       trace_branch_enable(tr);
  out:
        mutex_unlock(&trace_types_lock);
 
-       if (ret > 0)
-               filp->f_pos += ret;
+       return ret;
+}
+
+static ssize_t
+tracing_set_trace_write(struct file *filp, const char __user *ubuf,
+                       size_t cnt, loff_t *ppos)
+{
+       char buf[max_tracer_type_len+1];
+       int i;
+       size_t ret;
+       int err;
+
+       ret = cnt;
+
+       if (cnt > max_tracer_type_len)
+               cnt = max_tracer_type_len;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       /* strip ending whitespace. */
+       for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
+               buf[i] = 0;
+
+       err = tracing_set_tracer(buf);
+       if (err)
+               return err;
+
+       filp->f_pos += ret;
 
        return ret;
 }
@@ -2492,6 +3087,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
                return -ENOMEM;
 
        mutex_lock(&trace_types_lock);
+
+       /* trace pipe does not show start of buffer */
+       cpus_setall(iter->started);
+
        iter->tr = &global_trace;
        iter->trace = current_trace;
        filp->private_data = iter;
@@ -2667,7 +3266,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
        char buf[64];
        int r;
 
-       r = sprintf(buf, "%lu\n", tr->entries);
+       r = sprintf(buf, "%lu\n", tr->entries >> 10);
        return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
@@ -2678,7 +3277,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
        unsigned long val;
        char buf[64];
        int ret, cpu;
-       struct trace_array *tr = filp->private_data;
 
        if (cnt >= sizeof(buf))
                return -EINVAL;
@@ -2698,12 +3296,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 
        mutex_lock(&trace_types_lock);
 
-       if (tr->ctrl) {
-               cnt = -EBUSY;
-               pr_info("ftrace: please disable tracing"
-                       " before modifying buffer size\n");
-               goto out;
-       }
+       tracing_stop();
 
        /* disable all cpu buffers */
        for_each_tracing_cpu(cpu) {
@@ -2713,6 +3306,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
                        atomic_inc(&max_tr.data[cpu]->disabled);
        }
 
+       /* value is in KB */
+       val <<= 10;
+
        if (val != global_trace.entries) {
                ret = ring_buffer_resize(global_trace.buffer, val);
                if (ret < 0) {
@@ -2751,6 +3347,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
                        atomic_dec(&max_tr.data[cpu]->disabled);
        }
 
+       tracing_start();
        max_tr.entries = global_trace.entries;
        mutex_unlock(&trace_types_lock);
 
@@ -2762,7 +3359,7 @@ static int mark_printk(const char *fmt, ...)
        int ret;
        va_list args;
        va_start(args, fmt);
-       ret = trace_vprintk(0, fmt, args);
+       ret = trace_vprintk(0, -1, fmt, args);
        va_end(args);
        return ret;
 }
@@ -2773,9 +3370,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 {
        char *buf;
        char *end;
-       struct trace_array *tr = &global_trace;
 
-       if (!tr->ctrl || tracing_disabled)
+       if (tracing_disabled)
                return -EINVAL;
 
        if (cnt > TRACE_BUF_SIZE)
@@ -2841,22 +3437,38 @@ static struct file_operations tracing_mark_fops = {
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
+int __weak ftrace_arch_read_dyn_info(char *buf, int size)
+{
+       return 0;
+}
+
 static ssize_t
-tracing_read_long(struct file *filp, char __user *ubuf,
+tracing_read_dyn_info(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
 {
+       static char ftrace_dyn_info_buffer[1024];
+       static DEFINE_MUTEX(dyn_info_mutex);
        unsigned long *p = filp->private_data;
-       char buf[64];
+       char *buf = ftrace_dyn_info_buffer;
+       int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
        int r;
 
-       r = sprintf(buf, "%ld\n", *p);
+       mutex_lock(&dyn_info_mutex);
+       r = sprintf(buf, "%ld ", *p);
 
-       return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+       r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
+       buf[r++] = '\n';
+
+       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+
+       mutex_unlock(&dyn_info_mutex);
+
+       return r;
 }
 
-static struct file_operations tracing_read_long_fops = {
+static struct file_operations tracing_dyn_info_fops = {
        .open           = tracing_open_generic,
-       .read           = tracing_read_long,
+       .read           = tracing_read_dyn_info,
 };
 #endif
 
@@ -2897,10 +3509,10 @@ static __init int tracer_init_debugfs(void)
        if (!entry)
                pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
 
-       entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
+       entry = debugfs_create_file("trace_options", 0644, d_tracer,
                                    NULL, &tracing_iter_fops);
        if (!entry)
-               pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
+               pr_warning("Could not create debugfs 'trace_options' entry\n");
 
        entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
                                    NULL, &tracing_cpumask_fops);
@@ -2950,11 +3562,11 @@ static __init int tracer_init_debugfs(void)
                pr_warning("Could not create debugfs "
                           "'trace_pipe' entry\n");
 
-       entry = debugfs_create_file("trace_entries", 0644, d_tracer,
+       entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
                                    &global_trace, &tracing_entries_fops);
        if (!entry)
                pr_warning("Could not create debugfs "
-                          "'trace_entries' entry\n");
+                          "'buffer_size_kb' entry\n");
 
        entry = debugfs_create_file("trace_marker", 0220, d_tracer,
                                    NULL, &tracing_mark_fops);
@@ -2965,7 +3577,7 @@ static __init int tracer_init_debugfs(void)
 #ifdef CONFIG_DYNAMIC_FTRACE
        entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
                                    &ftrace_update_tot_cnt,
-                                   &tracing_read_long_fops);
+                                   &tracing_dyn_info_fops);
        if (!entry)
                pr_warning("Could not create debugfs "
                           "'dyn_ftrace_total_info' entry\n");
@@ -2976,7 +3588,7 @@ static __init int tracer_init_debugfs(void)
        return 0;
 }
 
-int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
+int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 {
        static DEFINE_SPINLOCK(trace_buf_lock);
        static char trace_buf[TRACE_BUF_SIZE];
@@ -2984,11 +3596,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
        struct ring_buffer_event *event;
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
-       struct print_entry *entry;
-       unsigned long flags, irq_flags;
        int cpu, len = 0, size, pc;
+       struct print_entry *entry;
+       unsigned long irq_flags;
 
-       if (!tr->ctrl || tracing_disabled)
+       if (tracing_disabled || tracing_selftest_running)
                return 0;
 
        pc = preempt_count();
@@ -2999,7 +3611,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
        if (unlikely(atomic_read(&data->disabled)))
                goto out;
 
-       spin_lock_irqsave(&trace_buf_lock, flags);
+       pause_graph_tracing();
+       spin_lock_irqsave(&trace_buf_lock, irq_flags);
        len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
 
        len = min(len, TRACE_BUF_SIZE-1);
@@ -3010,17 +3623,18 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
        if (!event)
                goto out_unlock;
        entry = ring_buffer_event_data(event);
-       tracing_generic_entry_update(&entry->ent, flags, pc);
+       tracing_generic_entry_update(&entry->ent, irq_flags, pc);
        entry->ent.type                 = TRACE_PRINT;
        entry->ip                       = ip;
+       entry->depth                    = depth;
 
        memcpy(&entry->buf, trace_buf, len);
        entry->buf[len] = 0;
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 
  out_unlock:
-       spin_unlock_irqrestore(&trace_buf_lock, flags);
-
+       spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
+       unpause_graph_tracing();
  out:
        preempt_enable_notrace();
 
@@ -3037,7 +3651,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
                return 0;
 
        va_start(ap, fmt);
-       ret = trace_vprintk(ip, fmt, ap);
+       ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
        va_end(ap);
        return ret;
 }
@@ -3046,7 +3660,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk);
 static int trace_panic_handler(struct notifier_block *this,
                               unsigned long event, void *unused)
 {
-       ftrace_dump();
+       if (ftrace_dump_on_oops)
+               ftrace_dump();
        return NOTIFY_OK;
 }
 
@@ -3062,7 +3677,8 @@ static int trace_die_handler(struct notifier_block *self,
 {
        switch (val) {
        case DIE_OOPS:
-               ftrace_dump();
+               if (ftrace_dump_on_oops)
+                       ftrace_dump();
                break;
        default:
                break;
@@ -3103,7 +3719,6 @@ trace_printk_seq(struct trace_seq *s)
        trace_seq_reset(s);
 }
 
-
 void ftrace_dump(void)
 {
        static DEFINE_SPINLOCK(ftrace_dump_lock);
@@ -3128,6 +3743,9 @@ void ftrace_dump(void)
                atomic_inc(&global_trace.data[cpu]->disabled);
        }
 
+       /* don't look at user memory in panic mode */
+       trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+
        printk(KERN_TRACE "Dumping ftrace buffer:\n");
 
        iter.tr = &global_trace;
@@ -3221,7 +3839,6 @@ __init static int tracer_alloc_buffers(void)
 #endif
 
        /* All seems OK, enable tracing */
-       global_trace.ctrl = tracer_enabled;
        tracing_disabled = 0;
 
        atomic_notifier_chain_register(&panic_notifier_list,
index 8465ad052707afe380e2fba0017c0f37fb1d5093..5ac697065a48cb7bd2c08adbdf98d6ea3c42d93e 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/ring_buffer.h>
 #include <linux/mmiotrace.h>
 #include <linux/ftrace.h>
+#include <trace/boot.h>
 
 enum trace_type {
        __TRACE_FIRST_TYPE = 0,
@@ -21,7 +22,14 @@ enum trace_type {
        TRACE_SPECIAL,
        TRACE_MMIO_RW,
        TRACE_MMIO_MAP,
-       TRACE_BOOT,
+       TRACE_BRANCH,
+       TRACE_BOOT_CALL,
+       TRACE_BOOT_RET,
+       TRACE_GRAPH_RET,
+       TRACE_GRAPH_ENT,
+       TRACE_USER_STACK,
+       TRACE_BTS,
+       TRACE_POWER,
 
        __TRACE_LAST_TYPE
 };
@@ -38,6 +46,7 @@ struct trace_entry {
        unsigned char           flags;
        unsigned char           preempt_count;
        int                     pid;
+       int                     tgid;
 };
 
 /*
@@ -48,6 +57,18 @@ struct ftrace_entry {
        unsigned long           ip;
        unsigned long           parent_ip;
 };
+
+/* Function call entry */
+struct ftrace_graph_ent_entry {
+       struct trace_entry                      ent;
+       struct ftrace_graph_ent         graph_ent;
+};
+
+/* Function return entry */
+struct ftrace_graph_ret_entry {
+       struct trace_entry                      ent;
+       struct ftrace_graph_ret         ret;
+};
 extern struct tracer boot_tracer;
 
 /*
@@ -85,12 +106,18 @@ struct stack_entry {
        unsigned long           caller[FTRACE_STACK_ENTRIES];
 };
 
+struct userstack_entry {
+       struct trace_entry      ent;
+       unsigned long           caller[FTRACE_STACK_ENTRIES];
+};
+
 /*
  * ftrace_printk entry:
  */
 struct print_entry {
        struct trace_entry      ent;
        unsigned long           ip;
+       int                     depth;
        char                    buf[];
 };
 
@@ -112,9 +139,35 @@ struct trace_mmiotrace_map {
        struct mmiotrace_map    map;
 };
 
-struct trace_boot {
+struct trace_boot_call {
        struct trace_entry      ent;
-       struct boot_trace       initcall;
+       struct boot_trace_call boot_call;
+};
+
+struct trace_boot_ret {
+       struct trace_entry      ent;
+       struct boot_trace_ret boot_ret;
+};
+
+#define TRACE_FUNC_SIZE 30
+#define TRACE_FILE_SIZE 20
+struct trace_branch {
+       struct trace_entry      ent;
+       unsigned                line;
+       char                    func[TRACE_FUNC_SIZE+1];
+       char                    file[TRACE_FILE_SIZE+1];
+       char                    correct;
+};
+
+struct bts_entry {
+       struct trace_entry      ent;
+       unsigned long           from;
+       unsigned long           to;
+};
+
+struct trace_power {
+       struct trace_entry      ent;
+       struct power_trace      state_data;
 };
 
 /*
@@ -172,7 +225,6 @@ struct trace_iterator;
 struct trace_array {
        struct ring_buffer      *buffer;
        unsigned long           entries;
-       long                    ctrl;
        int                     cpu;
        cycle_t                 time_start;
        struct task_struct      *waiter;
@@ -212,13 +264,22 @@ extern void __ftrace_bad_type(void);
                IF_ASSIGN(var, ent, struct ctx_switch_entry, 0);        \
                IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
                IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);   \
+               IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
                IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);   \
                IF_ASSIGN(var, ent, struct special_entry, 0);           \
                IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,          \
                          TRACE_MMIO_RW);                               \
                IF_ASSIGN(var, ent, struct trace_mmiotrace_map,         \
                          TRACE_MMIO_MAP);                              \
-               IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT);     \
+               IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
+               IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
+               IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
+               IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry,      \
+                         TRACE_GRAPH_ENT);             \
+               IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,      \
+                         TRACE_GRAPH_RET);             \
+               IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
+               IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
                __ftrace_bad_type();                                    \
        } while (0)
 
@@ -229,29 +290,56 @@ enum print_line_t {
        TRACE_TYPE_UNHANDLED    = 2     /* Relay to other output functions */
 };
 
+
+/*
+ * An option specific to a tracer. This is a boolean value.
+ * The bit is the bit index that sets its value on the
+ * flags value in struct tracer_flags.
+ */
+struct tracer_opt {
+       const char      *name; /* Will appear on the trace_options file */
+       u32             bit; /* Mask assigned in val field in tracer_flags */
+};
+
+/*
+ * The set of specific options for a tracer. Your tracer
+ * have to set the initial value of the flags val.
+ */
+struct tracer_flags {
+       u32                     val;
+       struct tracer_opt       *opts;
+};
+
+/* Makes more easy to define a tracer opt */
+#define TRACER_OPT(s, b)       .name = #s, .bit = b
+
 /*
  * A specific tracer, represented by methods that operate on a trace array:
  */
 struct tracer {
        const char              *name;
-       void                    (*init)(struct trace_array *tr);
+       /* Your tracer should raise a warning if init fails */
+       int                     (*init)(struct trace_array *tr);
        void                    (*reset)(struct trace_array *tr);
+       void                    (*start)(struct trace_array *tr);
+       void                    (*stop)(struct trace_array *tr);
        void                    (*open)(struct trace_iterator *iter);
        void                    (*pipe_open)(struct trace_iterator *iter);
        void                    (*close)(struct trace_iterator *iter);
-       void                    (*start)(struct trace_iterator *iter);
-       void                    (*stop)(struct trace_iterator *iter);
        ssize_t                 (*read)(struct trace_iterator *iter,
                                        struct file *filp, char __user *ubuf,
                                        size_t cnt, loff_t *ppos);
-       void                    (*ctrl_update)(struct trace_array *tr);
 #ifdef CONFIG_FTRACE_STARTUP_TEST
        int                     (*selftest)(struct tracer *trace,
                                            struct trace_array *tr);
 #endif
+       void                    (*print_header)(struct seq_file *m);
        enum print_line_t       (*print_line)(struct trace_iterator *iter);
+       /* If you handled the flag setting, return 0 */
+       int                     (*set_flag)(u32 old_flags, u32 bit, int set);
        struct tracer           *next;
        int                     print_max;
+       struct tracer_flags     *flags;
 };
 
 struct trace_seq {
@@ -279,8 +367,11 @@ struct trace_iterator {
        unsigned long           iter_flags;
        loff_t                  pos;
        long                    idx;
+
+       cpumask_t               started;
 };
 
+int tracing_is_enabled(void);
 void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
 int tracing_open_generic(struct inode *inode, struct file *filp);
@@ -321,8 +412,17 @@ void trace_function(struct trace_array *tr,
                    unsigned long parent_ip,
                    unsigned long flags, int pc);
 
+void trace_graph_return(struct ftrace_graph_ret *trace);
+int trace_graph_entry(struct ftrace_graph_ent *trace);
+void trace_bts(struct trace_array *tr,
+              unsigned long from,
+              unsigned long to);
+
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
+void tracing_sched_switch_assign_trace(struct trace_array *tr);
+void tracing_stop_sched_switch_record(void);
+void tracing_start_sched_switch_record(void);
 int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 
@@ -358,6 +458,7 @@ struct tracer_switch_ops {
        struct tracer_switch_ops        *next;
 };
 
+char *trace_find_cmdline(int pid);
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -383,19 +484,79 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
                                               struct trace_array *tr);
 extern int trace_selftest_startup_sysprof(struct tracer *trace,
                                               struct trace_array *tr);
+extern int trace_selftest_startup_branch(struct tracer *trace,
+                                        struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
 extern void trace_seq_print_cont(struct trace_seq *s,
                                 struct trace_iterator *iter);
+
+extern int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
+               unsigned long sym_flags);
 extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
                                 size_t cnt);
 extern long ns2usecs(cycle_t nsec);
-extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
+extern int
+trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
 
 extern unsigned long trace_flags;
 
+/* Standard output formatting function used for function return traces */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/* TODO: make this variable */
+#define FTRACE_GRAPH_MAX_FUNCS         32
+extern int ftrace_graph_count;
+extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
+
+static inline int ftrace_graph_addr(unsigned long addr)
+{
+       int i;
+
+       if (!ftrace_graph_count || test_tsk_trace_graph(current))
+               return 1;
+
+       for (i = 0; i < ftrace_graph_count; i++) {
+               if (addr == ftrace_graph_funcs[i])
+                       return 1;
+       }
+
+       return 0;
+}
+#else
+static inline int ftrace_trace_addr(unsigned long addr)
+{
+       return 1;
+}
+static inline int ftrace_graph_addr(unsigned long addr)
+{
+       return 1;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#else /* CONFIG_FUNCTION_GRAPH_TRACER */
+static inline enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+       return TRACE_TYPE_UNHANDLED;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+extern struct pid *ftrace_pid_trace;
+
+static inline int ftrace_trace_task(struct task_struct *task)
+{
+       if (!ftrace_pid_trace)
+               return 1;
+
+       return test_tsk_trace_trace(task);
+}
+
 /*
  * trace_iterator_flags is an enumeration that defines bit
  * positions into trace_flags that controls the output.
@@ -415,8 +576,92 @@ enum trace_iterator_flags {
        TRACE_ITER_STACKTRACE           = 0x100,
        TRACE_ITER_SCHED_TREE           = 0x200,
        TRACE_ITER_PRINTK               = 0x400,
+       TRACE_ITER_PREEMPTONLY          = 0x800,
+       TRACE_ITER_BRANCH               = 0x1000,
+       TRACE_ITER_ANNOTATE             = 0x2000,
+       TRACE_ITER_USERSTACKTRACE       = 0x4000,
+       TRACE_ITER_SYM_USEROBJ          = 0x8000
 };
 
+/*
+ * TRACE_ITER_SYM_MASK masks the options in trace_flags that
+ * control the output of kernel symbols.
+ */
+#define TRACE_ITER_SYM_MASK \
+       (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
+
 extern struct tracer nop_trace;
 
+/**
+ * ftrace_preempt_disable - disable preemption scheduler safe
+ *
+ * When tracing can happen inside the scheduler, there exists
+ * cases that the tracing might happen before the need_resched
+ * flag is checked. If this happens and the tracer calls
+ * preempt_enable (after a disable), a schedule might take place
+ * causing an infinite recursion.
+ *
+ * To prevent this, we read the need_recshed flag before
+ * disabling preemption. When we want to enable preemption we
+ * check the flag, if it is set, then we call preempt_enable_no_resched.
+ * Otherwise, we call preempt_enable.
+ *
+ * The rational for doing the above is that if need resched is set
+ * and we have yet to reschedule, we are either in an atomic location
+ * (where we do not need to check for scheduling) or we are inside
+ * the scheduler and do not want to resched.
+ */
+static inline int ftrace_preempt_disable(void)
+{
+       int resched;
+
+       resched = need_resched();
+       preempt_disable_notrace();
+
+       return resched;
+}
+
+/**
+ * ftrace_preempt_enable - enable preemption scheduler safe
+ * @resched: the return value from ftrace_preempt_disable
+ *
+ * This is a scheduler safe way to enable preemption and not miss
+ * any preemption checks. The disabled saved the state of preemption.
+ * If resched is set, then we were either inside an atomic or
+ * are inside the scheduler (we would have already scheduled
+ * otherwise). In this case, we do not want to call normal
+ * preempt_enable, but preempt_enable_no_resched instead.
+ */
+static inline void ftrace_preempt_enable(int resched)
+{
+       if (resched)
+               preempt_enable_no_resched_notrace();
+       else
+               preempt_enable_notrace();
+}
+
+#ifdef CONFIG_BRANCH_TRACER
+extern int enable_branch_tracing(struct trace_array *tr);
+extern void disable_branch_tracing(void);
+static inline int trace_branch_enable(struct trace_array *tr)
+{
+       if (trace_flags & TRACE_ITER_BRANCH)
+               return enable_branch_tracing(tr);
+       return 0;
+}
+static inline void trace_branch_disable(void)
+{
+       /* due to races, always disable */
+       disable_branch_tracing();
+}
+#else
+static inline int trace_branch_enable(struct trace_array *tr)
+{
+       return 0;
+}
+static inline void trace_branch_disable(void)
+{
+}
+#endif /* CONFIG_BRANCH_TRACER */
+
 #endif /* _LINUX_KERNEL_TRACE_H */
index d0a5e50eeff26d17405ffa4cb62cbde710f9de05..a4fa2c57e34e376e0f58920e816db6c159530bcb 100644 (file)
 #include "trace.h"
 
 static struct trace_array *boot_trace;
-static int trace_boot_enabled;
+static bool pre_initcalls_finished;
 
-
-/* Should be started after do_pre_smp_initcalls() in init/main.c */
+/* Tells the boot tracer that the pre_smp_initcalls are finished.
+ * So we are ready .
+ * It doesn't enable sched events tracing however.
+ * You have to call enable_boot_trace to do so.
+ */
 void start_boot_trace(void)
 {
-       trace_boot_enabled = 1;
+       pre_initcalls_finished = true;
 }
 
-void stop_boot_trace(void)
+void enable_boot_trace(void)
 {
-       trace_boot_enabled = 0;
+       if (pre_initcalls_finished)
+               tracing_start_sched_switch_record();
 }
 
-void reset_boot_trace(struct trace_array *tr)
+void disable_boot_trace(void)
 {
-       stop_boot_trace();
+       if (pre_initcalls_finished)
+               tracing_stop_sched_switch_record();
 }
 
-static void boot_trace_init(struct trace_array *tr)
+static void reset_boot_trace(struct trace_array *tr)
 {
        int cpu;
-       boot_trace = tr;
 
-       trace_boot_enabled = 0;
+       tr->time_start = ftrace_now(tr->cpu);
+
+       for_each_online_cpu(cpu)
+               tracing_reset(tr, cpu);
+}
+
+static int boot_trace_init(struct trace_array *tr)
+{
+       int cpu;
+       boot_trace = tr;
 
        for_each_cpu_mask(cpu, cpu_possible_map)
                tracing_reset(tr, cpu);
+
+       tracing_sched_switch_assign_trace(tr);
+       return 0;
 }
 
-static void boot_trace_ctrl_update(struct trace_array *tr)
+static enum print_line_t
+initcall_call_print_line(struct trace_iterator *iter)
 {
-       if (tr->ctrl)
-               start_boot_trace();
+       struct trace_entry *entry = iter->ent;
+       struct trace_seq *s = &iter->seq;
+       struct trace_boot_call *field;
+       struct boot_trace_call *call;
+       u64 ts;
+       unsigned long nsec_rem;
+       int ret;
+
+       trace_assign_type(field, entry);
+       call = &field->boot_call;
+       ts = iter->ts;
+       nsec_rem = do_div(ts, 1000000000);
+
+       ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
+                       (unsigned long)ts, nsec_rem, call->func, call->caller);
+
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
        else
-               stop_boot_trace();
+               return TRACE_TYPE_HANDLED;
 }
 
-static enum print_line_t initcall_print_line(struct trace_iterator *iter)
+static enum print_line_t
+initcall_ret_print_line(struct trace_iterator *iter)
 {
-       int ret;
        struct trace_entry *entry = iter->ent;
-       struct trace_boot *field = (struct trace_boot *)entry;
-       struct boot_trace *it = &field->initcall;
        struct trace_seq *s = &iter->seq;
-       struct timespec calltime = ktime_to_timespec(it->calltime);
-       struct timespec rettime = ktime_to_timespec(it->rettime);
-
-       if (entry->type == TRACE_BOOT) {
-               ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
-                                         calltime.tv_sec,
-                                         calltime.tv_nsec,
-                                         it->func, it->caller);
-               if (!ret)
-                       return TRACE_TYPE_PARTIAL_LINE;
-
-               ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
-                                         "returned %d after %lld msecs\n",
-                                         rettime.tv_sec,
-                                         rettime.tv_nsec,
-                                         it->func, it->result, it->duration);
-
-               if (!ret)
-                       return TRACE_TYPE_PARTIAL_LINE;
+       struct trace_boot_ret *field;
+       struct boot_trace_ret *init_ret;
+       u64 ts;
+       unsigned long nsec_rem;
+       int ret;
+
+       trace_assign_type(field, entry);
+       init_ret = &field->boot_ret;
+       ts = iter->ts;
+       nsec_rem = do_div(ts, 1000000000);
+
+       ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
+                       "returned %d after %llu msecs\n",
+                       (unsigned long) ts,
+                       nsec_rem,
+                       init_ret->func, init_ret->result, init_ret->duration);
+
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+       else
                return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t initcall_print_line(struct trace_iterator *iter)
+{
+       struct trace_entry *entry = iter->ent;
+
+       switch (entry->type) {
+       case TRACE_BOOT_CALL:
+               return initcall_call_print_line(iter);
+       case TRACE_BOOT_RET:
+               return initcall_ret_print_line(iter);
+       default:
+               return TRACE_TYPE_UNHANDLED;
        }
-       return TRACE_TYPE_UNHANDLED;
 }
 
 struct tracer boot_tracer __read_mostly =
@@ -87,27 +131,53 @@ struct tracer boot_tracer __read_mostly =
        .name           = "initcall",
        .init           = boot_trace_init,
        .reset          = reset_boot_trace,
-       .ctrl_update    = boot_trace_ctrl_update,
        .print_line     = initcall_print_line,
 };
 
-void trace_boot(struct boot_trace *it, initcall_t fn)
+void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 {
        struct ring_buffer_event *event;
-       struct trace_boot *entry;
-       struct trace_array_cpu *data;
+       struct trace_boot_call *entry;
        unsigned long irq_flags;
        struct trace_array *tr = boot_trace;
 
-       if (!trace_boot_enabled)
+       if (!pre_initcalls_finished)
                return;
 
        /* Get its name now since this function could
         * disappear because it is in the .init section.
         */
-       sprint_symbol(it->func, (unsigned long)fn);
+       sprint_symbol(bt->func, (unsigned long)fn);
+       preempt_disable();
+
+       event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               goto out;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, 0, 0);
+       entry->ent.type = TRACE_BOOT_CALL;
+       entry->boot_call = *bt;
+       ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+       trace_wake_up();
+
+ out:
+       preempt_enable();
+}
+
+void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
+{
+       struct ring_buffer_event *event;
+       struct trace_boot_ret *entry;
+       unsigned long irq_flags;
+       struct trace_array *tr = boot_trace;
+
+       if (!pre_initcalls_finished)
+               return;
+
+       sprint_symbol(bt->func, (unsigned long)fn);
        preempt_disable();
-       data = tr->data[smp_processor_id()];
 
        event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
                                         &irq_flags);
@@ -115,8 +185,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
                goto out;
        entry   = ring_buffer_event_data(event);
        tracing_generic_entry_update(&entry->ent, 0, 0);
-       entry->ent.type = TRACE_BOOT;
-       entry->initcall = *it;
+       entry->ent.type = TRACE_BOOT_RET;
+       entry->boot_ret = *bt;
        ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
 
        trace_wake_up();
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
new file mode 100644 (file)
index 0000000..6c00feb
--- /dev/null
@@ -0,0 +1,342 @@
+/*
+ * unlikely profiler
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/irqflags.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <asm/local.h>
+#include "trace.h"
+
+#ifdef CONFIG_BRANCH_TRACER
+
+static int branch_tracing_enabled __read_mostly;
+static DEFINE_MUTEX(branch_tracing_mutex);
+static struct trace_array *branch_tracer;
+
+static void
+probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+{
+       struct trace_array *tr = branch_tracer;
+       struct ring_buffer_event *event;
+       struct trace_branch *entry;
+       unsigned long flags, irq_flags;
+       int cpu, pc;
+       const char *p;
+
+       /*
+        * I would love to save just the ftrace_likely_data pointer, but
+        * this code can also be used by modules. Ugly things can happen
+        * if the module is unloaded, and then we go and read the
+        * pointer.  This is slower, but much safer.
+        */
+
+       if (unlikely(!tr))
+               return;
+
+       local_irq_save(flags);
+       cpu = raw_smp_processor_id();
+       if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
+               goto out;
+
+       event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               goto out;
+
+       pc = preempt_count();
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, flags, pc);
+       entry->ent.type         = TRACE_BRANCH;
+
+       /* Strip off the path, only save the file */
+       p = f->file + strlen(f->file);
+       while (p >= f->file && *p != '/')
+               p--;
+       p++;
+
+       strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+       strncpy(entry->file, p, TRACE_FILE_SIZE);
+       entry->func[TRACE_FUNC_SIZE] = 0;
+       entry->file[TRACE_FILE_SIZE] = 0;
+       entry->line = f->line;
+       entry->correct = val == expect;
+
+       ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ out:
+       atomic_dec(&tr->data[cpu]->disabled);
+       local_irq_restore(flags);
+}
+
+static inline
+void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+{
+       if (!branch_tracing_enabled)
+               return;
+
+       probe_likely_condition(f, val, expect);
+}
+
+int enable_branch_tracing(struct trace_array *tr)
+{
+       int ret = 0;
+
+       mutex_lock(&branch_tracing_mutex);
+       branch_tracer = tr;
+       /*
+        * Must be seen before enabling. The reader is a condition
+        * where we do not need a matching rmb()
+        */
+       smp_wmb();
+       branch_tracing_enabled++;
+       mutex_unlock(&branch_tracing_mutex);
+
+       return ret;
+}
+
+void disable_branch_tracing(void)
+{
+       mutex_lock(&branch_tracing_mutex);
+
+       if (!branch_tracing_enabled)
+               goto out_unlock;
+
+       branch_tracing_enabled--;
+
+ out_unlock:
+       mutex_unlock(&branch_tracing_mutex);
+}
+
+static void start_branch_trace(struct trace_array *tr)
+{
+       enable_branch_tracing(tr);
+}
+
+static void stop_branch_trace(struct trace_array *tr)
+{
+       disable_branch_tracing();
+}
+
+static int branch_trace_init(struct trace_array *tr)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu)
+               tracing_reset(tr, cpu);
+
+       start_branch_trace(tr);
+       return 0;
+}
+
+static void branch_trace_reset(struct trace_array *tr)
+{
+       stop_branch_trace(tr);
+}
+
+struct tracer branch_trace __read_mostly =
+{
+       .name           = "branch",
+       .init           = branch_trace_init,
+       .reset          = branch_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+       .selftest       = trace_selftest_startup_branch,
+#endif
+};
+
+__init static int init_branch_trace(void)
+{
+       return register_tracer(&branch_trace);
+}
+
+device_initcall(init_branch_trace);
+#else
+static inline
+void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
+{
+}
+#endif /* CONFIG_BRANCH_TRACER */
+
+void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
+{
+       /*
+        * I would love to have a trace point here instead, but the
+        * trace point code is so inundated with unlikely and likely
+        * conditions that the recursive nightmare that exists is too
+        * much to try to get working. At least for now.
+        */
+       trace_likely_condition(f, val, expect);
+
+       /* FIXME: Make this atomic! */
+       if (val == expect)
+               f->correct++;
+       else
+               f->incorrect++;
+}
+EXPORT_SYMBOL(ftrace_likely_update);
+
+struct ftrace_pointer {
+       void            *start;
+       void            *stop;
+       int             hit;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       const struct ftrace_pointer *f = m->private;
+       struct ftrace_branch_data *p = v;
+
+       (*pos)++;
+
+       if (v == (void *)1)
+               return f->start;
+
+       ++p;
+
+       if ((void *)p >= (void *)f->stop)
+               return NULL;
+
+       return p;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+       void *t = (void *)1;
+       loff_t l = 0;
+
+       for (; t && l < *pos; t = t_next(m, t, &l))
+               ;
+
+       return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+       const struct ftrace_pointer *fp = m->private;
+       struct ftrace_branch_data *p = v;
+       const char *f;
+       long percent;
+
+       if (v == (void *)1) {
+               if (fp->hit)
+                       seq_printf(m, "   miss      hit    %% ");
+               else
+                       seq_printf(m, " correct incorrect  %% ");
+               seq_printf(m, "       Function                "
+                             "  File              Line\n"
+                             " ------- ---------  - "
+                             "       --------                "
+                             "  ----              ----\n");
+               return 0;
+       }
+
+       /* Only print the file, not the path */
+       f = p->file + strlen(p->file);
+       while (f >= p->file && *f != '/')
+               f--;
+       f++;
+
+       /*
+        * The miss is overlayed on correct, and hit on incorrect.
+        */
+       if (p->correct) {
+               percent = p->incorrect * 100;
+               percent /= p->correct + p->incorrect;
+       } else
+               percent = p->incorrect ? 100 : -1;
+
+       seq_printf(m, "%8lu %8lu ",  p->correct, p->incorrect);
+       if (percent < 0)
+               seq_printf(m, "  X ");
+       else
+               seq_printf(m, "%3ld ", percent);
+       seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+       return 0;
+}
+
+static struct seq_operations tracing_likely_seq_ops = {
+       .start          = t_start,
+       .next           = t_next,
+       .stop           = t_stop,
+       .show           = t_show,
+};
+
+static int tracing_branch_open(struct inode *inode, struct file *file)
+{
+       int ret;
+
+       ret = seq_open(file, &tracing_likely_seq_ops);
+       if (!ret) {
+               struct seq_file *m = file->private_data;
+               m->private = (void *)inode->i_private;
+       }
+
+       return ret;
+}
+
+static const struct file_operations tracing_branch_fops = {
+       .open           = tracing_branch_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+};
+
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+extern unsigned long __start_branch_profile[];
+extern unsigned long __stop_branch_profile[];
+
+static const struct ftrace_pointer ftrace_branch_pos = {
+       .start                  = __start_branch_profile,
+       .stop                   = __stop_branch_profile,
+       .hit                    = 1,
+};
+
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+
+extern unsigned long __start_annotated_branch_profile[];
+extern unsigned long __stop_annotated_branch_profile[];
+
+static const struct ftrace_pointer ftrace_annotated_branch_pos = {
+       .start                  = __start_annotated_branch_profile,
+       .stop                   = __stop_annotated_branch_profile,
+};
+
+static __init int ftrace_branch_init(void)
+{
+       struct dentry *d_tracer;
+       struct dentry *entry;
+
+       d_tracer = tracing_init_dentry();
+
+       entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
+                                   (void *)&ftrace_annotated_branch_pos,
+                                   &tracing_branch_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'profile_annotatet_branch' entry\n");
+
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+       entry = debugfs_create_file("profile_branch", 0444, d_tracer,
+                                   (void *)&ftrace_branch_pos,
+                                   &tracing_branch_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs"
+                          " 'profile_branch' entry\n");
+#endif
+
+       return 0;
+}
+
+device_initcall(ftrace_branch_init);
diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c
new file mode 100644 (file)
index 0000000..23b76e4
--- /dev/null
@@ -0,0 +1,276 @@
+/*
+ * BTS tracer
+ *
+ * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+
+#include <asm/ds.h>
+
+#include "trace.h"
+
+
+#define SIZEOF_BTS (1 << 13)
+
+static DEFINE_PER_CPU(struct bts_tracer *, tracer);
+static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
+
+#define this_tracer per_cpu(tracer, smp_processor_id())
+#define this_buffer per_cpu(buffer, smp_processor_id())
+
+
+/*
+ * Information to interpret a BTS record.
+ * This will go into an in-kernel BTS interface.
+ */
+static unsigned char sizeof_field;
+static unsigned long debugctl_mask;
+
+#define sizeof_bts (3 * sizeof_field)
+
+static void bts_trace_cpuinit(struct cpuinfo_x86 *c)
+{
+       switch (c->x86) {
+       case 0x6:
+               switch (c->x86_model) {
+               case 0x0 ... 0xC:
+                       break;
+               case 0xD:
+               case 0xE: /* Pentium M */
+                       sizeof_field = sizeof(long);
+                       debugctl_mask = (1<<6)|(1<<7);
+                       break;
+               default:
+                       sizeof_field = 8;
+                       debugctl_mask = (1<<6)|(1<<7);
+                       break;
+               }
+               break;
+       case 0xF:
+               switch (c->x86_model) {
+               case 0x0:
+               case 0x1:
+               case 0x2: /* Netburst */
+                       sizeof_field = sizeof(long);
+                       debugctl_mask = (1<<2)|(1<<3);
+                       break;
+               default:
+                       /* sorry, don't know about them */
+                       break;
+               }
+               break;
+       default:
+               /* sorry, don't know about them */
+               break;
+       }
+}
+
+static inline void bts_enable(void)
+{
+       unsigned long debugctl;
+
+       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+       wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask);
+}
+
+static inline void bts_disable(void)
+{
+       unsigned long debugctl;
+
+       rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+       wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask);
+}
+
+static void bts_trace_reset(struct trace_array *tr)
+{
+       int cpu;
+
+       tr->time_start = ftrace_now(tr->cpu);
+
+       for_each_online_cpu(cpu)
+               tracing_reset(tr, cpu);
+}
+
+static void bts_trace_start_cpu(void *arg)
+{
+       this_tracer =
+               ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
+                              /* ovfl = */ NULL, /* th = */ (size_t)-1);
+       if (IS_ERR(this_tracer)) {
+               this_tracer = NULL;
+               return;
+       }
+
+       bts_enable();
+}
+
+static void bts_trace_start(struct trace_array *tr)
+{
+       int cpu;
+
+       bts_trace_reset(tr);
+
+       for_each_cpu_mask(cpu, cpu_possible_map)
+               smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+}
+
+static void bts_trace_stop_cpu(void *arg)
+{
+       if (this_tracer) {
+               bts_disable();
+
+               ds_release_bts(this_tracer);
+               this_tracer = NULL;
+       }
+}
+
+static void bts_trace_stop(struct trace_array *tr)
+{
+       int cpu;
+
+       for_each_cpu_mask(cpu, cpu_possible_map)
+               smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
+}
+
+static int bts_trace_init(struct trace_array *tr)
+{
+       bts_trace_cpuinit(&boot_cpu_data);
+       bts_trace_reset(tr);
+       bts_trace_start(tr);
+
+       return 0;
+}
+
+static void bts_trace_print_header(struct seq_file *m)
+{
+#ifdef __i386__
+       seq_puts(m, "# CPU#    FROM           TO     FUNCTION\n");
+       seq_puts(m, "#  |       |             |         |\n");
+#else
+       seq_puts(m,
+                "# CPU#        FROM                   TO         FUNCTION\n");
+       seq_puts(m,
+                "#  |           |                     |             |\n");
+#endif
+}
+
+static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
+{
+       struct trace_entry *entry = iter->ent;
+       struct trace_seq *seq = &iter->seq;
+       struct bts_entry *it;
+
+       trace_assign_type(it, entry);
+
+       if (entry->type == TRACE_BTS) {
+               int ret;
+#ifdef CONFIG_KALLSYMS
+               char function[KSYM_SYMBOL_LEN];
+               sprint_symbol(function, it->from);
+#else
+               char *function = "<unknown>";
+#endif
+
+               ret = trace_seq_printf(seq, "%4d  0x%lx -> 0x%lx [%s]\n",
+                                      entry->cpu, it->from, it->to, function);
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;;
+               return TRACE_TYPE_HANDLED;
+       }
+       return TRACE_TYPE_UNHANDLED;
+}
+
+void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to)
+{
+       struct ring_buffer_event *event;
+       struct bts_entry *entry;
+       unsigned long irq;
+
+       event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
+       if (!event)
+               return;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, 0, from);
+       entry->ent.type = TRACE_BTS;
+       entry->ent.cpu = smp_processor_id();
+       entry->from = from;
+       entry->to   = to;
+       ring_buffer_unlock_commit(tr->buffer, event, irq);
+}
+
+static void trace_bts_at(struct trace_array *tr, size_t index)
+{
+       const void *raw = NULL;
+       unsigned long from, to;
+       int err;
+
+       err = ds_access_bts(this_tracer, index, &raw);
+       if (err < 0)
+               return;
+
+       from = *(const unsigned long *)raw;
+       to = *(const unsigned long *)((const char *)raw + sizeof_field);
+
+       trace_bts(tr, from, to);
+}
+
+static void trace_bts_cpu(void *arg)
+{
+       struct trace_array *tr = (struct trace_array *) arg;
+       size_t index = 0, end = 0, i;
+       int err;
+
+       if (!this_tracer)
+               return;
+
+       bts_disable();
+
+       err = ds_get_bts_index(this_tracer, &index);
+       if (err < 0)
+               goto out;
+
+       err = ds_get_bts_end(this_tracer, &end);
+       if (err < 0)
+               goto out;
+
+       for (i = index; i < end; i++)
+               trace_bts_at(tr, i);
+
+       for (i = 0; i < index; i++)
+               trace_bts_at(tr, i);
+
+out:
+       bts_enable();
+}
+
+static void trace_bts_prepare(struct trace_iterator *iter)
+{
+       int cpu;
+
+       for_each_cpu_mask(cpu, cpu_possible_map)
+               smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
+}
+
+struct tracer bts_tracer __read_mostly =
+{
+       .name           = "bts",
+       .init           = bts_trace_init,
+       .reset          = bts_trace_stop,
+       .print_header   = bts_trace_print_header,
+       .print_line     = bts_trace_print_line,
+       .start          = bts_trace_start,
+       .stop           = bts_trace_stop,
+       .open           = trace_bts_prepare
+};
+
+__init static int init_bts_trace(void)
+{
+       return register_tracer(&bts_tracer);
+}
+device_initcall(init_bts_trace);
index 0f85a64003d3980d1002d1877712808703f25cb9..e74f6d0a321663b3610fb0e37d250c4fbf9696dc 100644 (file)
@@ -42,24 +42,20 @@ static void stop_function_trace(struct trace_array *tr)
        tracing_stop_cmdline_record();
 }
 
-static void function_trace_init(struct trace_array *tr)
+static int function_trace_init(struct trace_array *tr)
 {
-       if (tr->ctrl)
-               start_function_trace(tr);
+       start_function_trace(tr);
+       return 0;
 }
 
 static void function_trace_reset(struct trace_array *tr)
 {
-       if (tr->ctrl)
-               stop_function_trace(tr);
+       stop_function_trace(tr);
 }
 
-static void function_trace_ctrl_update(struct trace_array *tr)
+static void function_trace_start(struct trace_array *tr)
 {
-       if (tr->ctrl)
-               start_function_trace(tr);
-       else
-               stop_function_trace(tr);
+       function_reset(tr);
 }
 
 static struct tracer function_trace __read_mostly =
@@ -67,7 +63,7 @@ static struct tracer function_trace __read_mostly =
        .name        = "function",
        .init        = function_trace_init,
        .reset       = function_trace_reset,
-       .ctrl_update = function_trace_ctrl_update,
+       .start       = function_trace_start,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_function,
 #endif
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
new file mode 100644 (file)
index 0000000..af60eef
--- /dev/null
@@ -0,0 +1,611 @@
+/*
+ *
+ * Function graph tracer.
+ * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Mostly borrowed from function tracer which
+ * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+#define TRACE_GRAPH_INDENT     2
+
+/* Flag options */
+#define TRACE_GRAPH_PRINT_OVERRUN      0x1
+#define TRACE_GRAPH_PRINT_CPU          0x2
+#define TRACE_GRAPH_PRINT_OVERHEAD     0x4
+#define TRACE_GRAPH_PRINT_PROC         0x8
+
+static struct tracer_opt trace_opts[] = {
+       /* Display overruns ? */
+       { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
+       /* Display CPU ? */
+       { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
+       /* Display Overhead ? */
+       { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
+       /* Display proc name/pid */
+       { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
+       { } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+       /* Don't display overruns and proc by default */
+       .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
+       .opts = trace_opts
+};
+
+/* pid on the last trace processed */
+static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
+
+static int graph_trace_init(struct trace_array *tr)
+{
+       int cpu, ret;
+
+       for_each_online_cpu(cpu)
+               tracing_reset(tr, cpu);
+
+       ret = register_ftrace_graph(&trace_graph_return,
+                                       &trace_graph_entry);
+       if (ret)
+               return ret;
+       tracing_start_cmdline_record();
+
+       return 0;
+}
+
+static void graph_trace_reset(struct trace_array *tr)
+{
+       tracing_stop_cmdline_record();
+       unregister_ftrace_graph();
+}
+
+static inline int log10_cpu(int nb)
+{
+       if (nb / 100)
+               return 3;
+       if (nb / 10)
+               return 2;
+       return 1;
+}
+
+static enum print_line_t
+print_graph_cpu(struct trace_seq *s, int cpu)
+{
+       int i;
+       int ret;
+       int log10_this = log10_cpu(cpu);
+       int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map));
+
+
+       /*
+        * Start with a space character - to make it stand out
+        * to the right a bit when trace output is pasted into
+        * email:
+        */
+       ret = trace_seq_printf(s, " ");
+
+       /*
+        * Tricky - we space the CPU field according to the max
+        * number of online CPUs. On a 2-cpu system it would take
+        * a maximum of 1 digit - on a 128 cpu system it would
+        * take up to 3 digits:
+        */
+       for (i = 0; i < log10_all - log10_this; i++) {
+               ret = trace_seq_printf(s, " ");
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+       ret = trace_seq_printf(s, "%d) ", cpu);
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       return TRACE_TYPE_HANDLED;
+}
+
+#define TRACE_GRAPH_PROCINFO_LENGTH    14
+
+static enum print_line_t
+print_graph_proc(struct trace_seq *s, pid_t pid)
+{
+       int i;
+       int ret;
+       int len;
+       char comm[8];
+       int spaces = 0;
+       /* sign + log10(MAX_INT) + '\0' */
+       char pid_str[11];
+
+       strncpy(comm, trace_find_cmdline(pid), 7);
+       comm[7] = '\0';
+       sprintf(pid_str, "%d", pid);
+
+       /* 1 stands for the "-" character */
+       len = strlen(comm) + strlen(pid_str) + 1;
+
+       if (len < TRACE_GRAPH_PROCINFO_LENGTH)
+               spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
+
+       /* First spaces to align center */
+       for (i = 0; i < spaces / 2; i++) {
+               ret = trace_seq_printf(s, " ");
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       ret = trace_seq_printf(s, "%s-%s", comm, pid_str);
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       /* Last spaces to align center */
+       for (i = 0; i < spaces - (spaces / 2); i++) {
+               ret = trace_seq_printf(s, " ");
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+       return TRACE_TYPE_HANDLED;
+}
+
+
+/* If the pid changed since the last trace, output this event */
+static enum print_line_t
+verif_pid(struct trace_seq *s, pid_t pid, int cpu)
+{
+       pid_t prev_pid;
+       int ret;
+
+       if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
+               return TRACE_TYPE_HANDLED;
+
+       prev_pid = last_pid[cpu];
+       last_pid[cpu] = pid;
+
+/*
+ * Context-switch trace line:
+
+ ------------------------------------------
+ | 1)  migration/0--1  =>  sshd-1755
+ ------------------------------------------
+
+ */
+       ret = trace_seq_printf(s,
+               " ------------------------------------------\n");
+       if (!ret)
+               TRACE_TYPE_PARTIAL_LINE;
+
+       ret = print_graph_cpu(s, cpu);
+       if (ret == TRACE_TYPE_PARTIAL_LINE)
+               TRACE_TYPE_PARTIAL_LINE;
+
+       ret = print_graph_proc(s, prev_pid);
+       if (ret == TRACE_TYPE_PARTIAL_LINE)
+               TRACE_TYPE_PARTIAL_LINE;
+
+       ret = trace_seq_printf(s, " => ");
+       if (!ret)
+               TRACE_TYPE_PARTIAL_LINE;
+
+       ret = print_graph_proc(s, pid);
+       if (ret == TRACE_TYPE_PARTIAL_LINE)
+               TRACE_TYPE_PARTIAL_LINE;
+
+       ret = trace_seq_printf(s,
+               "\n ------------------------------------------\n\n");
+       if (!ret)
+               TRACE_TYPE_PARTIAL_LINE;
+
+       return ret;
+}
+
+static bool
+trace_branch_is_leaf(struct trace_iterator *iter,
+               struct ftrace_graph_ent_entry *curr)
+{
+       struct ring_buffer_iter *ring_iter;
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ret_entry *next;
+
+       ring_iter = iter->buffer_iter[iter->cpu];
+
+       if (!ring_iter)
+               return false;
+
+       event = ring_buffer_iter_peek(ring_iter, NULL);
+
+       if (!event)
+               return false;
+
+       next = ring_buffer_event_data(event);
+
+       if (next->ent.type != TRACE_GRAPH_RET)
+               return false;
+
+       if (curr->ent.pid != next->ent.pid ||
+                       curr->graph_ent.func != next->ret.func)
+               return false;
+
+       return true;
+}
+
+
+static enum print_line_t
+print_graph_duration(unsigned long long duration, struct trace_seq *s)
+{
+       unsigned long nsecs_rem = do_div(duration, 1000);
+       /* log10(ULONG_MAX) + '\0' */
+       char msecs_str[21];
+       char nsecs_str[5];
+       int ret, len;
+       int i;
+
+       sprintf(msecs_str, "%lu", (unsigned long) duration);
+
+       /* Print msecs */
+       ret = trace_seq_printf(s, msecs_str);
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       len = strlen(msecs_str);
+
+       /* Print nsecs (we don't want to exceed 7 numbers) */
+       if (len < 7) {
+               snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem);
+               ret = trace_seq_printf(s, ".%s", nsecs_str);
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+               len += strlen(nsecs_str);
+       }
+
+       ret = trace_seq_printf(s, " us ");
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       /* Print remaining spaces to fit the row's width */
+       for (i = len; i < 7; i++) {
+               ret = trace_seq_printf(s, " ");
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       ret = trace_seq_printf(s, "|  ");
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+       return TRACE_TYPE_HANDLED;
+
+}
+
+/* Signal a overhead of time execution to the output */
+static int
+print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+{
+       /* Duration exceeded 100 msecs */
+       if (duration > 100000ULL)
+               return trace_seq_printf(s, "! ");
+
+       /* Duration exceeded 10 msecs */
+       if (duration > 10000ULL)
+               return trace_seq_printf(s, "+ ");
+
+       return trace_seq_printf(s, "  ");
+}
+
+/* Case of a leaf function on its call entry */
+static enum print_line_t
+print_graph_entry_leaf(struct trace_iterator *iter,
+               struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
+{
+       struct ftrace_graph_ret_entry *ret_entry;
+       struct ftrace_graph_ret *graph_ret;
+       struct ring_buffer_event *event;
+       struct ftrace_graph_ent *call;
+       unsigned long long duration;
+       int ret;
+       int i;
+
+       event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
+       ret_entry = ring_buffer_event_data(event);
+       graph_ret = &ret_entry->ret;
+       call = &entry->graph_ent;
+       duration = graph_ret->rettime - graph_ret->calltime;
+
+       /* Overhead */
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+               ret = print_graph_overhead(duration, s);
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       /* Duration */
+       ret = print_graph_duration(duration, s);
+       if (ret == TRACE_TYPE_PARTIAL_LINE)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       /* Function */
+       for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
+               ret = trace_seq_printf(s, " ");
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       ret = seq_print_ip_sym(s, call->func, 0);
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       ret = trace_seq_printf(s, "();\n");
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
+                       struct trace_seq *s)
+{
+       int i;
+       int ret;
+       struct ftrace_graph_ent *call = &entry->graph_ent;
+
+       /* No overhead */
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+               ret = trace_seq_printf(s, "  ");
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       /* No time */
+       ret = trace_seq_printf(s, "            |  ");
+
+       /* Function */
+       for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
+               ret = trace_seq_printf(s, " ");
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       ret = seq_print_ip_sym(s, call->func, 0);
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       ret = trace_seq_printf(s, "() {\n");
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
+                       struct trace_iterator *iter, int cpu)
+{
+       int ret;
+       struct trace_entry *ent = iter->ent;
+
+       /* Pid */
+       if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       /* Cpu */
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+               ret = print_graph_cpu(s, cpu);
+               if (ret == TRACE_TYPE_PARTIAL_LINE)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       /* Proc */
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+               ret = print_graph_proc(s, ent->pid);
+               if (ret == TRACE_TYPE_PARTIAL_LINE)
+                       return TRACE_TYPE_PARTIAL_LINE;
+
+               ret = trace_seq_printf(s, " | ");
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       if (trace_branch_is_leaf(iter, field))
+               return print_graph_entry_leaf(iter, field, s);
+       else
+               return print_graph_entry_nested(field, s);
+
+}
+
+static enum print_line_t
+print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
+                  struct trace_entry *ent, int cpu)
+{
+       int i;
+       int ret;
+       unsigned long long duration = trace->rettime - trace->calltime;
+
+       /* Pid */
+       if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       /* Cpu */
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+               ret = print_graph_cpu(s, cpu);
+               if (ret == TRACE_TYPE_PARTIAL_LINE)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       /* Proc */
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+               ret = print_graph_proc(s, ent->pid);
+               if (ret == TRACE_TYPE_PARTIAL_LINE)
+                       return TRACE_TYPE_PARTIAL_LINE;
+
+               ret = trace_seq_printf(s, " | ");
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       /* Overhead */
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+               ret = print_graph_overhead(duration, s);
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       /* Duration */
+       ret = print_graph_duration(duration, s);
+       if (ret == TRACE_TYPE_PARTIAL_LINE)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       /* Closing brace */
+       for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
+               ret = trace_seq_printf(s, " ");
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       ret = trace_seq_printf(s, "}\n");
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       /* Overrun */
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
+               ret = trace_seq_printf(s, " (Overruns: %lu)\n",
+                                       trace->overrun);
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_comment(struct print_entry *trace, struct trace_seq *s,
+                  struct trace_entry *ent, struct trace_iterator *iter)
+{
+       int i;
+       int ret;
+
+       /* Pid */
+       if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       /* Cpu */
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+               ret = print_graph_cpu(s, iter->cpu);
+               if (ret == TRACE_TYPE_PARTIAL_LINE)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       /* Proc */
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+               ret = print_graph_proc(s, ent->pid);
+               if (ret == TRACE_TYPE_PARTIAL_LINE)
+                       return TRACE_TYPE_PARTIAL_LINE;
+
+               ret = trace_seq_printf(s, " | ");
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       /* No overhead */
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+               ret = trace_seq_printf(s, "  ");
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+       }
+
+       /* No time */
+       ret = trace_seq_printf(s, "            |  ");
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       /* Indentation */
+       if (trace->depth > 0)
+               for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
+                       ret = trace_seq_printf(s, " ");
+                       if (!ret)
+                               return TRACE_TYPE_PARTIAL_LINE;
+               }
+
+       /* The comment */
+       ret = trace_seq_printf(s, "/* %s", trace->buf);
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       if (ent->flags & TRACE_FLAG_CONT)
+               trace_seq_print_cont(s, iter);
+
+       ret = trace_seq_printf(s, " */\n");
+       if (!ret)
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       return TRACE_TYPE_HANDLED;
+}
+
+
+enum print_line_t
+print_graph_function(struct trace_iterator *iter)
+{
+       struct trace_seq *s = &iter->seq;
+       struct trace_entry *entry = iter->ent;
+
+       switch (entry->type) {
+       case TRACE_GRAPH_ENT: {
+               struct ftrace_graph_ent_entry *field;
+               trace_assign_type(field, entry);
+               return print_graph_entry(field, s, iter,
+                                        iter->cpu);
+       }
+       case TRACE_GRAPH_RET: {
+               struct ftrace_graph_ret_entry *field;
+               trace_assign_type(field, entry);
+               return print_graph_return(&field->ret, s, entry, iter->cpu);
+       }
+       case TRACE_PRINT: {
+               struct print_entry *field;
+               trace_assign_type(field, entry);
+               return print_graph_comment(field, s, entry, iter);
+       }
+       default:
+               return TRACE_TYPE_UNHANDLED;
+       }
+}
+
+static void print_graph_headers(struct seq_file *s)
+{
+       /* 1st line */
+       seq_printf(s, "# ");
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+               seq_printf(s, "CPU ");
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+               seq_printf(s, "TASK/PID     ");
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD)
+               seq_printf(s, "OVERHEAD/");
+       seq_printf(s, "DURATION            FUNCTION CALLS\n");
+
+       /* 2nd line */
+       seq_printf(s, "# ");
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
+               seq_printf(s, "|   ");
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
+               seq_printf(s, "|      |     ");
+       if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+               seq_printf(s, "|        ");
+               seq_printf(s, "|                   |   |   |   |\n");
+       } else
+               seq_printf(s, "    |               |   |   |   |\n");
+}
+static struct tracer graph_trace __read_mostly = {
+       .name           = "function_graph",
+       .init           = graph_trace_init,
+       .reset          = graph_trace_reset,
+       .print_line     = print_graph_function,
+       .print_header   = print_graph_headers,
+       .flags          = &tracer_flags,
+};
+
+static __init int init_graph_trace(void)
+{
+       return register_tracer(&graph_trace);
+}
+
+device_initcall(init_graph_trace);
index 9c74071c10e0b5c8534039c50bd3845f04a036fe..7c2e326bbc8b15d942532d5951a38978fb66e194 100644 (file)
@@ -353,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
 }
 #endif /* CONFIG_PREEMPT_TRACER */
 
+/*
+ * save_tracer_enabled is used to save the state of the tracer_enabled
+ * variable when we disable it when we open a trace output file.
+ */
+static int save_tracer_enabled;
+
 static void start_irqsoff_tracer(struct trace_array *tr)
 {
        register_ftrace_function(&trace_ops);
-       tracer_enabled = 1;
+       if (tracing_is_enabled()) {
+               tracer_enabled = 1;
+               save_tracer_enabled = 1;
+       } else {
+               tracer_enabled = 0;
+               save_tracer_enabled = 0;
+       }
 }
 
 static void stop_irqsoff_tracer(struct trace_array *tr)
 {
        tracer_enabled = 0;
+       save_tracer_enabled = 0;
        unregister_ftrace_function(&trace_ops);
 }
 
@@ -370,53 +383,55 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
        irqsoff_trace = tr;
        /* make sure that the tracer is visible */
        smp_wmb();
-
-       if (tr->ctrl)
-               start_irqsoff_tracer(tr);
+       start_irqsoff_tracer(tr);
 }
 
 static void irqsoff_tracer_reset(struct trace_array *tr)
 {
-       if (tr->ctrl)
-               stop_irqsoff_tracer(tr);
+       stop_irqsoff_tracer(tr);
 }
 
-static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
+static void irqsoff_tracer_start(struct trace_array *tr)
 {
-       if (tr->ctrl)
-               start_irqsoff_tracer(tr);
-       else
-               stop_irqsoff_tracer(tr);
+       tracer_enabled = 1;
+       save_tracer_enabled = 1;
+}
+
+static void irqsoff_tracer_stop(struct trace_array *tr)
+{
+       tracer_enabled = 0;
+       save_tracer_enabled = 0;
 }
 
 static void irqsoff_tracer_open(struct trace_iterator *iter)
 {
        /* stop the trace while dumping */
-       if (iter->tr->ctrl)
-               stop_irqsoff_tracer(iter->tr);
+       tracer_enabled = 0;
 }
 
 static void irqsoff_tracer_close(struct trace_iterator *iter)
 {
-       if (iter->tr->ctrl)
-               start_irqsoff_tracer(iter->tr);
+       /* restart tracing */
+       tracer_enabled = save_tracer_enabled;
 }
 
 #ifdef CONFIG_IRQSOFF_TRACER
-static void irqsoff_tracer_init(struct trace_array *tr)
+static int irqsoff_tracer_init(struct trace_array *tr)
 {
        trace_type = TRACER_IRQS_OFF;
 
        __irqsoff_tracer_init(tr);
+       return 0;
 }
 static struct tracer irqsoff_tracer __read_mostly =
 {
        .name           = "irqsoff",
        .init           = irqsoff_tracer_init,
        .reset          = irqsoff_tracer_reset,
+       .start          = irqsoff_tracer_start,
+       .stop           = irqsoff_tracer_stop,
        .open           = irqsoff_tracer_open,
        .close          = irqsoff_tracer_close,
-       .ctrl_update    = irqsoff_tracer_ctrl_update,
        .print_max      = 1,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_irqsoff,
@@ -428,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly =
 #endif
 
 #ifdef CONFIG_PREEMPT_TRACER
-static void preemptoff_tracer_init(struct trace_array *tr)
+static int preemptoff_tracer_init(struct trace_array *tr)
 {
        trace_type = TRACER_PREEMPT_OFF;
 
        __irqsoff_tracer_init(tr);
+       return 0;
 }
 
 static struct tracer preemptoff_tracer __read_mostly =
@@ -440,9 +456,10 @@ static struct tracer preemptoff_tracer __read_mostly =
        .name           = "preemptoff",
        .init           = preemptoff_tracer_init,
        .reset          = irqsoff_tracer_reset,
+       .start          = irqsoff_tracer_start,
+       .stop           = irqsoff_tracer_stop,
        .open           = irqsoff_tracer_open,
        .close          = irqsoff_tracer_close,
-       .ctrl_update    = irqsoff_tracer_ctrl_update,
        .print_max      = 1,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_preemptoff,
@@ -456,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly =
 #if defined(CONFIG_IRQSOFF_TRACER) && \
        defined(CONFIG_PREEMPT_TRACER)
 
-static void preemptirqsoff_tracer_init(struct trace_array *tr)
+static int preemptirqsoff_tracer_init(struct trace_array *tr)
 {
        trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
 
        __irqsoff_tracer_init(tr);
+       return 0;
 }
 
 static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -468,9 +486,10 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
        .name           = "preemptirqsoff",
        .init           = preemptirqsoff_tracer_init,
        .reset          = irqsoff_tracer_reset,
+       .start          = irqsoff_tracer_start,
+       .stop           = irqsoff_tracer_stop,
        .open           = irqsoff_tracer_open,
        .close          = irqsoff_tracer_close,
-       .ctrl_update    = irqsoff_tracer_ctrl_update,
        .print_max      = 1,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_preemptirqsoff,
index e62cbf78eab6d6a6270a2a5c97f4ffc7ac443525..2fb6da6523b31131aee6d7cea6351b85b3178d54 100644 (file)
@@ -32,34 +32,29 @@ static void mmio_reset_data(struct trace_array *tr)
                tracing_reset(tr, cpu);
 }
 
-static void mmio_trace_init(struct trace_array *tr)
+static int mmio_trace_init(struct trace_array *tr)
 {
        pr_debug("in %s\n", __func__);
        mmio_trace_array = tr;
-       if (tr->ctrl) {
-               mmio_reset_data(tr);
-               enable_mmiotrace();
-       }
+
+       mmio_reset_data(tr);
+       enable_mmiotrace();
+       return 0;
 }
 
 static void mmio_trace_reset(struct trace_array *tr)
 {
        pr_debug("in %s\n", __func__);
-       if (tr->ctrl)
-               disable_mmiotrace();
+
+       disable_mmiotrace();
        mmio_reset_data(tr);
        mmio_trace_array = NULL;
 }
 
-static void mmio_trace_ctrl_update(struct trace_array *tr)
+static void mmio_trace_start(struct trace_array *tr)
 {
        pr_debug("in %s\n", __func__);
-       if (tr->ctrl) {
-               mmio_reset_data(tr);
-               enable_mmiotrace();
-       } else {
-               disable_mmiotrace();
-       }
+       mmio_reset_data(tr);
 }
 
 static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
@@ -296,10 +291,10 @@ static struct tracer mmio_tracer __read_mostly =
        .name           = "mmiotrace",
        .init           = mmio_trace_init,
        .reset          = mmio_trace_reset,
+       .start          = mmio_trace_start,
        .pipe_open      = mmio_pipe_open,
        .close          = mmio_close,
        .read           = mmio_read,
-       .ctrl_update    = mmio_trace_ctrl_update,
        .print_line     = mmio_print_line,
 };
 
@@ -371,5 +366,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
 
 int mmio_trace_printk(const char *fmt, va_list args)
 {
-       return trace_vprintk(0, fmt, args);
+       return trace_vprintk(0, -1, fmt, args);
 }
index 4592b4862515c9d1680417f4fe46bc192999d661..b9767acd30acca0fcb6192b847888d1d32a6924a 100644 (file)
 
 #include "trace.h"
 
+/* Our two options */
+enum {
+       TRACE_NOP_OPT_ACCEPT = 0x1,
+       TRACE_NOP_OPT_REFUSE = 0x2
+};
+
+/* Options for the tracer (see trace_options file) */
+static struct tracer_opt nop_opts[] = {
+       /* Option that will be accepted by set_flag callback */
+       { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) },
+       /* Option that will be refused by set_flag callback */
+       { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) },
+       { } /* Always set a last empty entry */
+};
+
+static struct tracer_flags nop_flags = {
+       /* You can check your flags value here when you want. */
+       .val = 0, /* By default: all flags disabled */
+       .opts = nop_opts
+};
+
 static struct trace_array      *ctx_trace;
 
 static void start_nop_trace(struct trace_array *tr)
@@ -24,7 +45,7 @@ static void stop_nop_trace(struct trace_array *tr)
        /* Nothing to do! */
 }
 
-static void nop_trace_init(struct trace_array *tr)
+static int nop_trace_init(struct trace_array *tr)
 {
        int cpu;
        ctx_trace = tr;
@@ -32,33 +53,53 @@ static void nop_trace_init(struct trace_array *tr)
        for_each_online_cpu(cpu)
                tracing_reset(tr, cpu);
 
-       if (tr->ctrl)
-               start_nop_trace(tr);
+       start_nop_trace(tr);
+       return 0;
 }
 
 static void nop_trace_reset(struct trace_array *tr)
 {
-       if (tr->ctrl)
-               stop_nop_trace(tr);
+       stop_nop_trace(tr);
 }
 
-static void nop_trace_ctrl_update(struct trace_array *tr)
+/* It only serves as a signal handler and a callback to
+ * accept or refuse tthe setting of a flag.
+ * If you don't implement it, then the flag setting will be
+ * automatically accepted.
+ */
+static int nop_set_flag(u32 old_flags, u32 bit, int set)
 {
-       /* When starting a new trace, reset the buffers */
-       if (tr->ctrl)
-               start_nop_trace(tr);
-       else
-               stop_nop_trace(tr);
+       /*
+        * Note that you don't need to update nop_flags.val yourself.
+        * The tracing Api will do it automatically if you return 0
+        */
+       if (bit == TRACE_NOP_OPT_ACCEPT) {
+               printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept."
+                       " Now cat trace_options to see the result\n",
+                       set);
+               return 0;
+       }
+
+       if (bit == TRACE_NOP_OPT_REFUSE) {
+               printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse."
+                       "Now cat trace_options to see the result\n",
+                       set);
+               return -EINVAL;
+       }
+
+       return 0;
 }
 
+
 struct tracer nop_trace __read_mostly =
 {
        .name           = "nop",
        .init           = nop_trace_init,
        .reset          = nop_trace_reset,
-       .ctrl_update    = nop_trace_ctrl_update,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest       = trace_selftest_startup_nop,
 #endif
+       .flags          = &nop_flags,
+       .set_flag       = nop_set_flag
 };
 
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
new file mode 100644 (file)
index 0000000..a7172a3
--- /dev/null
@@ -0,0 +1,179 @@
+/*
+ * ring buffer based C-state tracer
+ *
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * Much is borrowed from trace_boot.c which is
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+#include <linux/module.h>
+
+#include "trace.h"
+
+static struct trace_array *power_trace;
+static int __read_mostly trace_power_enabled;
+
+
+static void start_power_trace(struct trace_array *tr)
+{
+       trace_power_enabled = 1;
+}
+
+static void stop_power_trace(struct trace_array *tr)
+{
+       trace_power_enabled = 0;
+}
+
+
+static int power_trace_init(struct trace_array *tr)
+{
+       int cpu;
+       power_trace = tr;
+
+       trace_power_enabled = 1;
+
+       for_each_cpu_mask(cpu, cpu_possible_map)
+               tracing_reset(tr, cpu);
+       return 0;
+}
+
+static enum print_line_t power_print_line(struct trace_iterator *iter)
+{
+       int ret = 0;
+       struct trace_entry *entry = iter->ent;
+       struct trace_power *field ;
+       struct power_trace *it;
+       struct trace_seq *s = &iter->seq;
+       struct timespec stamp;
+       struct timespec duration;
+
+       trace_assign_type(field, entry);
+       it = &field->state_data;
+       stamp = ktime_to_timespec(it->stamp);
+       duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
+
+       if (entry->type == TRACE_POWER) {
+               if (it->type == POWER_CSTATE)
+                       ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
+                                         stamp.tv_sec,
+                                         stamp.tv_nsec,
+                                         it->state, iter->cpu,
+                                         duration.tv_sec,
+                                         duration.tv_nsec);
+               if (it->type == POWER_PSTATE)
+                       ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
+                                         stamp.tv_sec,
+                                         stamp.tv_nsec,
+                                         it->state, iter->cpu);
+               if (!ret)
+                       return TRACE_TYPE_PARTIAL_LINE;
+               return TRACE_TYPE_HANDLED;
+       }
+       return TRACE_TYPE_UNHANDLED;
+}
+
+static struct tracer power_tracer __read_mostly =
+{
+       .name           = "power",
+       .init           = power_trace_init,
+       .start          = start_power_trace,
+       .stop           = stop_power_trace,
+       .reset          = stop_power_trace,
+       .print_line     = power_print_line,
+};
+
+static int init_power_trace(void)
+{
+       return register_tracer(&power_tracer);
+}
+device_initcall(init_power_trace);
+
+void trace_power_start(struct power_trace *it, unsigned int type,
+                        unsigned int level)
+{
+       if (!trace_power_enabled)
+               return;
+
+       memset(it, 0, sizeof(struct power_trace));
+       it->state = level;
+       it->type = type;
+       it->stamp = ktime_get();
+}
+EXPORT_SYMBOL_GPL(trace_power_start);
+
+
+void trace_power_end(struct power_trace *it)
+{
+       struct ring_buffer_event *event;
+       struct trace_power *entry;
+       struct trace_array_cpu *data;
+       unsigned long irq_flags;
+       struct trace_array *tr = power_trace;
+
+       if (!trace_power_enabled)
+               return;
+
+       preempt_disable();
+       it->end = ktime_get();
+       data = tr->data[smp_processor_id()];
+
+       event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               goto out;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, 0, 0);
+       entry->ent.type = TRACE_POWER;
+       entry->state_data = *it;
+       ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+       trace_wake_up();
+
+ out:
+       preempt_enable();
+}
+EXPORT_SYMBOL_GPL(trace_power_end);
+
+void trace_power_mark(struct power_trace *it, unsigned int type,
+                        unsigned int level)
+{
+       struct ring_buffer_event *event;
+       struct trace_power *entry;
+       struct trace_array_cpu *data;
+       unsigned long irq_flags;
+       struct trace_array *tr = power_trace;
+
+       if (!trace_power_enabled)
+               return;
+
+       memset(it, 0, sizeof(struct power_trace));
+       it->state = level;
+       it->type = type;
+       it->stamp = ktime_get();
+       preempt_disable();
+       it->end = it->stamp;
+       data = tr->data[smp_processor_id()];
+
+       event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+                                        &irq_flags);
+       if (!event)
+               goto out;
+       entry   = ring_buffer_event_data(event);
+       tracing_generic_entry_update(&entry->ent, 0, 0);
+       entry->ent.type = TRACE_POWER;
+       entry->state_data = *it;
+       ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+       trace_wake_up();
+
+ out:
+       preempt_enable();
+}
+EXPORT_SYMBOL_GPL(trace_power_mark);
index b8f56beb1a621d5ff527a93aee383f0e02fd30dd..863390557b445d88596dde1b452ffb2e7e3e4ec8 100644 (file)
@@ -16,7 +16,8 @@
 
 static struct trace_array      *ctx_trace;
 static int __read_mostly       tracer_enabled;
-static atomic_t                        sched_ref;
+static int                     sched_ref;
+static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -27,7 +28,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
        int cpu;
        int pc;
 
-       if (!atomic_read(&sched_ref))
+       if (!sched_ref)
                return;
 
        tracing_record_cmdline(prev);
@@ -123,20 +124,18 @@ static void tracing_sched_unregister(void)
 
 static void tracing_start_sched_switch(void)
 {
-       long ref;
-
-       ref = atomic_inc_return(&sched_ref);
-       if (ref == 1)
+       mutex_lock(&sched_register_mutex);
+       if (!(sched_ref++))
                tracing_sched_register();
+       mutex_unlock(&sched_register_mutex);
 }
 
 static void tracing_stop_sched_switch(void)
 {
-       long ref;
-
-       ref = atomic_dec_and_test(&sched_ref);
-       if (ref)
+       mutex_lock(&sched_register_mutex);
+       if (!(--sched_ref))
                tracing_sched_unregister();
+       mutex_unlock(&sched_register_mutex);
 }
 
 void tracing_start_cmdline_record(void)
@@ -149,40 +148,86 @@ void tracing_stop_cmdline_record(void)
        tracing_stop_sched_switch();
 }
 
+/**
+ * tracing_start_sched_switch_record - start tracing context switches
+ *
+ * Turns on context switch tracing for a tracer.
+ */
+void tracing_start_sched_switch_record(void)
+{
+       if (unlikely(!ctx_trace)) {
+               WARN_ON(1);
+               return;
+       }
+
+       tracing_start_sched_switch();
+
+       mutex_lock(&sched_register_mutex);
+       tracer_enabled++;
+       mutex_unlock(&sched_register_mutex);
+}
+
+/**
+ * tracing_stop_sched_switch_record - start tracing context switches
+ *
+ * Turns off context switch tracing for a tracer.
+ */
+void tracing_stop_sched_switch_record(void)
+{
+       mutex_lock(&sched_register_mutex);
+       tracer_enabled--;
+       WARN_ON(tracer_enabled < 0);
+       mutex_unlock(&sched_register_mutex);
+
+       tracing_stop_sched_switch();
+}
+
+/**
+ * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
+ * @tr: trace array pointer to assign
+ *
+ * Some tracers might want to record the context switches in their
+ * trace. This function lets those tracers assign the trace array
+ * to use.
+ */
+void tracing_sched_switch_assign_trace(struct trace_array *tr)
+{
+       ctx_trace = tr;
+}
+
 static void start_sched_trace(struct trace_array *tr)
 {
        sched_switch_reset(tr);
-       tracing_start_cmdline_record();
-       tracer_enabled = 1;
+       tracing_start_sched_switch_record();
 }
 
 static void stop_sched_trace(struct trace_array *tr)
 {
-       tracer_enabled = 0;
-       tracing_stop_cmdline_record();
+       tracing_stop_sched_switch_record();
 }
 
-static void sched_switch_trace_init(struct trace_array *tr)
+static int sched_switch_trace_init(struct trace_array *tr)
 {
        ctx_trace = tr;
-
-       if (tr->ctrl)
-               start_sched_trace(tr);
+       start_sched_trace(tr);
+       return 0;
 }
 
 static void sched_switch_trace_reset(struct trace_array *tr)
 {
-       if (tr->ctrl)
+       if (sched_ref)
                stop_sched_trace(tr);
 }
 
-static void sched_switch_trace_ctrl_update(struct trace_array *tr)
+static void sched_switch_trace_start(struct trace_array *tr)
 {
-       /* When starting a new trace, reset the buffers */
-       if (tr->ctrl)
-               start_sched_trace(tr);
-       else
-               stop_sched_trace(tr);
+       sched_switch_reset(tr);
+       tracing_start_sched_switch();
+}
+
+static void sched_switch_trace_stop(struct trace_array *tr)
+{
+       tracing_stop_sched_switch();
 }
 
 static struct tracer sched_switch_trace __read_mostly =
@@ -190,7 +235,8 @@ static struct tracer sched_switch_trace __read_mostly =
        .name           = "sched_switch",
        .init           = sched_switch_trace_init,
        .reset          = sched_switch_trace_reset,
-       .ctrl_update    = sched_switch_trace_ctrl_update,
+       .start          = sched_switch_trace_start,
+       .stop           = sched_switch_trace_stop,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_sched_switch,
 #endif
@@ -198,14 +244,6 @@ static struct tracer sched_switch_trace __read_mostly =
 
 __init static int init_sched_switch_trace(void)
 {
-       int ret = 0;
-
-       if (atomic_read(&sched_ref))
-               ret = tracing_sched_register();
-       if (ret) {
-               pr_info("error registering scheduler trace\n");
-               return ret;
-       }
        return register_tracer(&sched_switch_trace);
 }
 device_initcall(init_sched_switch_trace);
index 3ae93f16b565de131887da94ee10835c183b2d6a..0067b49746c1d6b8c8a121d90560fad4f52b78d7 100644 (file)
@@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
                return;
 
        pc = preempt_count();
-       resched = need_resched();
-       preempt_disable_notrace();
+       resched = ftrace_preempt_disable();
 
        cpu = raw_smp_processor_id();
        data = tr->data[cpu];
@@ -81,15 +80,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
  out:
        atomic_dec(&data->disabled);
 
-       /*
-        * To prevent recursion from the scheduler, if the
-        * resched flag was set before we entered, then
-        * don't reschedule.
-        */
-       if (resched)
-               preempt_enable_no_resched_notrace();
-       else
-               preempt_enable_notrace();
+       ftrace_preempt_enable(resched);
 }
 
 static struct ftrace_ops trace_ops __read_mostly =
@@ -271,6 +262,12 @@ out:
        atomic_dec(&wakeup_trace->data[cpu]->disabled);
 }
 
+/*
+ * save_tracer_enabled is used to save the state of the tracer_enabled
+ * variable when we disable it when we open a trace output file.
+ */
+static int save_tracer_enabled;
+
 static void start_wakeup_tracer(struct trace_array *tr)
 {
        int ret;
@@ -309,7 +306,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
 
        register_ftrace_function(&trace_ops);
 
-       tracer_enabled = 1;
+       if (tracing_is_enabled()) {
+               tracer_enabled = 1;
+               save_tracer_enabled = 1;
+       } else {
+               tracer_enabled = 0;
+               save_tracer_enabled = 0;
+       }
 
        return;
 fail_deprobe_wake_new:
@@ -321,49 +324,53 @@ fail_deprobe:
 static void stop_wakeup_tracer(struct trace_array *tr)
 {
        tracer_enabled = 0;
+       save_tracer_enabled = 0;
        unregister_ftrace_function(&trace_ops);
        unregister_trace_sched_switch(probe_wakeup_sched_switch);
        unregister_trace_sched_wakeup_new(probe_wakeup);
        unregister_trace_sched_wakeup(probe_wakeup);
 }
 
-static void wakeup_tracer_init(struct trace_array *tr)
+static int wakeup_tracer_init(struct trace_array *tr)
 {
        wakeup_trace = tr;
-
-       if (tr->ctrl)
-               start_wakeup_tracer(tr);
+       start_wakeup_tracer(tr);
+       return 0;
 }
 
 static void wakeup_tracer_reset(struct trace_array *tr)
 {
-       if (tr->ctrl) {
-               stop_wakeup_tracer(tr);
-               /* make sure we put back any tasks we are tracing */
-               wakeup_reset(tr);
-       }
+       stop_wakeup_tracer(tr);
+       /* make sure we put back any tasks we are tracing */
+       wakeup_reset(tr);
+}
+
+static void wakeup_tracer_start(struct trace_array *tr)
+{
+       wakeup_reset(tr);
+       tracer_enabled = 1;
+       save_tracer_enabled = 1;
 }
 
-static void wakeup_tracer_ctrl_update(struct trace_array *tr)
+static void wakeup_tracer_stop(struct trace_array *tr)
 {
-       if (tr->ctrl)
-               start_wakeup_tracer(tr);
-       else
-               stop_wakeup_tracer(tr);
+       tracer_enabled = 0;
+       save_tracer_enabled = 0;
 }
 
 static void wakeup_tracer_open(struct trace_iterator *iter)
 {
        /* stop the trace while dumping */
-       if (iter->tr->ctrl)
-               stop_wakeup_tracer(iter->tr);
+       tracer_enabled = 0;
 }
 
 static void wakeup_tracer_close(struct trace_iterator *iter)
 {
        /* forget about any processes we were recording */
-       if (iter->tr->ctrl)
-               start_wakeup_tracer(iter->tr);
+       if (save_tracer_enabled) {
+               wakeup_reset(iter->tr);
+               tracer_enabled = 1;
+       }
 }
 
 static struct tracer wakeup_tracer __read_mostly =
@@ -371,9 +378,10 @@ static struct tracer wakeup_tracer __read_mostly =
        .name           = "wakeup",
        .init           = wakeup_tracer_init,
        .reset          = wakeup_tracer_reset,
+       .start          = wakeup_tracer_start,
+       .stop           = wakeup_tracer_stop,
        .open           = wakeup_tracer_open,
        .close          = wakeup_tracer_close,
-       .ctrl_update    = wakeup_tracer_ctrl_update,
        .print_max      = 1,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_wakeup,
index 90bc752a7580b3d4800417d2f7a28abe604c1d65..88c8eb70f54aeb3508dda9c668f082b9a43b0c78 100644 (file)
@@ -13,6 +13,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
        case TRACE_STACK:
        case TRACE_PRINT:
        case TRACE_SPECIAL:
+       case TRACE_BRANCH:
                return 1;
        }
        return 0;
@@ -51,7 +52,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
        int cpu, ret = 0;
 
        /* Don't allow flipping of max traces now */
-       raw_local_irq_save(flags);
+       local_irq_save(flags);
        __raw_spin_lock(&ftrace_max_lock);
 
        cnt = ring_buffer_entries(tr->buffer);
@@ -62,7 +63,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
                        break;
        }
        __raw_spin_unlock(&ftrace_max_lock);
-       raw_local_irq_restore(flags);
+       local_irq_restore(flags);
 
        if (count)
                *count = cnt;
@@ -70,6 +71,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
        return ret;
 }
 
+static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
+{
+       printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n",
+               trace->name, init_ret);
+}
 #ifdef CONFIG_FUNCTION_TRACER
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -110,8 +116,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
        ftrace_set_filter(func_name, strlen(func_name), 1);
 
        /* enable tracing */
-       tr->ctrl = 1;
-       trace->init(tr);
+       ret = trace->init(tr);
+       if (ret) {
+               warn_failed_init_tracer(trace, ret);
+               goto out;
+       }
 
        /* Sleep for a 1/10 of a second */
        msleep(100);
@@ -134,13 +143,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
        msleep(100);
 
        /* stop the tracing. */
-       tr->ctrl = 0;
-       trace->ctrl_update(tr);
+       tracing_stop();
        ftrace_enabled = 0;
 
        /* check the trace buffer */
        ret = trace_test_buffer(tr, &count);
        trace->reset(tr);
+       tracing_start();
 
        /* we should only have one item */
        if (!ret && count != 1) {
@@ -148,6 +157,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
                ret = -1;
                goto out;
        }
+
  out:
        ftrace_enabled = save_ftrace_enabled;
        tracer_enabled = save_tracer_enabled;
@@ -180,18 +190,22 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
        ftrace_enabled = 1;
        tracer_enabled = 1;
 
-       tr->ctrl = 1;
-       trace->init(tr);
+       ret = trace->init(tr);
+       if (ret) {
+               warn_failed_init_tracer(trace, ret);
+               goto out;
+       }
+
        /* Sleep for a 1/10 of a second */
        msleep(100);
        /* stop the tracing. */
-       tr->ctrl = 0;
-       trace->ctrl_update(tr);
+       tracing_stop();
        ftrace_enabled = 0;
 
        /* check the trace buffer */
        ret = trace_test_buffer(tr, &count);
        trace->reset(tr);
+       tracing_start();
 
        if (!ret && !count) {
                printk(KERN_CONT ".. no entries found ..");
@@ -223,8 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
        int ret;
 
        /* start the tracing */
-       tr->ctrl = 1;
-       trace->init(tr);
+       ret = trace->init(tr);
+       if (ret) {
+               warn_failed_init_tracer(trace, ret);
+               return ret;
+       }
+
        /* reset the max latency */
        tracing_max_latency = 0;
        /* disable interrupts for a bit */
@@ -232,13 +250,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
        udelay(100);
        local_irq_enable();
        /* stop the tracing. */
-       tr->ctrl = 0;
-       trace->ctrl_update(tr);
+       tracing_stop();
        /* check both trace buffers */
        ret = trace_test_buffer(tr, NULL);
        if (!ret)
                ret = trace_test_buffer(&max_tr, &count);
        trace->reset(tr);
+       tracing_start();
 
        if (!ret && !count) {
                printk(KERN_CONT ".. no entries found ..");
@@ -259,9 +277,26 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
        unsigned long count;
        int ret;
 
+       /*
+        * Now that the big kernel lock is no longer preemptable,
+        * and this is called with the BKL held, it will always
+        * fail. If preemption is already disabled, simply
+        * pass the test. When the BKL is removed, or becomes
+        * preemptible again, we will once again test this,
+        * so keep it in.
+        */
+       if (preempt_count()) {
+               printk(KERN_CONT "can not test ... force ");
+               return 0;
+       }
+
        /* start the tracing */
-       tr->ctrl = 1;
-       trace->init(tr);
+       ret = trace->init(tr);
+       if (ret) {
+               warn_failed_init_tracer(trace, ret);
+               return ret;
+       }
+
        /* reset the max latency */
        tracing_max_latency = 0;
        /* disable preemption for a bit */
@@ -269,13 +304,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
        udelay(100);
        preempt_enable();
        /* stop the tracing. */
-       tr->ctrl = 0;
-       trace->ctrl_update(tr);
+       tracing_stop();
        /* check both trace buffers */
        ret = trace_test_buffer(tr, NULL);
        if (!ret)
                ret = trace_test_buffer(&max_tr, &count);
        trace->reset(tr);
+       tracing_start();
 
        if (!ret && !count) {
                printk(KERN_CONT ".. no entries found ..");
@@ -296,9 +331,25 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
        unsigned long count;
        int ret;
 
+       /*
+        * Now that the big kernel lock is no longer preemptable,
+        * and this is called with the BKL held, it will always
+        * fail. If preemption is already disabled, simply
+        * pass the test. When the BKL is removed, or becomes
+        * preemptible again, we will once again test this,
+        * so keep it in.
+        */
+       if (preempt_count()) {
+               printk(KERN_CONT "can not test ... force ");
+               return 0;
+       }
+
        /* start the tracing */
-       tr->ctrl = 1;
-       trace->init(tr);
+       ret = trace->init(tr);
+       if (ret) {
+               warn_failed_init_tracer(trace, ret);
+               goto out;
+       }
 
        /* reset the max latency */
        tracing_max_latency = 0;
@@ -312,27 +363,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
        local_irq_enable();
 
        /* stop the tracing. */
-       tr->ctrl = 0;
-       trace->ctrl_update(tr);
+       tracing_stop();
        /* check both trace buffers */
        ret = trace_test_buffer(tr, NULL);
-       if (ret)
+       if (ret) {
+               tracing_start();
                goto out;
+       }
 
        ret = trace_test_buffer(&max_tr, &count);
-       if (ret)
+       if (ret) {
+               tracing_start();
                goto out;
+       }
 
        if (!ret && !count) {
                printk(KERN_CONT ".. no entries found ..");
                ret = -1;
+               tracing_start();
                goto out;
        }
 
        /* do the test by disabling interrupts first this time */
        tracing_max_latency = 0;
-       tr->ctrl = 1;
-       trace->ctrl_update(tr);
+       tracing_start();
        preempt_disable();
        local_irq_disable();
        udelay(100);
@@ -341,8 +395,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
        local_irq_enable();
 
        /* stop the tracing. */
-       tr->ctrl = 0;
-       trace->ctrl_update(tr);
+       tracing_stop();
        /* check both trace buffers */
        ret = trace_test_buffer(tr, NULL);
        if (ret)
@@ -358,6 +411,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 
  out:
        trace->reset(tr);
+       tracing_start();
        tracing_max_latency = save_max;
 
        return ret;
@@ -423,8 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
        wait_for_completion(&isrt);
 
        /* start the tracing */
-       tr->ctrl = 1;
-       trace->init(tr);
+       ret = trace->init(tr);
+       if (ret) {
+               warn_failed_init_tracer(trace, ret);
+               return ret;
+       }
+
        /* reset the max latency */
        tracing_max_latency = 0;
 
@@ -448,8 +506,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
        msleep(100);
 
        /* stop the tracing. */
-       tr->ctrl = 0;
-       trace->ctrl_update(tr);
+       tracing_stop();
        /* check both trace buffers */
        ret = trace_test_buffer(tr, NULL);
        if (!ret)
@@ -457,6 +514,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 
 
        trace->reset(tr);
+       tracing_start();
 
        tracing_max_latency = save_max;
 
@@ -480,16 +538,20 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
        int ret;
 
        /* start the tracing */
-       tr->ctrl = 1;
-       trace->init(tr);
+       ret = trace->init(tr);
+       if (ret) {
+               warn_failed_init_tracer(trace, ret);
+               return ret;
+       }
+
        /* Sleep for a 1/10 of a second */
        msleep(100);
        /* stop the tracing. */
-       tr->ctrl = 0;
-       trace->ctrl_update(tr);
+       tracing_stop();
        /* check the trace buffer */
        ret = trace_test_buffer(tr, &count);
        trace->reset(tr);
+       tracing_start();
 
        if (!ret && !count) {
                printk(KERN_CONT ".. no entries found ..");
@@ -508,17 +570,48 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
        int ret;
 
        /* start the tracing */
-       tr->ctrl = 1;
-       trace->init(tr);
+       ret = trace->init(tr);
+       if (ret) {
+               warn_failed_init_tracer(trace, ret);
+               return 0;
+       }
+
        /* Sleep for a 1/10 of a second */
        msleep(100);
        /* stop the tracing. */
-       tr->ctrl = 0;
-       trace->ctrl_update(tr);
+       tracing_stop();
        /* check the trace buffer */
        ret = trace_test_buffer(tr, &count);
        trace->reset(tr);
+       tracing_start();
 
        return ret;
 }
 #endif /* CONFIG_SYSPROF_TRACER */
+
+#ifdef CONFIG_BRANCH_TRACER
+int
+trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
+{
+       unsigned long count;
+       int ret;
+
+       /* start the tracing */
+       ret = trace->init(tr);
+       if (ret) {
+               warn_failed_init_tracer(trace, ret);
+               return ret;
+       }
+
+       /* Sleep for a 1/10 of a second */
+       msleep(100);
+       /* stop the tracing. */
+       tracing_stop();
+       /* check the trace buffer */
+       ret = trace_test_buffer(tr, &count);
+       trace->reset(tr);
+       tracing_start();
+
+       return ret;
+}
+#endif /* CONFIG_BRANCH_TRACER */
index 3bdb44bde4b7e5fa4af698c467f0c64b704e7699..0b863f2cbc8e3ff04c9ea8347f4ffb6795dab446 100644 (file)
@@ -48,7 +48,7 @@ static inline void check_stack(void)
        if (!object_is_on_stack(&this_size))
                return;
 
-       raw_local_irq_save(flags);
+       local_irq_save(flags);
        __raw_spin_lock(&max_stack_lock);
 
        /* a race could have already updated it */
@@ -78,6 +78,7 @@ static inline void check_stack(void)
         * on a new max, so it is far from a fast path.
         */
        while (i < max_stack_trace.nr_entries) {
+               int found = 0;
 
                stack_dump_index[i] = this_size;
                p = start;
@@ -86,17 +87,19 @@ static inline void check_stack(void)
                        if (*p == stack_dump_trace[i]) {
                                this_size = stack_dump_index[i++] =
                                        (top - p) * sizeof(unsigned long);
+                               found = 1;
                                /* Start the search from here */
                                start = p + 1;
                        }
                }
 
-               i++;
+               if (!found)
+                       i++;
        }
 
  out:
        __raw_spin_unlock(&max_stack_lock);
-       raw_local_irq_restore(flags);
+       local_irq_restore(flags);
 }
 
 static void
@@ -107,8 +110,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
        if (unlikely(!ftrace_enabled || stack_trace_disabled))
                return;
 
-       resched = need_resched();
-       preempt_disable_notrace();
+       resched = ftrace_preempt_disable();
 
        cpu = raw_smp_processor_id();
        /* no atomic needed, we only modify this variable by this cpu */
@@ -120,10 +122,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
  out:
        per_cpu(trace_active, cpu)--;
        /* prevent recursion in schedule */
-       if (resched)
-               preempt_enable_no_resched_notrace();
-       else
-               preempt_enable_notrace();
+       ftrace_preempt_enable(resched);
 }
 
 static struct ftrace_ops trace_ops __read_mostly =
@@ -166,11 +165,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
        if (ret < 0)
                return ret;
 
-       raw_local_irq_save(flags);
+       local_irq_save(flags);
        __raw_spin_lock(&max_stack_lock);
        *ptr = val;
        __raw_spin_unlock(&max_stack_lock);
-       raw_local_irq_restore(flags);
+       local_irq_restore(flags);
 
        return count;
 }
index 9587d3bcba556761de49854c95676a03a6dcecbf..54960edb96d077d2c09a4356f7976ae8d0216781 100644 (file)
@@ -261,27 +261,17 @@ static void stop_stack_trace(struct trace_array *tr)
        mutex_unlock(&sample_timer_lock);
 }
 
-static void stack_trace_init(struct trace_array *tr)
+static int stack_trace_init(struct trace_array *tr)
 {
        sysprof_trace = tr;
 
-       if (tr->ctrl)
-               start_stack_trace(tr);
+       start_stack_trace(tr);
+       return 0;
 }
 
 static void stack_trace_reset(struct trace_array *tr)
 {
-       if (tr->ctrl)
-               stop_stack_trace(tr);
-}
-
-static void stack_trace_ctrl_update(struct trace_array *tr)
-{
-       /* When starting a new trace, reset the buffers */
-       if (tr->ctrl)
-               start_stack_trace(tr);
-       else
-               stop_stack_trace(tr);
+       stop_stack_trace(tr);
 }
 
 static struct tracer stack_trace __read_mostly =
@@ -289,7 +279,6 @@ static struct tracer stack_trace __read_mostly =
        .name           = "sysprof",
        .init           = stack_trace_init,
        .reset          = stack_trace_reset,
-       .ctrl_update    = stack_trace_ctrl_update,
 #ifdef CONFIG_FTRACE_SELFTEST
        .selftest    = trace_selftest_startup_sysprof,
 #endif
index af8c85664882c992d962c0ff8809f889ccadd65d..79602740bbb5f396278dbe1665eada4c6f0259f0 100644 (file)
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex);
  */
 #define TRACEPOINT_HASH_BITS 6
 #define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
+static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
 
 /*
  * Note about RCU :
@@ -54,40 +55,43 @@ struct tracepoint_entry {
        struct hlist_node hlist;
        void **funcs;
        int refcount;   /* Number of times armed. 0 if disarmed. */
-       struct rcu_head rcu;
-       void *oldptr;
-       unsigned char rcu_pending:1;
        char name[0];
 };
 
-static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
+struct tp_probes {
+       union {
+               struct rcu_head rcu;
+               struct list_head list;
+       } u;
+       void *probes[0];
+};
 
-static void free_old_closure(struct rcu_head *head)
+static inline void *allocate_probes(int count)
 {
-       struct tracepoint_entry *entry = container_of(head,
-               struct tracepoint_entry, rcu);
-       kfree(entry->oldptr);
-       /* Make sure we free the data before setting the pending flag to 0 */
-       smp_wmb();
-       entry->rcu_pending = 0;
+       struct tp_probes *p  = kmalloc(count * sizeof(void *)
+                       + sizeof(struct tp_probes), GFP_KERNEL);
+       return p == NULL ? NULL : p->probes;
 }
 
-static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old)
+static void rcu_free_old_probes(struct rcu_head *head)
 {
-       if (!old)
-               return;
-       entry->oldptr = old;
-       entry->rcu_pending = 1;
-       /* write rcu_pending before calling the RCU callback */
-       smp_wmb();
-       call_rcu_sched(&entry->rcu, free_old_closure);
+       kfree(container_of(head, struct tp_probes, u.rcu));
+}
+
+static inline void release_probes(void *old)
+{
+       if (old) {
+               struct tp_probes *tp_probes = container_of(old,
+                       struct tp_probes, probes[0]);
+               call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
+       }
 }
 
 static void debug_print_probes(struct tracepoint_entry *entry)
 {
        int i;
 
-       if (!tracepoint_debug)
+       if (!tracepoint_debug || !entry->funcs)
                return;
 
        for (i = 0; entry->funcs[i]; i++)
@@ -111,12 +115,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
                                return ERR_PTR(-EEXIST);
        }
        /* + 2 : one for new probe, one for NULL func */
-       new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL);
+       new = allocate_probes(nr_probes + 2);
        if (new == NULL)
                return ERR_PTR(-ENOMEM);
        if (old)
                memcpy(new, old, nr_probes * sizeof(void *));
        new[nr_probes] = probe;
+       new[nr_probes + 1] = NULL;
        entry->refcount = nr_probes + 1;
        entry->funcs = new;
        debug_print_probes(entry);
@@ -132,7 +137,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
        old = entry->funcs;
 
        if (!old)
-               return NULL;
+               return ERR_PTR(-ENOENT);
 
        debug_print_probes(entry);
        /* (N -> M), (N > 1, M >= 0) probes */
@@ -151,13 +156,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
                int j = 0;
                /* N -> M, (N > 1, M > 0) */
                /* + 1 for NULL */
-               new = kzalloc((nr_probes - nr_del + 1)
-                       * sizeof(void *), GFP_KERNEL);
+               new = allocate_probes(nr_probes - nr_del + 1);
                if (new == NULL)
                        return ERR_PTR(-ENOMEM);
                for (i = 0; old[i]; i++)
                        if ((probe && old[i] != probe))
                                new[j++] = old[i];
+               new[nr_probes - nr_del] = NULL;
                entry->refcount = nr_probes - nr_del;
                entry->funcs = new;
        }
@@ -215,7 +220,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
        memcpy(&e->name[0], name, name_len);
        e->funcs = NULL;
        e->refcount = 0;
-       e->rcu_pending = 0;
        hlist_add_head(&e->hlist, head);
        return e;
 }
@@ -224,32 +228,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
  * Remove the tracepoint from the tracepoint hash table. Must be called with
  * mutex_lock held.
  */
-static int remove_tracepoint(const char *name)
+static inline void remove_tracepoint(struct tracepoint_entry *e)
 {
-       struct hlist_head *head;
-       struct hlist_node *node;
-       struct tracepoint_entry *e;
-       int found = 0;
-       size_t len = strlen(name) + 1;
-       u32 hash = jhash(name, len-1, 0);
-
-       head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
-       hlist_for_each_entry(e, node, head, hlist) {
-               if (!strcmp(name, e->name)) {
-                       found = 1;
-                       break;
-               }
-       }
-       if (!found)
-               return -ENOENT;
-       if (e->refcount)
-               return -EBUSY;
        hlist_del(&e->hlist);
-       /* Make sure the call_rcu_sched has been executed */
-       if (e->rcu_pending)
-               rcu_barrier_sched();
        kfree(e);
-       return 0;
 }
 
 /*
@@ -280,6 +262,7 @@ static void set_tracepoint(struct tracepoint_entry **entry,
 static void disable_tracepoint(struct tracepoint *elem)
 {
        elem->state = 0;
+       rcu_assign_pointer(elem->funcs, NULL);
 }
 
 /**
@@ -320,6 +303,23 @@ static void tracepoint_update_probes(void)
        module_update_tracepoints();
 }
 
+static void *tracepoint_add_probe(const char *name, void *probe)
+{
+       struct tracepoint_entry *entry;
+       void *old;
+
+       entry = get_tracepoint(name);
+       if (!entry) {
+               entry = add_tracepoint(name);
+               if (IS_ERR(entry))
+                       return entry;
+       }
+       old = tracepoint_entry_add_probe(entry, probe);
+       if (IS_ERR(old) && !entry->refcount)
+               remove_tracepoint(entry);
+       return old;
+}
+
 /**
  * tracepoint_probe_register -  Connect a probe to a tracepoint
  * @name: tracepoint name
@@ -330,44 +330,36 @@ static void tracepoint_update_probes(void)
  */
 int tracepoint_probe_register(const char *name, void *probe)
 {
-       struct tracepoint_entry *entry;
-       int ret = 0;
        void *old;
 
        mutex_lock(&tracepoints_mutex);
-       entry = get_tracepoint(name);
-       if (!entry) {
-               entry = add_tracepoint(name);
-               if (IS_ERR(entry)) {
-                       ret = PTR_ERR(entry);
-                       goto end;
-               }
-       }
-       /*
-        * If we detect that a call_rcu_sched is pending for this tracepoint,
-        * make sure it's executed now.
-        */
-       if (entry->rcu_pending)
-               rcu_barrier_sched();
-       old = tracepoint_entry_add_probe(entry, probe);
-       if (IS_ERR(old)) {
-               ret = PTR_ERR(old);
-               goto end;
-       }
+       old = tracepoint_add_probe(name, probe);
        mutex_unlock(&tracepoints_mutex);
+       if (IS_ERR(old))
+               return PTR_ERR(old);
+
        tracepoint_update_probes();             /* may update entry */
-       mutex_lock(&tracepoints_mutex);
-       entry = get_tracepoint(name);
-       WARN_ON(!entry);
-       if (entry->rcu_pending)
-               rcu_barrier_sched();
-       tracepoint_entry_free_old(entry, old);
-end:
-       mutex_unlock(&tracepoints_mutex);
-       return ret;
+       release_probes(old);
+       return 0;
 }
 EXPORT_SYMBOL_GPL(tracepoint_probe_register);
 
+static void *tracepoint_remove_probe(const char *name, void *probe)
+{
+       struct tracepoint_entry *entry;
+       void *old;
+
+       entry = get_tracepoint(name);
+       if (!entry)
+               return ERR_PTR(-ENOENT);
+       old = tracepoint_entry_remove_probe(entry, probe);
+       if (IS_ERR(old))
+               return old;
+       if (!entry->refcount)
+               remove_tracepoint(entry);
+       return old;
+}
+
 /**
  * tracepoint_probe_unregister -  Disconnect a probe from a tracepoint
  * @name: tracepoint name
@@ -380,38 +372,104 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
  */
 int tracepoint_probe_unregister(const char *name, void *probe)
 {
-       struct tracepoint_entry *entry;
        void *old;
-       int ret = -ENOENT;
 
        mutex_lock(&tracepoints_mutex);
-       entry = get_tracepoint(name);
-       if (!entry)
-               goto end;
-       if (entry->rcu_pending)
-               rcu_barrier_sched();
-       old = tracepoint_entry_remove_probe(entry, probe);
-       if (!old) {
-               printk(KERN_WARNING "Warning: Trying to unregister a probe"
-                                   "that doesn't exist\n");
-               goto end;
-       }
+       old = tracepoint_remove_probe(name, probe);
        mutex_unlock(&tracepoints_mutex);
+       if (IS_ERR(old))
+               return PTR_ERR(old);
+
        tracepoint_update_probes();             /* may update entry */
+       release_probes(old);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
+
+static LIST_HEAD(old_probes);
+static int need_update;
+
+static void tracepoint_add_old_probes(void *old)
+{
+       need_update = 1;
+       if (old) {
+               struct tp_probes *tp_probes = container_of(old,
+                       struct tp_probes, probes[0]);
+               list_add(&tp_probes->u.list, &old_probes);
+       }
+}
+
+/**
+ * tracepoint_probe_register_noupdate -  register a probe but not connect
+ * @name: tracepoint name
+ * @probe: probe handler
+ *
+ * caller must call tracepoint_probe_update_all()
+ */
+int tracepoint_probe_register_noupdate(const char *name, void *probe)
+{
+       void *old;
+
        mutex_lock(&tracepoints_mutex);
-       entry = get_tracepoint(name);
-       if (!entry)
-               goto end;
-       if (entry->rcu_pending)
-               rcu_barrier_sched();
-       tracepoint_entry_free_old(entry, old);
-       remove_tracepoint(name);        /* Ignore busy error message */
-       ret = 0;
-end:
+       old = tracepoint_add_probe(name, probe);
+       if (IS_ERR(old)) {
+               mutex_unlock(&tracepoints_mutex);
+               return PTR_ERR(old);
+       }
+       tracepoint_add_old_probes(old);
        mutex_unlock(&tracepoints_mutex);
-       return ret;
+       return 0;
 }
-EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
+EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
+
+/**
+ * tracepoint_probe_unregister_noupdate -  remove a probe but not disconnect
+ * @name: tracepoint name
+ * @probe: probe function pointer
+ *
+ * caller must call tracepoint_probe_update_all()
+ */
+int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
+{
+       void *old;
+
+       mutex_lock(&tracepoints_mutex);
+       old = tracepoint_remove_probe(name, probe);
+       if (IS_ERR(old)) {
+               mutex_unlock(&tracepoints_mutex);
+               return PTR_ERR(old);
+       }
+       tracepoint_add_old_probes(old);
+       mutex_unlock(&tracepoints_mutex);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
+
+/**
+ * tracepoint_probe_update_all -  update tracepoints
+ */
+void tracepoint_probe_update_all(void)
+{
+       LIST_HEAD(release_probes);
+       struct tp_probes *pos, *next;
+
+       mutex_lock(&tracepoints_mutex);
+       if (!need_update) {
+               mutex_unlock(&tracepoints_mutex);
+               return;
+       }
+       if (!list_empty(&old_probes))
+               list_replace_init(&old_probes, &release_probes);
+       need_update = 0;
+       mutex_unlock(&tracepoints_mutex);
+
+       tracepoint_update_probes();
+       list_for_each_entry_safe(pos, next, &release_probes, u.list) {
+               list_del(&pos->u.list);
+               call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
+       }
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
 
 /**
  * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
@@ -483,3 +541,36 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
        iter->tracepoint = NULL;
 }
 EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
+
+#ifdef CONFIG_MODULES
+
+int tracepoint_module_notify(struct notifier_block *self,
+                            unsigned long val, void *data)
+{
+       struct module *mod = data;
+
+       switch (val) {
+       case MODULE_STATE_COMING:
+               tracepoint_update_probe_range(mod->tracepoints,
+                       mod->tracepoints + mod->num_tracepoints);
+               break;
+       case MODULE_STATE_GOING:
+               tracepoint_update_probe_range(mod->tracepoints,
+                       mod->tracepoints + mod->num_tracepoints);
+               break;
+       }
+       return 0;
+}
+
+struct notifier_block tracepoint_module_nb = {
+       .notifier_call = tracepoint_module_notify,
+       .priority = 0,
+};
+
+static int init_tracepoints(void)
+{
+       return register_module_notifier(&tracepoint_module_nb);
+}
+__initcall(init_tracepoints);
+
+#endif /* CONFIG_MODULES */
index 39d6159fae430cf60811839f4e2dbd20aadb9e4a..cec2224bc9f5249293c4b2a15a2ad27416c45735 100644 (file)
@@ -101,6 +101,8 @@ static int sched_create_user(struct user_struct *up)
        if (IS_ERR(up->tg))
                rc = -ENOMEM;
 
+       set_tg_uid(up);
+
        return rc;
 }
 
index 85cf7ea978aafe6606e6e44cc634a82e6cf29477..7823f8342abf1c8f403a251b56d428dc9bf703a5 100644 (file)
@@ -157,4 +157,11 @@ config CHECK_SIGNATURE
 config HAVE_LMB
        boolean
 
+config CPUMASK_OFFSTACK
+       bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
+       help
+         Use dynamic allocation for cpumask_var_t, instead of putting
+         them on the stack.  This is a bit more expensive, but avoids
+         stack overflow.
+
 endmenu
index b0f239e443bc0fbb11a27ee98dbaf4e641d21971..1e3fd3e3436abf6e142c9f0b46cc6993cd19ec38 100644 (file)
@@ -545,6 +545,16 @@ config DEBUG_SG
 
          If unsure, say N.
 
+config DEBUG_NOTIFIERS
+       bool "Debug notifier call chains"
+       depends on DEBUG_KERNEL
+       help
+         Enable this to turn on sanity checking for notifier call chains.
+         This is most useful for kernel developers to make sure that
+         modules properly unregister themselves from notifier chains.
+         This is a relatively cheap check but if you care about maximum
+         performance, say N.
+
 config FRAME_POINTER
        bool "Compile the kernel with frame pointers"
        depends on DEBUG_KERNEL && \
index 06722c4030584382478d7c447ad35f96fe856dd6..bf0cf7c8387b8d92c93dfb18436a7f2cf424cb00 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/hash.h>
 #include <linux/highmem.h>
 #include <linux/blktrace_api.h>
+#include <trace/block.h>
 #include <asm/tlbflush.h>
 
 #define POOL_SIZE      64
@@ -21,6 +22,8 @@
 
 static mempool_t *page_pool, *isa_page_pool;
 
+DEFINE_TRACE(block_bio_bounce);
+
 #ifdef CONFIG_HIGHMEM
 static __init int init_emergency_pool(void)
 {
@@ -222,7 +225,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
        if (!bio)
                return;
 
-       blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+       trace_block_bio_bounce(q, *bio_orig);
 
        /*
         * at least one page was bounced, fill in possible non-highmem
index 164951c473058a25c081d5e47260d872068cdbb7..fc031d68327e5fad33130b15d50a020b7b9b31a8 100644 (file)
@@ -3049,3 +3049,18 @@ void print_vma_addr(char *prefix, unsigned long ip)
        }
        up_read(&current->mm->mmap_sem);
 }
+
+#ifdef CONFIG_PROVE_LOCKING
+void might_fault(void)
+{
+       might_sleep();
+       /*
+        * it would be nicer only to annotate paths which are not under
+        * pagefault_disable, however that requires a larger audit and
+        * providing helpers like get_user_atomic.
+        */
+       if (!in_atomic() && current->mm)
+               might_lock_read(&current->mm->mmap_sem);
+}
+EXPORT_SYMBOL(might_fault);
+#endif
index a2cd47d89e0aa1f159d8e9e6ed2dcee32969068f..8e516e29f98920f65e8da441e5d2cc51e4cf4f97 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3626,7 +3626,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
                                len < PAGE_SIZE - 60) {
                        len += sprintf(buf + len, " cpus=");
                        len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
-                                       l->cpus);
+                                       &l->cpus);
                }
 
                if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
index 0216b55bd64075a462d3d32c6abc77a5cdadca4c..01724e04c556339762a272ecff36ec4aa23145d4 100644 (file)
@@ -4,10 +4,10 @@
 #include <linux/proc_fs.h>     /* for struct inode and struct file */
 #include <linux/tracepoint.h>
 
-DEFINE_TRACE(subsys_event,
+DECLARE_TRACE(subsys_event,
        TPPROTO(struct inode *inode, struct file *file),
        TPARGS(inode, file));
-DEFINE_TRACE(subsys_eventb,
+DECLARE_TRACE(subsys_eventb,
        TPPROTO(void),
        TPARGS());
 #endif
index 55abfdda4bd4eeedcf06b034dc7f09ac0688f26e..e3a964889dc7951565b3f45497e0e123762ebab5 100644 (file)
@@ -46,6 +46,7 @@ void __exit tp_sample_trace_exit(void)
 {
        unregister_trace_subsys_eventb(probe_subsys_eventb);
        unregister_trace_subsys_event(probe_subsys_event);
+       tracepoint_synchronize_unregister();
 }
 
 module_exit(tp_sample_trace_exit);
index 5e9fcf4afffeca97bd30befd00f881bc3b28064f..685a5acb456275dd5459891caaa9f29abbdfa2d2 100644 (file)
@@ -33,6 +33,7 @@ module_init(tp_sample_trace_init);
 void __exit tp_sample_trace_exit(void)
 {
        unregister_trace_subsys_event(probe_subsys_event);
+       tracepoint_synchronize_unregister();
 }
 
 module_exit(tp_sample_trace_exit);
index 4ae4b7fcc04327766145a0483d00720dadc2928a..00d169792a3e14847d19e2abef38e4a7c243645b 100644 (file)
@@ -13,6 +13,9 @@
 #include <linux/proc_fs.h>
 #include "tp-samples-trace.h"
 
+DEFINE_TRACE(subsys_event);
+DEFINE_TRACE(subsys_eventb);
+
 struct proc_dir_entry *pentry_example;
 
 static int my_open(struct inode *inode, struct file *file)
index 468fbc9016c7b0773db9073a28f655bdf31e372d..7a176773af85a9fee23db5ff6e23edb783c74009 100644 (file)
@@ -198,16 +198,10 @@ cmd_modversions =                                                 \
        fi;
 endif
 
-ifdef CONFIG_64BIT
-arch_bits = 64
-else
-arch_bits = 32
-endif
-
 ifdef CONFIG_FTRACE_MCOUNT_RECORD
-cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl \
-       "$(ARCH)" "$(arch_bits)" "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" \
-       "$(NM)" "$(RM)" "$(MV)" "$(@)";
+cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
+       "$(if $(CONFIG_64BIT),64,32)" \
+       "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" "$(@)";
 endif
 
 define rule_cc_o_c
index d2c61efc216f4bd413dd271faa3aa62c736c6203..f0af9aa9b243bcb1892ede765a44b2b225921046 100644 (file)
@@ -78,11 +78,13 @@ while (<>) {
 }
 
 if ($count == 0) {
-       print "No data found in the dmesg. Make sure that 'printk.time=1' and\n";
-       print "'initcall_debug' are passed on the kernel command line.\n\n";
-       print "Usage: \n";
-       print "      dmesg | perl scripts/bootgraph.pl > output.svg\n\n";
-       exit;
+    print STDERR <<END;
+No data found in the dmesg. Make sure that 'printk.time=1' and
+'initcall_debug' are passed on the kernel command line.
+Usage:
+      dmesg | perl scripts/bootgraph.pl > output.svg
+END
+    exit 1;
 }
 
 print "<?xml version=\"1.0\" standalone=\"no\"?> \n";
@@ -109,8 +111,8 @@ my $stylecounter = 0;
 my %rows;
 my $rowscount = 1;
 my @initcalls = sort { $start{$a} <=> $start{$b} } keys(%start);
-my $key;
-foreach $key (@initcalls) {
+
+foreach my $key (@initcalls) {
        my $duration = $end{$key} - $start{$key};
 
        if ($duration >= $threshold) {
index 6b9fe3eb836027bff637b912798b3b024a4baeba..0b1dc9f9bb0682a65d9158f8fb44012ea05464b1 100755 (executable)
@@ -112,6 +112,8 @@ my ($arch, $bits, $objdump, $objcopy, $cc,
 # Acceptable sections to record.
 my %text_sections = (
      ".text" => 1,
+     ".sched.text" => 1,
+     ".spinlock.text" => 1,
 );
 
 $objdump = "objdump" if ((length $objdump) == 0);
@@ -130,10 +132,13 @@ my %weak;         # List of weak functions
 my %convert;           # List of local functions used that needs conversion
 
 my $type;
+my $nm_regex;          # Find the local functions (return function)
 my $section_regex;     # Find the start of a section
 my $function_regex;    # Find the name of a function
                        #    (return offset and func name)
 my $mcount_regex;      # Find the call site to mcount (return offset)
+my $alignment;         # The .align value to use for $mcount_section
+my $section_type;      # Section header plus possible alignment command
 
 if ($arch eq "x86") {
     if ($bits == 64) {
@@ -143,11 +148,21 @@ if ($arch eq "x86") {
     }
 }
 
+#
+# We base the defaults off of i386, the other archs may
+# feel free to change them in the below if statements.
+#
+$nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)";
+$section_regex = "Disassembly of section\\s+(\\S+):";
+$function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:";
+$mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$";
+$section_type = '@progbits';
+$type = ".long";
+
 if ($arch eq "x86_64") {
-    $section_regex = "Disassembly of section\\s+(\\S+):";
-    $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:";
     $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$";
     $type = ".quad";
+    $alignment = 8;
 
     # force flags for this arch
     $ld .= " -m elf_x86_64";
@@ -156,10 +171,7 @@ if ($arch eq "x86_64") {
     $cc .= " -m64";
 
 } elsif ($arch eq "i386") {
-    $section_regex = "Disassembly of section\\s+(\\S+):";
-    $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:";
-    $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$";
-    $type = ".long";
+    $alignment = 4;
 
     # force flags for this arch
     $ld .= " -m elf_i386";
@@ -167,6 +179,27 @@ if ($arch eq "x86_64") {
     $objcopy .= " -O elf32-i386";
     $cc .= " -m32";
 
+} elsif ($arch eq "sh") {
+    $alignment = 2;
+
+    # force flags for this arch
+    $ld .= " -m shlelf_linux";
+    $objcopy .= " -O elf32-sh-linux";
+    $cc .= " -m32";
+
+} elsif ($arch eq "powerpc") {
+    $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\.?\\S+)";
+    $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?.*?)>:";
+    $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s\\.?_mcount\$";
+
+    if ($bits == 64) {
+       $type = ".quad";
+    }
+
+} elsif ($arch eq "arm") {
+    $alignment = 2;
+    $section_type = '%progbits';
+
 } else {
     die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD";
 }
@@ -236,7 +269,7 @@ if (!$found_version) {
 #
 open (IN, "$nm $inputfile|") || die "error running $nm";
 while (<IN>) {
-    if (/^[0-9a-fA-F]+\s+t\s+(\S+)/) {
+    if (/$nm_regex/) {
        $locals{$1} = 1;
     } elsif (/^[0-9a-fA-F]+\s+([wW])\s+(\S+)/) {
        $weak{$2} = $1;
@@ -287,7 +320,8 @@ sub update_funcs
        if (!$opened) {
            open(FILE, ">$mcount_s") || die "can't create $mcount_s\n";
            $opened = 1;
-           print FILE "\t.section $mcount_section,\"a\",\@progbits\n";
+           print FILE "\t.section $mcount_section,\"a\",$section_type\n";
+           print FILE "\t.align $alignment\n" if (defined($alignment));
        }
        printf FILE "\t%s %s + %d\n", $type, $ref_func, $offsets[$i] - $offset;
     }
diff --git a/scripts/trace/power.pl b/scripts/trace/power.pl
new file mode 100644 (file)
index 0000000..4f729b3
--- /dev/null
@@ -0,0 +1,108 @@
+#!/usr/bin/perl
+
+# Copyright 2008, Intel Corporation
+#
+# This file is part of the Linux kernel
+#
+# This program file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program in a file named COPYING; if not, write to the
+# Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor,
+# Boston, MA 02110-1301 USA
+#
+# Authors:
+#      Arjan van de Ven <arjan@linux.intel.com>
+
+
+#
+# This script turns a cstate ftrace output into a SVG graphic that shows
+# historic C-state information
+#
+#
+#      cat /sys/kernel/debug/tracing/trace | perl power.pl > out.svg
+#
+
+my @styles;
+my $base = 0;
+
+my @pstate_last;
+my @pstate_level;
+
+$styles[0] = "fill:rgb(0,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[1] = "fill:rgb(0,255,0);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[2] = "fill:rgb(255,0,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[3] = "fill:rgb(255,255,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[4] = "fill:rgb(255,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[5] = "fill:rgb(0,255,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[6] = "fill:rgb(0,128,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[7] = "fill:rgb(0,255,128);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[8] = "fill:rgb(0,25,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+
+
+print "<?xml version=\"1.0\" standalone=\"no\"?> \n";
+print "<svg width=\"10000\" height=\"100%\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\">\n";
+
+my $scale = 30000.0;
+while (<>) {
+       my $line = $_;
+       if ($line =~ /([0-9\.]+)\] CSTATE: Going to C([0-9]) on cpu ([0-9]+) for ([0-9\.]+)/) {
+               if ($base == 0) {
+                       $base = $1;
+               }
+               my $time = $1 - $base;
+               $time = $time * $scale;
+               my $C = $2;
+               my $cpu = $3;
+               my $y = 400 * $cpu;
+               my $duration = $4 * $scale;
+               my $msec = int($4 * 100000)/100.0;
+               my $height = $C * 20;
+               $style = $styles[$C];
+
+               $y = $y + 140 - $height;
+
+               $x2 = $time + 4;
+               $y2 = $y + 4;
+
+
+               print "<rect x=\"$time\" width=\"$duration\" y=\"$y\" height=\"$height\" style=\"$style\"/>\n";
+               print "<text transform=\"translate($x2,$y2) rotate(90)\">C$C $msec</text>\n";
+       }
+       if ($line =~ /([0-9\.]+)\] PSTATE: Going to P([0-9]) on cpu ([0-9]+)/) {
+               my $time = $1 - $base;
+               my $state = $2;
+               my $cpu = $3;
+
+               if (defined($pstate_last[$cpu])) {
+                       my $from = $pstate_last[$cpu];
+                       my $oldstate = $pstate_state[$cpu];
+                       my $duration = ($time-$from) * $scale;
+
+                       $from = $from * $scale;
+                       my $to = $from + $duration;
+                       my $height = 140 - ($oldstate * (140/8));
+
+                       my $y = 400 * $cpu + 200 + $height;
+                       my $y2 = $y+4;
+                       my $style = $styles[8];
+
+                       print "<rect x=\"$from\" y=\"$y\" width=\"$duration\" height=\"5\" style=\"$style\"/>\n";
+                       print "<text transform=\"translate($from,$y2)\">P$oldstate (cpu $cpu)</text>\n";
+               };
+
+               $pstate_last[$cpu] = $time;
+               $pstate_state[$cpu] = $state;
+       }
+}
+
+
+print "</svg>\n";
diff --git a/scripts/tracing/draw_functrace.py b/scripts/tracing/draw_functrace.py
new file mode 100644 (file)
index 0000000..902f9a9
--- /dev/null
@@ -0,0 +1,130 @@
+#!/usr/bin/python
+
+"""
+Copyright 2008 (c) Frederic Weisbecker <fweisbec@gmail.com>
+Licensed under the terms of the GNU GPL License version 2
+
+This script parses a trace provided by the function tracer in
+kernel/trace/trace_functions.c
+The resulted trace is processed into a tree to produce a more human
+view of the call stack by drawing textual but hierarchical tree of
+calls. Only the functions's names and the the call time are provided.
+
+Usage:
+       Be sure that you have CONFIG_FUNCTION_TRACER
+       # mkdir /debugfs
+       # mount -t debug debug /debug
+       # echo function > /debug/tracing/current_tracer
+       $ cat /debug/tracing/trace_pipe > ~/raw_trace_func
+       Wait some times but not too much, the script is a bit slow.
+       Break the pipe (Ctrl + Z)
+       $ scripts/draw_functrace.py < raw_trace_func > draw_functrace
+       Then you have your drawn trace in draw_functrace
+"""
+
+
+import sys, re
+
+class CallTree:
+       """ This class provides a tree representation of the functions
+               call stack. If a function has no parent in the kernel (interrupt,
+               syscall, kernel thread...) then it is attached to a virtual parent
+               called ROOT.
+       """
+       ROOT = None
+
+       def __init__(self, func, time = None, parent = None):
+               self._func = func
+               self._time = time
+               if parent is None:
+                       self._parent = CallTree.ROOT
+               else:
+                       self._parent = parent
+               self._children = []
+
+       def calls(self, func, calltime):
+               """ If a function calls another one, call this method to insert it
+                       into the tree at the appropriate place.
+                       @return: A reference to the newly created child node.
+               """
+               child = CallTree(func, calltime, self)
+               self._children.append(child)
+               return child
+
+       def getParent(self, func):
+               """ Retrieve the last parent of the current node that
+                       has the name given by func. If this function is not
+                       on a parent, then create it as new child of root
+                       @return: A reference to the parent.
+               """
+               tree = self
+               while tree != CallTree.ROOT and tree._func != func:
+                       tree = tree._parent
+               if tree == CallTree.ROOT:
+                       child = CallTree.ROOT.calls(func, None)
+                       return child
+               return tree
+
+       def __repr__(self):
+               return self.__toString("", True)
+
+       def __toString(self, branch, lastChild):
+               if self._time is not None:
+                       s = "%s----%s (%s)\n" % (branch, self._func, self._time)
+               else:
+                       s = "%s----%s\n" % (branch, self._func)
+
+               i = 0
+               if lastChild:
+                       branch = branch[:-1] + " "
+               while i < len(self._children):
+                       if i != len(self._children) - 1:
+                               s += "%s" % self._children[i].__toString(branch +\
+                                                               "    |", False)
+                       else:
+                               s += "%s" % self._children[i].__toString(branch +\
+                                                               "    |", True)
+                       i += 1
+               return s
+
+class BrokenLineException(Exception):
+       """If the last line is not complete because of the pipe breakage,
+          we want to stop the processing and ignore this line.
+       """
+       pass
+
+class CommentLineException(Exception):
+       """ If the line is a comment (as in the beginning of the trace file),
+           just ignore it.
+       """
+       pass
+
+
+def parseLine(line):
+       line = line.strip()
+       if line.startswith("#"):
+               raise CommentLineException
+       m = re.match("[^]]+?\\] +([0-9.]+): (\\w+) <-(\\w+)", line)
+       if m is None:
+               raise BrokenLineException
+       return (m.group(1), m.group(2), m.group(3))
+
+
+def main():
+       CallTree.ROOT = CallTree("Root (Nowhere)", None, None)
+       tree = CallTree.ROOT
+
+       for line in sys.stdin:
+               try:
+                       calltime, callee, caller = parseLine(line)
+               except BrokenLineException:
+                       break
+               except CommentLineException:
+                       continue
+               tree = tree.getParent(caller)
+               tree = tree.calls(callee, calltime)
+
+       print CallTree.ROOT
+
+if __name__ == "__main__":
+       main()