* 'cpus4096-for-linus-2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (66 commits)
x86: export vector_used_by_percpu_irq
x86: use logical apicid in x2apic_cluster's x2apic_cpu_mask_to_apicid_and()
sched: nominate preferred wakeup cpu, fix
x86: fix lguest used_vectors breakage, -v2
x86: fix warning in arch/x86/kernel/io_apic.c
sched: fix warning in kernel/sched.c
sched: move test_sd_parent() to an SMP section of sched.h
sched: add SD_BALANCE_NEWIDLE at MC and CPU level for sched_mc>0
sched: activate active load balancing in new idle cpus
sched: bias task wakeups to preferred semi-idle packages
sched: nominate preferred wakeup cpu
sched: favour lower logical cpu number for sched_mc balance
sched: framework for sched_mc/smt_power_savings=N
sched: convert BALANCE_FOR_xx_POWER to inline functions
x86: use possible_cpus=NUM to extend the possible cpus allowed
x86: fix cpu_mask_to_apicid_and to include cpu_online_mask
x86: update io_apic.c to the new cpumask code
x86: Introduce topology_core_cpumask()/topology_thread_cpumask()
x86: xen: use smp_call_function_many()
x86: use work_on_cpu in x86/kernel/cpu/mcheck/mce_amd_64.c
...
Fixed up trivial conflict in kernel/time/tick-sched.c manually
#include <asm/tlbflush.h>
#include <asm/ptrace.h>
- /*
- * bitmask of present and online CPUs.
- * The present bitmask indicates that the CPU is physically present.
- * The online bitmask indicates that the CPU is up and running.
- */
- cpumask_t cpu_possible_map;
- EXPORT_SYMBOL(cpu_possible_map);
- cpumask_t cpu_online_map;
- EXPORT_SYMBOL(cpu_online_map);
-
/*
* as from 2.5, kernels no longer have an init_tasks structure
* so we need some other way of telling a new secondary core
/*
* Stop the local timer for this CPU.
*/
- local_timer_stop(cpu);
+ local_timer_stop();
/*
* Flush user cache and TLB mappings, and then remove this CPU
/*
* Setup local timer for this CPU.
*/
- local_timer_setup(cpu);
+ local_timer_setup();
calibrate_delay();
/* Use "raw" primitives so we behave correctly on RT kernels. */
raw_local_irq_save(flags);
+ /*
+ * According to Thomas Gleixner irqs are already disabled here. Simply
+ * removing raw_local_irq_save above (and the matching
+ * raw_local_irq_restore) was not accepted. See
+ * http://thread.gmane.org/gmane.linux.ports.arm.kernel/41174
+ * So for now (2008-11-20) just warn once if irqs were not disabled ...
+ */
+ WARN_ON_ONCE(!raw_irqs_disabled_flags(flags));
+
/* The alarm IRQ uses absolute time (now+delta), not the relative
* time (delta) in our calling convention. Like all clockevents
* using such "match" hardware, we have a race to defend against.
.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
.shift = 32,
.rating = 150,
- .cpumask = CPU_MASK_CPU0,
.set_next_event = clkevt32k_next_event,
.set_mode = clkevt32k_mode,
};
clkevt.mult = div_sc(AT91_SLOW_CLOCK, NSEC_PER_SEC, clkevt.shift);
clkevt.max_delta_ns = clockevent_delta2ns(AT91_ST_ALMV, &clkevt);
clkevt.min_delta_ns = clockevent_delta2ns(2, &clkevt) + 1;
- clkevt.cpumask = cpumask_of_cpu(0);
+ clkevt.cpumask = cpumask_of(0);
clockevents_register_device(&clkevt);
/* register clocksource */
#include <asm/div64.h>
#include <asm/mach/irq.h>
#include <asm/mach/time.h>
+#include <mach/hardware.h>
#include <mach/pxa-regs.h>
-#include <asm/mach-types.h>
/*
* This is PXA's sched_clock implementation. This has a resolution
.features = CLOCK_EVT_FEAT_ONESHOT,
.shift = 32,
.rating = 200,
- .cpumask = CPU_MASK_CPU0,
.set_next_event = pxa_osmr0_set_next_event,
.set_mode = pxa_osmr0_set_mode,
};
static void __init pxa_timer_init(void)
{
- unsigned long clock_tick_rate;
+ unsigned long clock_tick_rate = get_clock_tick_rate();
OIER = 0;
OSSR = OSSR_M0 | OSSR_M1 | OSSR_M2 | OSSR_M3;
- if (cpu_is_pxa25x())
- clock_tick_rate = 3686400;
- else if (machine_is_mainstone())
- clock_tick_rate = 3249600;
- else
- clock_tick_rate = 3250000;
-
set_oscr2ns_scale(clock_tick_rate);
ckevt_pxa_osmr0.mult =
clockevent_delta2ns(0x7fffffff, &ckevt_pxa_osmr0);
ckevt_pxa_osmr0.min_delta_ns =
clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_pxa_osmr0) + 1;
+ ckevt_pxa_osmr0.cpumask = cpumask_of(0);
cksrc_pxa_oscr0.mult =
clocksource_hz2mult(clock_tick_rate, cksrc_pxa_oscr0.shift);
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/io.h>
+#include <linux/smc911x.h>
+#include <asm/clkdev.h>
#include <asm/system.h>
#include <mach/hardware.h>
#include <asm/irq.h>
#include <asm/leds.h>
+#include <asm/mach-types.h>
#include <asm/hardware/arm_timer.h>
#include <asm/hardware/icst307.h>
#define REALVIEW_REFCOUNTER (__io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_24MHz_OFFSET)
-/* used by entry-macro.S */
+/* used by entry-macro.S and platsmp.c */
void __iomem *gic_cpu_base_addr;
/*
return platform_device_register(&realview_flash_device);
}
+static struct smc911x_platdata realview_smc911x_platdata = {
+ .flags = SMC911X_USE_32BIT,
+ .irq_flags = IRQF_SHARED,
+ .irq_polarity = 1,
+};
+
+static struct platform_device realview_eth_device = {
+ .name = "smc911x",
+ .id = 0,
+ .num_resources = 2,
+};
+
+int realview_eth_register(const char *name, struct resource *res)
+{
+ if (name)
+ realview_eth_device.name = name;
+ realview_eth_device.resource = res;
+ if (strcmp(realview_eth_device.name, "smc911x") == 0)
+ realview_eth_device.dev.platform_data = &realview_smc911x_platdata;
+
+ return platform_device_register(&realview_eth_device);
+}
+
static struct resource realview_i2c_resource = {
.start = REALVIEW_I2C_BASE,
.end = REALVIEW_I2C_BASE + SZ_4K - 1,
static void realview_oscvco_set(struct clk *clk, struct icst307_vco vco)
{
void __iomem *sys_lock = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_LOCK_OFFSET;
- void __iomem *sys_osc = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_OSC4_OFFSET;
+ void __iomem *sys_osc;
u32 val;
+ if (machine_is_realview_pb1176())
+ sys_osc = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_OSC0_OFFSET;
+ else
+ sys_osc = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_OSC4_OFFSET;
+
val = readl(sys_osc) & ~0x7ffff;
val |= vco.v | (vco.r << 9) | (vco.s << 16);
writel(0, sys_lock);
}
-struct clk realview_clcd_clk = {
- .name = "CLCDCLK",
+static struct clk oscvco_clk = {
.params = &realview_oscvco_params,
.setvco = realview_oscvco_set,
};
+/*
+ * These are fixed clocks.
+ */
+static struct clk ref24_clk = {
+ .rate = 24000000,
+};
+
+static struct clk_lookup lookups[] = {
+ { /* UART0 */
+ .dev_id = "dev:f1",
+ .clk = &ref24_clk,
+ }, { /* UART1 */
+ .dev_id = "dev:f2",
+ .clk = &ref24_clk,
+ }, { /* UART2 */
+ .dev_id = "dev:f3",
+ .clk = &ref24_clk,
+ }, { /* UART3 */
+ .dev_id = "fpga:09",
+ .clk = &ref24_clk,
+ }, { /* KMI0 */
+ .dev_id = "fpga:06",
+ .clk = &ref24_clk,
+ }, { /* KMI1 */
+ .dev_id = "fpga:07",
+ .clk = &ref24_clk,
+ }, { /* MMC0 */
+ .dev_id = "fpga:05",
+ .clk = &ref24_clk,
+ }, { /* EB:CLCD */
+ .dev_id = "dev:20",
+ .clk = &oscvco_clk,
+ }, { /* PB:CLCD */
+ .dev_id = "issp:20",
+ .clk = &oscvco_clk,
+ }
+};
+
+static int __init clk_init(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(lookups); i++)
+ clkdev_add(&lookups[i]);
+ return 0;
+}
+arch_initcall(clk_init);
+
/*
* CLCD support.
*/
.width = -1,
.height = -1,
.tim2 = TIM2_BCD | TIM2_IPC,
- .cntl = CNTL_LCDTFT | CNTL_LCDVCOMP(1),
+ .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1),
+ .bpp = 16,
+};
+
+static struct clcd_panel xvga = {
+ .mode = {
+ .name = "XVGA",
+ .refresh = 60,
+ .xres = 1024,
+ .yres = 768,
+ .pixclock = 15748,
+ .left_margin = 152,
+ .right_margin = 48,
+ .upper_margin = 23,
+ .lower_margin = 3,
+ .hsync_len = 104,
+ .vsync_len = 4,
+ .sync = 0,
+ .vmode = FB_VMODE_NONINTERLACED,
+ },
+ .width = -1,
+ .height = -1,
+ .tim2 = TIM2_BCD | TIM2_IPC,
+ .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1),
.bpp = 16,
};
.width = -1,
.height = -1,
.tim2 = TIM2_BCD,
- .cntl = CNTL_LCDTFT | CNTL_LCDVCOMP(1),
+ .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1),
.bpp = 16,
};
.width = -1,
.height = -1,
.tim2 = TIM2_IVS | TIM2_IHS | TIM2_IPC,
- .cntl = CNTL_LCDTFT | CNTL_LCDVCOMP(1),
+ .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1),
.bpp = 16,
};
.width = -1,
.height = -1,
.tim2 = TIM2_BCD | TIM2_IPC,
- .cntl = CNTL_LCDTFT | CNTL_LCDVCOMP(1),
+ .cntl = CNTL_LCDTFT | CNTL_BGR | CNTL_LCDVCOMP(1),
.bpp = 16,
};
static struct clcd_panel *realview_clcd_panel(void)
{
void __iomem *sys_clcd = __io_address(REALVIEW_SYS_BASE) + REALVIEW_SYS_CLCD_OFFSET;
- struct clcd_panel *panel = &vga;
+ struct clcd_panel *vga_panel;
+ struct clcd_panel *panel;
u32 val;
+ if (machine_is_realview_eb())
+ vga_panel = &vga;
+ else
+ vga_panel = &xvga;
+
val = readl(sys_clcd) & SYS_CLCD_ID_MASK;
if (val == SYS_CLCD_ID_SANYO_3_8)
panel = &sanyo_3_8_in;
else if (val == SYS_CLCD_ID_EPSON_2_2)
panel = &epson_2_2_in;
else if (val == SYS_CLCD_ID_VGA)
- panel = &vga;
+ panel = vga_panel;
else {
printk(KERN_ERR "CLCD: unknown LCD panel ID 0x%08x, using VGA\n",
val);
- panel = &vga;
+ panel = vga_panel;
}
return panel;
writel(val, sys_clcd);
}
-static unsigned long framesize = SZ_1M;
-
static int realview_clcd_setup(struct clcd_fb *fb)
{
+ unsigned long framesize;
dma_addr_t dma;
+ if (machine_is_realview_eb())
+ /* VGA, 16bpp */
+ framesize = 640 * 480 * 2;
+ else
+ /* XVGA, 16bpp */
+ framesize = 1024 * 768 * 2;
+
fb->panel = realview_clcd_panel();
fb->fb.screen_base = dma_alloc_writecombine(&fb->dev->dev, framesize,
.set_mode = timer_set_mode,
.set_next_event = timer_set_next_event,
.rating = 300,
- .cpumask = CPU_MASK_ALL,
+ .cpumask = cpu_all_mask,
};
static void __init realview_clockevents_init(unsigned int timer_irq)
* The dummy clock device has to be registered before the main device
* so that the latter will broadcast the clock events
*/
- local_timer_setup(smp_processor_id());
+ local_timer_setup();
#endif
/*
#ifdef CONFIG_LOCAL_TIMERS
-#define TWD_BASE(cpu) (twd_base_addr + (cpu) * twd_size)
-
/* set up by the platform code */
-void __iomem *twd_base_addr;
-unsigned int twd_size;
+void __iomem *twd_base;
static unsigned long mpcore_timer_rate;
static void local_timer_set_mode(enum clock_event_mode mode,
struct clock_event_device *clk)
{
- void __iomem *base = TWD_BASE(smp_processor_id());
unsigned long ctrl;
switch(mode) {
ctrl = 0;
}
- __raw_writel(ctrl, base + TWD_TIMER_CONTROL);
+ __raw_writel(ctrl, twd_base + TWD_TIMER_CONTROL);
}
static int local_timer_set_next_event(unsigned long evt,
struct clock_event_device *unused)
{
- void __iomem *base = TWD_BASE(smp_processor_id());
- unsigned long ctrl = __raw_readl(base + TWD_TIMER_CONTROL);
+ unsigned long ctrl = __raw_readl(twd_base + TWD_TIMER_CONTROL);
- __raw_writel(evt, base + TWD_TIMER_COUNTER);
- __raw_writel(ctrl | TWD_TIMER_CONTROL_ENABLE, base + TWD_TIMER_CONTROL);
+ __raw_writel(evt, twd_base + TWD_TIMER_COUNTER);
+ __raw_writel(ctrl | TWD_TIMER_CONTROL_ENABLE, twd_base + TWD_TIMER_CONTROL);
return 0;
}
*/
int local_timer_ack(void)
{
- void __iomem *base = TWD_BASE(smp_processor_id());
-
- if (__raw_readl(base + TWD_TIMER_INTSTAT)) {
- __raw_writel(1, base + TWD_TIMER_INTSTAT);
+ if (__raw_readl(twd_base + TWD_TIMER_INTSTAT)) {
+ __raw_writel(1, twd_base + TWD_TIMER_INTSTAT);
return 1;
}
return 0;
}
-static void __cpuinit twd_calibrate_rate(unsigned int cpu)
+static void __cpuinit twd_calibrate_rate(void)
{
- void __iomem *base = TWD_BASE(cpu);
unsigned long load, count;
u64 waitjiffies;
waitjiffies += 5;
/* enable, no interrupt or reload */
- __raw_writel(0x1, base + TWD_TIMER_CONTROL);
+ __raw_writel(0x1, twd_base + TWD_TIMER_CONTROL);
/* maximum value */
- __raw_writel(0xFFFFFFFFU, base + TWD_TIMER_COUNTER);
+ __raw_writel(0xFFFFFFFFU, twd_base + TWD_TIMER_COUNTER);
while (get_jiffies_64() < waitjiffies)
udelay(10);
- count = __raw_readl(base + TWD_TIMER_COUNTER);
+ count = __raw_readl(twd_base + TWD_TIMER_COUNTER);
mpcore_timer_rate = (0xFFFFFFFFU - count) * (HZ / 5);
load = mpcore_timer_rate / HZ;
- __raw_writel(load, base + TWD_TIMER_LOAD);
+ __raw_writel(load, twd_base + TWD_TIMER_LOAD);
}
/*
* Setup the local clock events for a CPU.
*/
-void __cpuinit local_timer_setup(unsigned int cpu)
+void __cpuinit local_timer_setup(void)
{
+ unsigned int cpu = smp_processor_id();
struct clock_event_device *clk = &per_cpu(local_clockevent, cpu);
unsigned long flags;
- twd_calibrate_rate(cpu);
+ twd_calibrate_rate();
clk->name = "local_timer";
clk->features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT;
clk->set_mode = local_timer_set_mode;
clk->set_next_event = local_timer_set_next_event;
clk->irq = IRQ_LOCALTIMER;
- clk->cpumask = cpumask_of_cpu(cpu);
+ clk->cpumask = cpumask_of(cpu);
clk->shift = 20;
clk->mult = div_sc(mpcore_timer_rate, NSEC_PER_SEC, clk->shift);
clk->max_delta_ns = clockevent_delta2ns(0xffffffff, clk);
/*
* take a local timer down
*/
-void __cpuexit local_timer_stop(unsigned int cpu)
+void __cpuexit local_timer_stop(void)
{
- __raw_writel(0, TWD_BASE(cpu) + TWD_TIMER_CONTROL);
+ __raw_writel(0, twd_base + TWD_TIMER_CONTROL);
}
#else /* CONFIG_LOCAL_TIMERS */
{
}
-void __cpuinit local_timer_setup(unsigned int cpu)
+void __cpuinit local_timer_setup(void)
{
+ unsigned int cpu = smp_processor_id();
struct clock_event_device *clk = &per_cpu(local_clockevent, cpu);
clk->name = "dummy_timer";
clk->rating = 200;
clk->set_mode = dummy_timer_set_mode;
clk->broadcast = smp_timer_broadcast;
- clk->cpumask = cpumask_of_cpu(cpu);
+ clk->cpumask = cpumask_of(cpu);
clockevents_register_device(clk);
}
* linux/arch/arm/mach-sa1100/time.c
*
* Copyright (C) 1998 Deborah Wallach.
- * Twiddles (C) 1999 Hugo Fiennes <hugo@empeg.com>
- *
+ * Twiddles (C) 1999 Hugo Fiennes <hugo@empeg.com>
+ *
* 2000/03/29 (C) Nicolas Pitre <nico@cam.org>
* Rewritten: big cleanup, much simpler, better HZ accuracy.
*
.features = CLOCK_EVT_FEAT_ONESHOT,
.shift = 32,
.rating = 200,
- .cpumask = CPU_MASK_CPU0,
.set_next_event = sa1100_osmr0_set_next_event,
.set_mode = sa1100_osmr0_set_mode,
};
clockevent_delta2ns(0x7fffffff, &ckevt_sa1100_osmr0);
ckevt_sa1100_osmr0.min_delta_ns =
clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_sa1100_osmr0) + 1;
+ ckevt_sa1100_osmr0.cpumask = cpumask_of(0);
cksrc_sa1100_oscr.mult =
clocksource_hz2mult(CLOCK_TICK_RATE, cksrc_sa1100_oscr.shift);
#include <linux/cnt32_to_63.h>
#include <linux/io.h>
+#include <asm/clkdev.h>
#include <asm/system.h>
#include <mach/hardware.h>
#include <asm/irq.h>
static void versatile_oscvco_set(struct clk *clk, struct icst307_vco vco)
{
- void __iomem *sys_lock = __io_address(VERSATILE_SYS_BASE) + VERSATILE_SYS_LOCK_OFFSET;
- void __iomem *sys_osc = __io_address(VERSATILE_SYS_BASE) + VERSATILE_SYS_OSCCLCD_OFFSET;
+ void __iomem *sys = __io_address(VERSATILE_SYS_BASE);
+ void __iomem *sys_lock = sys + VERSATILE_SYS_LOCK_OFFSET;
u32 val;
- val = readl(sys_osc) & ~0x7ffff;
+ val = readl(sys + clk->oscoff) & ~0x7ffff;
val |= vco.v | (vco.r << 9) | (vco.s << 16);
writel(0xa05f, sys_lock);
- writel(val, sys_osc);
+ writel(val, sys + clk->oscoff);
writel(0, sys_lock);
}
-static struct clk versatile_clcd_clk = {
- .name = "CLCDCLK",
+static struct clk osc4_clk = {
.params = &versatile_oscvco_params,
- .setvco = versatile_oscvco_set,
+ .oscoff = VERSATILE_SYS_OSCCLCD_OFFSET,
+ .setvco = versatile_oscvco_set,
+};
+
+/*
+ * These are fixed clocks.
+ */
+static struct clk ref24_clk = {
+ .rate = 24000000,
+};
+
+static struct clk_lookup lookups[] __initdata = {
+ { /* UART0 */
+ .dev_id = "dev:f1",
+ .clk = &ref24_clk,
+ }, { /* UART1 */
+ .dev_id = "dev:f2",
+ .clk = &ref24_clk,
+ }, { /* UART2 */
+ .dev_id = "dev:f3",
+ .clk = &ref24_clk,
+ }, { /* UART3 */
+ .dev_id = "fpga:09",
+ .clk = &ref24_clk,
+ }, { /* KMI0 */
+ .dev_id = "fpga:06",
+ .clk = &ref24_clk,
+ }, { /* KMI1 */
+ .dev_id = "fpga:07",
+ .clk = &ref24_clk,
+ }, { /* MMC0 */
+ .dev_id = "fpga:05",
+ .clk = &ref24_clk,
+ }, { /* MMC1 */
+ .dev_id = "fpga:0b",
+ .clk = &ref24_clk,
+ }, { /* CLCD */
+ .dev_id = "dev:20",
+ .clk = &osc4_clk,
+ }
};
/*
{
int i;
- clk_register(&versatile_clcd_clk);
+ for (i = 0; i < ARRAY_SIZE(lookups); i++)
+ clkdev_add(&lookups[i]);
platform_device_register(&versatile_flash_device);
platform_device_register(&versatile_i2c_device);
timer0_clockevent.min_delta_ns =
clockevent_delta2ns(0xf, &timer0_clockevent);
- timer0_clockevent.cpumask = cpumask_of_cpu(0);
+ timer0_clockevent.cpumask = cpumask_of(0);
clockevents_register_device(&timer0_clockevent);
}
#define DBG(fmt...)
#endif
-int smp_hw_index[NR_CPUS];
struct thread_info *secondary_ti;
- cpumask_t cpu_possible_map = CPU_MASK_NONE;
- cpumask_t cpu_online_map = CPU_MASK_NONE;
DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
DEFINE_PER_CPU(cpumask_t, cpu_core_map) = CPU_MASK_NONE;
- EXPORT_SYMBOL(cpu_online_map);
- EXPORT_SYMBOL(cpu_possible_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
EXPORT_PER_CPU_SYMBOL(cpu_core_map);
}
}
+static irqreturn_t call_function_action(int irq, void *data)
+{
+ generic_smp_call_function_interrupt();
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t reschedule_action(int irq, void *data)
+{
+ /* we just need the return path side effect of checking need_resched */
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t call_function_single_action(int irq, void *data)
+{
+ generic_smp_call_function_single_interrupt();
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t debug_ipi_action(int irq, void *data)
+{
+ smp_message_recv(PPC_MSG_DEBUGGER_BREAK);
+ return IRQ_HANDLED;
+}
+
+static irq_handler_t smp_ipi_action[] = {
+ [PPC_MSG_CALL_FUNCTION] = call_function_action,
+ [PPC_MSG_RESCHEDULE] = reschedule_action,
+ [PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
+ [PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
+};
+
+const char *smp_ipi_name[] = {
+ [PPC_MSG_CALL_FUNCTION] = "ipi call function",
+ [PPC_MSG_RESCHEDULE] = "ipi reschedule",
+ [PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
+ [PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
+};
+
+/* optional function to request ipi, for controllers with >= 4 ipis */
+int smp_request_message_ipi(int virq, int msg)
+{
+ int err;
+
+ if (msg < 0 || msg > PPC_MSG_DEBUGGER_BREAK) {
+ return -EINVAL;
+ }
+#if !defined(CONFIG_DEBUGGER) && !defined(CONFIG_KEXEC)
+ if (msg == PPC_MSG_DEBUGGER_BREAK) {
+ return 1;
+ }
+#endif
+ err = request_irq(virq, smp_ipi_action[msg], IRQF_DISABLED|IRQF_PERCPU,
+ smp_ipi_name[msg], 0);
+ WARN(err < 0, "unable to request_irq %d for %s (rc %d)\n",
+ virq, smp_ipi_name[msg], err);
+
+ return err;
+}
+
void smp_send_reschedule(int cpu)
{
if (likely(smp_ops))
static struct device_node *cpu_to_l2cache(int cpu)
{
struct device_node *np;
- const phandle *php;
- phandle ph;
+ struct device_node *cache;
if (!cpu_present(cpu))
return NULL;
if (np == NULL)
return NULL;
- php = of_get_property(np, "l2-cache", NULL);
- if (php == NULL)
- return NULL;
- ph = *php;
+ cache = of_find_next_cache_node(np);
+
of_node_put(np);
- return of_find_node_by_phandle(ph);
+ return cache;
}
/* Activate a secondary processor. */
static unsigned tb_to_ns_shift __read_mostly;
static unsigned long boot_tb __read_mostly;
-static struct gettimeofday_struct do_gtod;
-
extern struct timezone sys_tz;
static long timezone_offset;
}
EXPORT_SYMBOL(udelay);
-
-/*
- * There are two copies of tb_to_xs and stamp_xsec so that no
- * lock is needed to access and use these values in
- * do_gettimeofday. We alternate the copies and as long as a
- * reasonable time elapses between changes, there will never
- * be inconsistent values. ntpd has a minimum of one minute
- * between updates.
- */
static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec,
u64 new_tb_to_xs)
{
- unsigned temp_idx;
- struct gettimeofday_vars *temp_varp;
-
- temp_idx = (do_gtod.var_idx == 0);
- temp_varp = &do_gtod.vars[temp_idx];
-
- temp_varp->tb_to_xs = new_tb_to_xs;
- temp_varp->tb_orig_stamp = new_tb_stamp;
- temp_varp->stamp_xsec = new_stamp_xsec;
- smp_mb();
- do_gtod.varp = temp_varp;
- do_gtod.var_idx = temp_idx;
-
/*
* tb_update_count is used to allow the userspace gettimeofday code
* to assure itself that it sees a consistent view of the tb_to_xs and
vdso_data->tb_to_xs = new_tb_to_xs;
vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec;
+ vdso_data->stamp_xtime = xtime;
smp_wmb();
++(vdso_data->tb_update_count);
}
tb_ticks_per_sec = new_tb_ticks_per_sec;
calc_cputime_factors();
div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres );
- do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
tb_to_xs = divres.result_low;
- do_gtod.varp->tb_to_xs = tb_to_xs;
vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
vdso_data->tb_to_xs = tb_to_xs;
}
struct clock_event_device *dec = &per_cpu(decrementers, cpu).event;
*dec = decrementer_clockevent;
- dec->cpumask = cpumask_of_cpu(cpu);
+ dec->cpumask = cpumask_of(cpu);
printk(KERN_DEBUG "clockevent: %s mult[%lx] shift[%d] cpu[%d]\n",
dec->name, dec->mult, dec->shift, cpu);
sys_tz.tz_dsttime = 0;
}
- do_gtod.varp = &do_gtod.vars[0];
- do_gtod.var_idx = 0;
- do_gtod.varp->tb_orig_stamp = tb_last_jiffy;
- __get_cpu_var(last_jiffy) = tb_last_jiffy;
- do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC;
- do_gtod.tb_ticks_per_sec = tb_ticks_per_sec;
- do_gtod.varp->tb_to_xs = tb_to_xs;
- do_gtod.tb_to_us = tb_to_us;
-
vdso_data->tb_orig_stamp = tb_last_jiffy;
vdso_data->tb_update_count = 0;
vdso_data->tb_ticks_per_sec = tb_ticks_per_sec;
lpar_xirr_info_set((0xff << 24) | irq);
}
- static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
+ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
{
unsigned int irq;
int status;
int i, j;
struct device_node *np;
u32 ilen;
- const u32 *ireg, *isize;
+ const u32 *ireg;
u32 hcpuid;
/* Find the server numbers for the boot cpu. */
}
}
- /* get the bit size of server numbers */
- isize = of_get_property(np, "ibm,interrupt-server#-size", NULL);
- if (isize)
- interrupt_server_size = *isize;
-
of_node_put(np);
}
struct device_node *np;
u32 indx = 0;
int found = 0;
+ const u32 *isize;
ppc64_boot_msg(0x20, "XICS Init");
if (found == 0)
return;
+ /* get the bit size of server numbers */
+ found = 0;
+
+ for_each_compatible_node(np, NULL, "ibm,ppc-xics") {
+ isize = of_get_property(np, "ibm,interrupt-server#-size", NULL);
+
+ if (!isize)
+ continue;
+
+ if (!found) {
+ interrupt_server_size = *isize;
+ found = 1;
+ } else if (*isize != interrupt_server_size) {
+ printk(KERN_WARNING "XICS: "
+ "mismatched ibm,interrupt-server#-size\n");
+ interrupt_server_size = max(*isize,
+ interrupt_server_size);
+ }
+ }
+
xics_update_irq_servers();
xics_init_host();
/* Have the calling processor join or leave the specified global queue */
static void xics_set_cpu_giq(unsigned int gserver, unsigned int join)
{
- int status = rtas_set_indicator_fast(GLOBAL_INTERRUPT_QUEUE,
- (1UL << interrupt_server_size) - 1 - gserver, join);
- WARN_ON(status < 0);
+ int index;
+ int status;
+
+ if (!rtas_indicator_present(GLOBAL_INTERRUPT_QUEUE, NULL))
+ return;
+
+ index = (1UL << interrupt_server_size) - 1 - gserver;
+
+ status = rtas_set_indicator_fast(GLOBAL_INTERRUPT_QUEUE, index, join);
+
+ WARN(status < 0, "set-indicator(%d, %d, %u) returned %d\n",
+ GLOBAL_INTERRUPT_QUEUE, index, join, status);
}
void xics_setup_cpu(void)
/* Reset affinity to all cpus */
irq_desc[virq].affinity = CPU_MASK_ALL;
- desc->chip->set_affinity(virq, CPU_MASK_ALL);
+ desc->chip->set_affinity(virq, cpu_all_mask);
unlock:
spin_unlock_irqrestore(&desc->lock, flags);
}
(void)mpic_cpu_read(MPIC_INFO(CPU_WHOAMI));
}
-#ifdef CONFIG_SMP
-static irqreturn_t mpic_ipi_action(int irq, void *data)
-{
- long ipi = (long)data;
-
- smp_message_recv(ipi);
-
- return IRQ_HANDLED;
-}
-#endif /* CONFIG_SMP */
-
/*
* Linux descriptor level callbacks
*/
#endif /* CONFIG_SMP */
- void mpic_set_affinity(unsigned int irq, cpumask_t cpumask)
+ void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
{
struct mpic *mpic = mpic_from_irq(irq);
unsigned int src = mpic_irq_to_hw(irq);
} else {
cpumask_t tmp;
- cpus_and(tmp, cpumask, cpu_online_map);
+ cpumask_and(&tmp, cpumask, cpu_online_mask);
mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION),
mpic_physmask(cpus_addr(tmp)[0]));
void mpic_request_ipis(void)
{
struct mpic *mpic = mpic_primary;
- long i, err;
- static char *ipi_names[] = {
- "IPI0 (call function)",
- "IPI1 (reschedule)",
- "IPI2 (call function single)",
- "IPI3 (debugger break)",
- };
+ int i;
BUG_ON(mpic == NULL);
printk(KERN_INFO "mpic: requesting IPIs ... \n");
unsigned int vipi = irq_create_mapping(mpic->irqhost,
mpic->ipi_vecs[0] + i);
if (vipi == NO_IRQ) {
- printk(KERN_ERR "Failed to map IPI %ld\n", i);
- break;
- }
- err = request_irq(vipi, mpic_ipi_action,
- IRQF_DISABLED|IRQF_PERCPU,
- ipi_names[i], (void *)i);
- if (err) {
- printk(KERN_ERR "Request of irq %d for IPI %ld failed\n",
- vipi, i);
- break;
+ printk(KERN_ERR "Failed to map %s\n", smp_ipi_name[i]);
+ continue;
}
+ smp_request_message_ipi(vipi, i);
}
}
config GENERIC_TIME
def_bool y
+config GENERIC_TIME_VSYSCALL
+ def_bool y
+
config GENERIC_CLOCKEVENTS
def_bool y
bool
default y if KVM
+config VIRT_CPU_ACCOUNTING
+ def_bool y
+
mainmenu "Linux Kernel Configuration"
config S390
def_bool y
+ select USE_GENERIC_SMP_HELPERS if SMP
+ select HAVE_FUNCTION_TRACER
select HAVE_OPROFILE
select HAVE_KPROBES
select HAVE_KRETPROBES
select HAVE_KVM if 64BIT
select HAVE_ARCH_TRACEHOOK
+ select INIT_ALL_POSSIBLE
source "init/Kconfig"
Class (z9 BC). The kernel will be slightly faster but will not
work on older machines such as the z990, z890, z900, and z800.
+config MARCH_Z10
+ bool "IBM System z10"
+ help
+ Select this to enable optimizations for IBM System z10. The
+ kernel will be slightly faster but will not work on older
+ machines such as the z990, z890, z900, z800, z9-109, z9-ec
+ and z9-bc.
+
endchoice
config PACK_STACK
If unsure, say Y.
-config QDIO_DEBUG
- bool "Extended debugging information"
- depends on QDIO
- help
- Say Y here to get extended debugging output in
- /sys/kernel/debug/s390dbf/qdio...
- Warning: this option reduces the performance of the QDIO module.
-
- If unsure, say N.
-
config CHSC_SCH
tristate "Support for CHSC subchannels"
help
hypervisor. The ESSA instruction is used to do the states
changes between a page that has content and the unused state.
-config VIRT_TIMER
- bool "Virtual CPU timer support"
- help
- This provides a kernel interface for virtual CPU timers.
- Default is disabled.
-
-config VIRT_CPU_ACCOUNTING
- bool "Base user process accounting on virtual cpu timer"
- depends on VIRT_TIMER
- help
- Select this option to use CPU timer deltas to do user
- process accounting.
-
config APPLDATA_BASE
bool "Linux - VM Monitor Stream, base infrastructure"
- depends on PROC_FS && VIRT_TIMER=y
+ depends on PROC_FS
help
This provides a kernel interface for creating and updating z/VM APPLDATA
monitor records. The monitor records are updated at certain time
* cpu_number_map in other architectures.
*/
+#define KMSG_COMPONENT "cpu"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mm.h>
struct _lowcore *lowcore_ptr[NR_CPUS];
EXPORT_SYMBOL(lowcore_ptr);
- cpumask_t cpu_online_map = CPU_MASK_NONE;
- EXPORT_SYMBOL(cpu_online_map);
-
- cpumask_t cpu_possible_map = CPU_MASK_ALL;
- EXPORT_SYMBOL(cpu_possible_map);
-
static struct task_struct *current_set[NR_CPUS];
static u8 smp_cpu_type;
static void smp_ext_bitcall(int, ec_bit_sig);
-/*
- * Structure and data for __smp_call_function_map(). This is designed to
- * minimise static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
- void (*func) (void *info);
- void *info;
- cpumask_t started;
- cpumask_t finished;
- int wait;
-};
-
-static struct call_data_struct *call_data;
-
-/*
- * 'Call function' interrupt callback
- */
-static void do_call_function(void)
-{
- void (*func) (void *info) = call_data->func;
- void *info = call_data->info;
- int wait = call_data->wait;
-
- cpu_set(smp_processor_id(), call_data->started);
- (*func)(info);
- if (wait)
- cpu_set(smp_processor_id(), call_data->finished);;
-}
-
-static void __smp_call_function_map(void (*func) (void *info), void *info,
- int wait, cpumask_t map)
-{
- struct call_data_struct data;
- int cpu, local = 0;
-
- /*
- * Can deadlock when interrupts are disabled or if in wrong context.
- */
- WARN_ON(irqs_disabled() || in_irq());
-
- /*
- * Check for local function call. We have to have the same call order
- * as in on_each_cpu() because of machine_restart_smp().
- */
- if (cpu_isset(smp_processor_id(), map)) {
- local = 1;
- cpu_clear(smp_processor_id(), map);
- }
-
- cpus_and(map, map, cpu_online_map);
- if (cpus_empty(map))
- goto out;
-
- data.func = func;
- data.info = info;
- data.started = CPU_MASK_NONE;
- data.wait = wait;
- if (wait)
- data.finished = CPU_MASK_NONE;
-
- call_data = &data;
-
- for_each_cpu_mask(cpu, map)
- smp_ext_bitcall(cpu, ec_call_function);
-
- /* Wait for response */
- while (!cpus_equal(map, data.started))
- cpu_relax();
- if (wait)
- while (!cpus_equal(map, data.finished))
- cpu_relax();
-out:
- if (local) {
- local_irq_disable();
- func(info);
- local_irq_enable();
- }
-}
-
-/*
- * smp_call_function:
- * @func: the function to run; this must be fast and non-blocking
- * @info: an arbitrary pointer to pass to the function
- * @wait: if true, wait (atomically) until function has completed on other CPUs
- *
- * Run a function on all other CPUs.
- *
- * You must not call this function with disabled interrupts, from a
- * hardware interrupt handler or from a bottom half.
- */
-int smp_call_function(void (*func) (void *info), void *info, int wait)
-{
- cpumask_t map;
-
- spin_lock(&call_lock);
- map = cpu_online_map;
- cpu_clear(smp_processor_id(), map);
- __smp_call_function_map(func, info, wait, map);
- spin_unlock(&call_lock);
- return 0;
-}
-EXPORT_SYMBOL(smp_call_function);
-
-/*
- * smp_call_function_single:
- * @cpu: the CPU where func should run
- * @func: the function to run; this must be fast and non-blocking
- * @info: an arbitrary pointer to pass to the function
- * @wait: if true, wait (atomically) until function has completed on other CPUs
- *
- * Run a function on one processor.
- *
- * You must not call this function with disabled interrupts, from a
- * hardware interrupt handler or from a bottom half.
- */
-int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
- int wait)
-{
- spin_lock(&call_lock);
- __smp_call_function_map(func, info, wait, cpumask_of_cpu(cpu));
- spin_unlock(&call_lock);
- return 0;
-}
-EXPORT_SYMBOL(smp_call_function_single);
-
-/**
- * smp_call_function_mask(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on. Must not include the current cpu.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
- int wait)
-{
- spin_lock(&call_lock);
- cpu_clear(smp_processor_id(), mask);
- __smp_call_function_map(func, info, wait, mask);
- spin_unlock(&call_lock);
- return 0;
-}
-EXPORT_SYMBOL(smp_call_function_mask);
-
void smp_send_stop(void)
{
int cpu, rc;
bits = xchg(&S390_lowcore.ext_call_fast, 0);
if (test_bit(ec_call_function, &bits))
- do_call_function();
+ generic_smp_call_function_interrupt();
+
+ if (test_bit(ec_call_function_single, &bits))
+ generic_smp_call_function_single_interrupt();
}
/*
udelay(10);
}
+void arch_send_call_function_ipi(cpumask_t mask)
+{
+ int cpu;
+
+ for_each_cpu_mask(cpu, mask)
+ smp_ext_bitcall(cpu, ec_call_function);
+}
+
+void arch_send_call_function_single_ipi(int cpu)
+{
+ smp_ext_bitcall(cpu, ec_call_function_single);
+}
+
#ifndef CONFIG_64BIT
/*
* this function sends a 'purge tlb' signal to another CPU.
if (ipl_info.type != IPL_TYPE_FCP_DUMP)
return;
if (cpu >= NR_CPUS) {
- printk(KERN_WARNING "Registers for cpu %i not saved since dump "
- "kernel was compiled with NR_CPUS=%i\n", cpu, NR_CPUS);
+ pr_warning("CPU %i exceeds the maximum %i and is excluded from "
+ "the dump\n", cpu, NR_CPUS - 1);
return;
}
zfcpdump_save_areas[cpu] = kmalloc(sizeof(union save_area), GFP_KERNEL);
}
out:
kfree(info);
- printk(KERN_INFO "CPUs: %d configured, %d standby\n", c_cpus, s_cpus);
+ pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus);
get_online_cpus();
__smp_rescan_cpus();
put_online_cpus();
preempt_disable();
/* Enable TOD clock interrupts on the secondary cpu. */
init_cpu_timer();
-#ifdef CONFIG_VIRT_TIMER
/* Enable cpu timer interrupts on the secondary cpu. */
init_cpu_vtimer();
-#endif
/* Enable pfault pseudo page faults on this cpu. */
pfault_init();
/* call cpu notifiers */
notify_cpu_starting(smp_processor_id());
/* Mark this cpu as online */
- spin_lock(&call_lock);
+ ipi_call_lock();
cpu_set(smp_processor_id(), cpu_online_map);
- spin_unlock(&call_lock);
+ ipi_call_unlock();
/* Switch on interrupts */
local_irq_enable();
/* Print info about this processor */
save_area = get_zeroed_page(GFP_KERNEL);
if (!save_area)
- goto out_save_area;
+ goto out;
lowcore->extended_save_area_addr = (u32) save_area;
}
#endif
lowcore_ptr[cpu] = lowcore;
return 0;
-#ifndef CONFIG_64BIT
-out_save_area:
- free_page(panic_stack);
-#endif
out:
+ free_page(panic_stack);
free_pages(async_stack, ASYNC_ORDER);
free_pages((unsigned long) lowcore, lc_order);
return -ENOMEM;
ccode = signal_processor_p((__u32)(unsigned long)(lowcore_ptr[cpu]),
cpu, sigp_set_prefix);
- if (ccode) {
- printk("sigp_set_prefix failed for cpu %d "
- "with condition code %d\n",
- (int) cpu, (int) ccode);
+ if (ccode)
return -EIO;
- }
idle = current_set[cpu];
cpu_lowcore = lowcore_ptr[cpu];
while (!smp_cpu_not_running(cpu))
cpu_relax();
smp_free_lowcore(cpu);
- printk(KERN_INFO "Processor %d spun down\n", cpu);
+ pr_info("Processor %d stopped\n", cpu);
}
void cpu_die(void)
* Copyright (C) 1991, 1992, 1995 Linus Torvalds
*/
+#define KMSG_COMPONENT "time"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/stop_machine.h>
#include <linux/time.h>
#include <linux/sysdev.h>
#include <linux/delay.h>
#include <asm/delay.h>
#include <asm/s390_ext.h>
#include <asm/div64.h>
+#include <asm/vdso.h>
#include <asm/irq.h>
#include <asm/irq_regs.h>
#include <asm/timer.h>
cd->min_delta_ns = 1;
cd->max_delta_ns = LONG_MAX;
cd->rating = 400;
- cd->cpumask = cpumask_of_cpu(cpu);
+ cd->cpumask = cpumask_of(cpu);
cd->set_next_event = s390_next_event;
cd->set_mode = s390_set_mode;
};
+void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
+{
+ if (clock != &clocksource_tod)
+ return;
+
+ /* Make userspace gettimeofday spin until we're done. */
+ ++vdso_data->tb_update_count;
+ smp_wmb();
+ vdso_data->xtime_tod_stamp = clock->cycle_last;
+ vdso_data->xtime_clock_sec = xtime.tv_sec;
+ vdso_data->xtime_clock_nsec = xtime.tv_nsec;
+ vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec;
+ vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec;
+ smp_wmb();
+ ++vdso_data->tb_update_count;
+}
+
+extern struct timezone sys_tz;
+
+void update_vsyscall_tz(void)
+{
+ /* Make userspace gettimeofday spin until we're done. */
+ ++vdso_data->tb_update_count;
+ smp_wmb();
+ vdso_data->tz_minuteswest = sys_tz.tz_minuteswest;
+ vdso_data->tz_dsttime = sys_tz.tz_dsttime;
+ smp_wmb();
+ ++vdso_data->tb_update_count;
+}
+
/*
* Initialize the TOD clock and the CPU timer of
* the boot cpu.
/* Enable TOD clock interrupts on the boot cpu. */
init_cpu_timer();
-
-#ifdef CONFIG_VIRT_TIMER
+ /* Enable cpu timer interrupts on the boot cpu. */
vtime_init();
-#endif
}
/*
}
sched_clock_base_cc += delta;
if (adjust.offset != 0) {
- printk(KERN_NOTICE "etr: time adjusted by %li micro-seconds\n",
- adjust.offset);
+ pr_notice("The ETR interface has adjusted the clock "
+ "by %li microseconds\n", adjust.offset);
adjust.modes = ADJ_OFFSET_SINGLESHOT;
do_adjtimex(&adjust);
}
atomic_set_mask(0x80000000, sw_ptr);
}
+/* Single threaded workqueue used for etr and stp sync events */
+static struct workqueue_struct *time_sync_wq;
+
+static void __init time_init_wq(void)
+{
+ if (!time_sync_wq)
+ time_sync_wq = create_singlethread_workqueue("timesync");
+}
+
/*
* External Time Reference (ETR) code.
*/
static void etr_timeout(unsigned long dummy);
static void etr_work_fn(struct work_struct *work);
+static DEFINE_MUTEX(etr_work_mutex);
static DECLARE_WORK(etr_work, etr_work_fn);
/*
etr_tolec = get_clock();
set_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags);
} else if (etr_port0_online || etr_port1_online) {
- printk(KERN_WARNING "Running on non ETR capable "
- "machine, only local mode available.\n");
+ pr_warning("The real or virtual hardware system does "
+ "not provide an ETR interface\n");
etr_port0_online = etr_port1_online = 0;
}
}
if (!test_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags))
return 0;
+ time_init_wq();
/* Check if this machine has the steai instruction. */
if (etr_steai(&aib, ETR_STEAI_STEPPING_PORT) == 0)
etr_steai_available = 1;
setup_timer(&etr_timer, etr_timeout, 0UL);
if (etr_port0_online) {
set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events);
- schedule_work(&etr_work);
+ queue_work(time_sync_wq, &etr_work);
}
if (etr_port1_online) {
set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events);
- schedule_work(&etr_work);
+ queue_work(time_sync_wq, &etr_work);
}
return 0;
}
if (test_bit(CLOCK_SYNC_ETR, &clock_sync_flags))
disable_sync_clock(NULL);
set_bit(ETR_EVENT_SWITCH_LOCAL, &etr_events);
- schedule_work(&etr_work);
+ queue_work(time_sync_wq, &etr_work);
}
/*
if (test_bit(CLOCK_SYNC_ETR, &clock_sync_flags))
disable_sync_clock(NULL);
set_bit(ETR_EVENT_SYNC_CHECK, &etr_events);
- schedule_work(&etr_work);
+ queue_work(time_sync_wq, &etr_work);
}
/*
* Both ports are not up-to-date now.
*/
set_bit(ETR_EVENT_PORT_ALERT, &etr_events);
- schedule_work(&etr_work);
+ queue_work(time_sync_wq, &etr_work);
}
static void etr_timeout(unsigned long dummy)
{
set_bit(ETR_EVENT_UPDATE, &etr_events);
- schedule_work(&etr_work);
+ queue_work(time_sync_wq, &etr_work);
}
/*
}
struct clock_sync_data {
+ atomic_t cpus;
int in_sync;
unsigned long long fixup_cc;
+ int etr_port;
+ struct etr_aib *etr_aib;
};
-static void clock_sync_cpu_start(void *dummy)
+static void clock_sync_cpu(struct clock_sync_data *sync)
{
- struct clock_sync_data *sync = dummy;
-
+ atomic_dec(&sync->cpus);
enable_sync_clock();
/*
* This looks like a busy wait loop but it isn't. etr_sync_cpus
fixup_clock_comparator(sync->fixup_cc);
}
-static void clock_sync_cpu_end(void *dummy)
-{
-}
-
/*
* Sync the TOD clock using the port refered to by aibp. This port
* has to be enabled and the other port has to be disabled. The
* last eacr update has to be more than 1.6 seconds in the past.
*/
-static int etr_sync_clock(struct etr_aib *aib, int port)
+static int etr_sync_clock(void *data)
{
- struct etr_aib *sync_port;
- struct clock_sync_data etr_sync;
+ static int first;
unsigned long long clock, old_clock, delay, delta;
- int follows;
+ struct clock_sync_data *etr_sync;
+ struct etr_aib *sync_port, *aib;
+ int port;
int rc;
- /* Check if the current aib is adjacent to the sync port aib. */
- sync_port = (port == 0) ? &etr_port0 : &etr_port1;
- follows = etr_aib_follows(sync_port, aib, port);
- memcpy(sync_port, aib, sizeof(*aib));
- if (!follows)
- return -EAGAIN;
+ etr_sync = data;
- /*
- * Catch all other cpus and make them wait until we have
- * successfully synced the clock. smp_call_function will
- * return after all other cpus are in etr_sync_cpu_start.
- */
- memset(&etr_sync, 0, sizeof(etr_sync));
- preempt_disable();
- smp_call_function(clock_sync_cpu_start, &etr_sync, 0);
- local_irq_disable();
+ if (xchg(&first, 1) == 1) {
+ /* Slave */
+ clock_sync_cpu(etr_sync);
+ return 0;
+ }
+
+ /* Wait until all other cpus entered the sync function. */
+ while (atomic_read(&etr_sync->cpus) != 0)
+ cpu_relax();
+
+ port = etr_sync->etr_port;
+ aib = etr_sync->etr_aib;
+ sync_port = (port == 0) ? &etr_port0 : &etr_port1;
enable_sync_clock();
/* Set clock to next OTE. */
delay = (unsigned long long)
(aib->edf2.etv - sync_port->edf2.etv) << 32;
delta = adjust_time(old_clock, clock, delay);
- etr_sync.fixup_cc = delta;
+ etr_sync->fixup_cc = delta;
fixup_clock_comparator(delta);
/* Verify that the clock is properly set. */
if (!etr_aib_follows(sync_port, aib, port)) {
/* Didn't work. */
disable_sync_clock(NULL);
- etr_sync.in_sync = -EAGAIN;
+ etr_sync->in_sync = -EAGAIN;
rc = -EAGAIN;
} else {
- etr_sync.in_sync = 1;
+ etr_sync->in_sync = 1;
rc = 0;
}
} else {
__ctl_clear_bit(0, 29);
__ctl_clear_bit(14, 21);
disable_sync_clock(NULL);
- etr_sync.in_sync = -EAGAIN;
+ etr_sync->in_sync = -EAGAIN;
rc = -EAGAIN;
}
- local_irq_enable();
- smp_call_function(clock_sync_cpu_end, NULL, 0);
- preempt_enable();
+ xchg(&first, 0);
+ return rc;
+}
+
+static int etr_sync_clock_stop(struct etr_aib *aib, int port)
+{
+ struct clock_sync_data etr_sync;
+ struct etr_aib *sync_port;
+ int follows;
+ int rc;
+
+ /* Check if the current aib is adjacent to the sync port aib. */
+ sync_port = (port == 0) ? &etr_port0 : &etr_port1;
+ follows = etr_aib_follows(sync_port, aib, port);
+ memcpy(sync_port, aib, sizeof(*aib));
+ if (!follows)
+ return -EAGAIN;
+ memset(&etr_sync, 0, sizeof(etr_sync));
+ etr_sync.etr_aib = aib;
+ etr_sync.etr_port = port;
+ get_online_cpus();
+ atomic_set(&etr_sync.cpus, num_online_cpus() - 1);
+ rc = stop_machine(etr_sync_clock, &etr_sync, &cpu_online_map);
+ put_online_cpus();
return rc;
}
}
/*
- * ETR tasklet. In this function you'll find the main logic. In
+ * ETR work. In this function you'll find the main logic. In
* particular this is the only function that calls etr_update_eacr(),
* it "controls" the etr control register.
*/
struct etr_aib aib;
int sync_port;
+ /* prevent multiple execution. */
+ mutex_lock(&etr_work_mutex);
+
/* Create working copy of etr_eacr. */
eacr = etr_eacr;
del_timer_sync(&etr_timer);
etr_update_eacr(eacr);
clear_bit(CLOCK_SYNC_ETR, &clock_sync_flags);
- return;
+ goto out_unlock;
}
/* Store aib to get the current ETR status word. */
eacr.es || sync_port < 0) {
etr_update_eacr(eacr);
etr_set_tolec_timeout(now);
- return;
+ goto out_unlock;
}
/*
etr_update_eacr(eacr);
set_bit(CLOCK_SYNC_ETR, &clock_sync_flags);
if (now < etr_tolec + (1600000 << 12) ||
- etr_sync_clock(&aib, sync_port) != 0) {
+ etr_sync_clock_stop(&aib, sync_port) != 0) {
/* Sync failed. Try again in 1/2 second. */
eacr.es = 0;
etr_update_eacr(eacr);
etr_set_sync_timeout();
} else
etr_set_tolec_timeout(now);
+out_unlock:
+ mutex_unlock(&etr_work_mutex);
}
/*
return count; /* Nothing to do. */
etr_port0_online = value;
set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events);
- schedule_work(&etr_work);
+ queue_work(time_sync_wq, &etr_work);
} else {
if (etr_port1_online == value)
return count; /* Nothing to do. */
etr_port1_online = value;
set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events);
- schedule_work(&etr_work);
+ queue_work(time_sync_wq, &etr_work);
}
return count;
}
static void *stp_page;
static void stp_work_fn(struct work_struct *work);
+static DEFINE_MUTEX(stp_work_mutex);
static DECLARE_WORK(stp_work, stp_work_fn);
static int __init early_parse_stp(char *p)
if (rc == 0)
set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags);
else if (stp_online) {
- printk(KERN_WARNING "Running on non STP capable machine.\n");
+ pr_warning("The real or virtual hardware system does "
+ "not provide an STP interface\n");
free_bootmem((unsigned long) stp_page, PAGE_SIZE);
stp_page = NULL;
stp_online = 0;
static int __init stp_init(void)
{
- if (test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags) && stp_online)
- schedule_work(&stp_work);
+ if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags))
+ return 0;
+ time_init_wq();
+ if (!stp_online)
+ return 0;
+ queue_work(time_sync_wq, &stp_work);
return 0;
}
static void stp_timing_alert(struct stp_irq_parm *intparm)
{
if (intparm->tsc || intparm->lac || intparm->tcpc)
- schedule_work(&stp_work);
+ queue_work(time_sync_wq, &stp_work);
}
/*
if (!test_bit(CLOCK_SYNC_STP, &clock_sync_flags))
return;
disable_sync_clock(NULL);
- schedule_work(&stp_work);
+ queue_work(time_sync_wq, &stp_work);
}
/*
if (!test_bit(CLOCK_SYNC_STP, &clock_sync_flags))
return;
disable_sync_clock(NULL);
- schedule_work(&stp_work);
+ queue_work(time_sync_wq, &stp_work);
}
-/*
- * STP tasklet. Check for the STP state and take over the clock
- * synchronization if the STP clock source is usable.
- */
-static void stp_work_fn(struct work_struct *work)
+
+static int stp_sync_clock(void *data)
{
- struct clock_sync_data stp_sync;
+ static int first;
unsigned long long old_clock, delta;
+ struct clock_sync_data *stp_sync;
int rc;
- if (!stp_online) {
- chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000);
- return;
- }
+ stp_sync = data;
- rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0);
- if (rc)
- return;
+ if (xchg(&first, 1) == 1) {
+ /* Slave */
+ clock_sync_cpu(stp_sync);
+ return 0;
+ }
- rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
- if (rc || stp_info.c == 0)
- return;
+ /* Wait until all other cpus entered the sync function. */
+ while (atomic_read(&stp_sync->cpus) != 0)
+ cpu_relax();
- /*
- * Catch all other cpus and make them wait until we have
- * successfully synced the clock. smp_call_function will
- * return after all other cpus are in clock_sync_cpu_start.
- */
- memset(&stp_sync, 0, sizeof(stp_sync));
- preempt_disable();
- smp_call_function(clock_sync_cpu_start, &stp_sync, 0);
- local_irq_disable();
enable_sync_clock();
set_bit(CLOCK_SYNC_STP, &clock_sync_flags);
if (test_and_clear_bit(CLOCK_SYNC_ETR, &clock_sync_flags))
- schedule_work(&etr_work);
+ queue_work(time_sync_wq, &etr_work);
rc = 0;
if (stp_info.todoff[0] || stp_info.todoff[1] ||
}
if (rc) {
disable_sync_clock(NULL);
- stp_sync.in_sync = -EAGAIN;
+ stp_sync->in_sync = -EAGAIN;
clear_bit(CLOCK_SYNC_STP, &clock_sync_flags);
if (etr_port0_online || etr_port1_online)
- schedule_work(&etr_work);
+ queue_work(time_sync_wq, &etr_work);
} else
- stp_sync.in_sync = 1;
+ stp_sync->in_sync = 1;
+ xchg(&first, 0);
+ return 0;
+}
+
+/*
+ * STP work. Check for the STP state and take over the clock
+ * synchronization if the STP clock source is usable.
+ */
+static void stp_work_fn(struct work_struct *work)
+{
+ struct clock_sync_data stp_sync;
+ int rc;
+
+ /* prevent multiple execution. */
+ mutex_lock(&stp_work_mutex);
+
+ if (!stp_online) {
+ chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000);
+ goto out_unlock;
+ }
+
+ rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0);
+ if (rc)
+ goto out_unlock;
+
+ rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi));
+ if (rc || stp_info.c == 0)
+ goto out_unlock;
+
+ memset(&stp_sync, 0, sizeof(stp_sync));
+ get_online_cpus();
+ atomic_set(&stp_sync.cpus, num_online_cpus() - 1);
+ stop_machine(stp_sync_clock, &stp_sync, &cpu_online_map);
+ put_online_cpus();
- local_irq_enable();
- smp_call_function(clock_sync_cpu_end, NULL, 0);
- preempt_enable();
+out_unlock:
+ mutex_unlock(&stp_work_mutex);
}
/*
if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags))
return -EOPNOTSUPP;
stp_online = value;
- schedule_work(&stp_work);
+ queue_work(time_sync_wq, &stp_work);
return count;
}
}
}
- static void sun4u_set_affinity(unsigned int virt_irq, cpumask_t mask)
+ static void sun4u_set_affinity(unsigned int virt_irq,
+ const struct cpumask *mask)
{
sun4u_irq_enable(virt_irq);
}
ino, err);
}
- static void sun4v_set_affinity(unsigned int virt_irq, cpumask_t mask)
+ static void sun4v_set_affinity(unsigned int virt_irq,
+ const struct cpumask *mask)
{
unsigned int ino = virt_irq_table[virt_irq].dev_ino;
unsigned long cpuid = irq_choose_cpu(virt_irq);
dev_handle, dev_ino, err);
}
- static void sun4v_virt_set_affinity(unsigned int virt_irq, cpumask_t mask)
+ static void sun4v_virt_set_affinity(unsigned int virt_irq,
+ const struct cpumask *mask)
{
unsigned long cpuid, dev_handle, dev_ino;
int err;
local_irq_restore(flags);
}
+static void unhandled_perf_irq(struct pt_regs *regs)
+{
+ unsigned long pcr, pic;
+
+ read_pcr(pcr);
+ read_pic(pic);
+
+ write_pcr(0);
+
+ printk(KERN_EMERG "CPU %d: Got unexpected perf counter IRQ.\n",
+ smp_processor_id());
+ printk(KERN_EMERG "CPU %d: PCR[%016lx] PIC[%016lx]\n",
+ smp_processor_id(), pcr, pic);
+}
+
+/* Almost a direct copy of the powerpc PMC code. */
+static DEFINE_SPINLOCK(perf_irq_lock);
+static void *perf_irq_owner_caller; /* mostly for debugging */
+static void (*perf_irq)(struct pt_regs *regs) = unhandled_perf_irq;
+
+/* Invoked from level 15 PIL handler in trap table. */
+void perfctr_irq(int irq, struct pt_regs *regs)
+{
+ clear_softint(1 << irq);
+ perf_irq(regs);
+}
+
+int register_perfctr_intr(void (*handler)(struct pt_regs *))
+{
+ int ret;
+
+ if (!handler)
+ return -EINVAL;
+
+ spin_lock(&perf_irq_lock);
+ if (perf_irq != unhandled_perf_irq) {
+ printk(KERN_WARNING "register_perfctr_intr: "
+ "perf IRQ busy (reserved by caller %p)\n",
+ perf_irq_owner_caller);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ perf_irq_owner_caller = __builtin_return_address(0);
+ perf_irq = handler;
+
+ ret = 0;
+out:
+ spin_unlock(&perf_irq_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_perfctr_intr);
+
+void release_perfctr_intr(void (*handler)(struct pt_regs *))
+{
+ spin_lock(&perf_irq_lock);
+ perf_irq_owner_caller = NULL;
+ perf_irq = unhandled_perf_irq;
+ spin_unlock(&perf_irq_lock);
+}
+EXPORT_SYMBOL_GPL(release_perfctr_intr);
+
#ifdef CONFIG_HOTPLUG_CPU
void fixup_irqs(void)
{
!(irq_desc[irq].status & IRQ_PER_CPU)) {
if (irq_desc[irq].chip->set_affinity)
irq_desc[irq].chip->set_affinity(irq,
- irq_desc[irq].affinity);
+ &irq_desc[irq].affinity);
}
spin_unlock_irqrestore(&irq_desc[irq].lock, flags);
}
if (nid != -1) {
cpumask_t numa_mask = node_to_cpumask(nid);
- irq_set_affinity(irq, numa_mask);
+ irq_set_affinity(irq, &numa_mask);
}
return irq;
irq = of_get_property(dp, "interrupts", &len);
if (irq) {
- memcpy(op->irqs, irq, len);
op->num_irqs = len / 4;
+
+ /* Prevent overrunning the op->irqs[] array. */
+ if (op->num_irqs > PROMINTR_MAX) {
+ printk(KERN_WARNING "%s: Too many irqs (%d), "
+ "limiting to %d.\n",
+ dp->full_name, op->num_irqs, PROMINTR_MAX);
+ op->num_irqs = PROMINTR_MAX;
+ }
+ memcpy(op->irqs, irq, op->num_irqs * 4);
} else {
op->num_irqs = 0;
}
- /* Prevent overrunning the op->irqs[] array. */
- if (op->num_irqs > PROMINTR_MAX) {
- printk(KERN_WARNING "%s: Too many irqs (%d), "
- "limiting to %d.\n",
- dp->full_name, op->num_irqs, PROMINTR_MAX);
- op->num_irqs = PROMINTR_MAX;
- }
-
build_device_resources(op, parent);
for (i = 0; i < op->num_irqs; i++)
op->irqs[i] = build_one_device_irq(op, parent, op->irqs[i]);
if (nid != -1) {
cpumask_t numa_mask = node_to_cpumask(nid);
- irq_set_affinity(irq, numa_mask);
+ irq_set_affinity(irq, &numa_mask);
}
err = request_irq(irq, sparc64_msiq_interrupt, 0,
"MSIQ",
unsigned char boot_cpu_id = 0;
unsigned char boot_cpu_id4 = 0; /* boot_cpu_id << 2 */
- cpumask_t cpu_online_map = CPU_MASK_NONE;
- cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
cpumask_t smp_commenced_mask = CPU_MASK_NONE;
/* The only guaranteed locking primitive available on all Sparc
instance = 0;
while (!cpu_find_by_instance(instance, NULL, &mid)) {
if (mid < NR_CPUS) {
- cpu_set(mid, phys_cpu_present_map);
+ cpu_set(mid, cpu_possible_map);
cpu_set(mid, cpu_present_map);
}
instance++;
current_thread_info()->cpu = cpuid;
cpu_set(cpuid, cpu_online_map);
- cpu_set(cpuid, phys_cpu_present_map);
+ cpu_set(cpuid, cpu_possible_map);
}
int __cpuinit __cpu_up(unsigned int cpu)
int sparc64_multi_core __read_mostly;
- cpumask_t cpu_possible_map __read_mostly = CPU_MASK_NONE;
- cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE;
DEFINE_PER_CPU(cpumask_t, cpu_sibling_map) = CPU_MASK_NONE;
cpumask_t cpu_core_map[NR_CPUS] __read_mostly =
{ [0 ... NR_CPUS-1] = CPU_MASK_NONE };
- EXPORT_SYMBOL(cpu_possible_map);
- EXPORT_SYMBOL(cpu_online_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
EXPORT_SYMBOL(cpu_core_map);
for (i = 0; i < NUM_ITERS; i++) {
t0 = tick_ops->get_tick();
go[MASTER] = 1;
- membar_storeload();
+ membar_safe("#StoreLoad");
while (!(tm = go[SLAVE]))
rmb();
go[SLAVE] = 0;
/* now let the client proceed into his loop */
go[MASTER] = 0;
- membar_storeload();
+ membar_safe("#StoreLoad");
spin_lock_irqsave(&itc_sync_lock, flags);
{
go[MASTER] = 0;
wmb();
go[SLAVE] = tick_ops->get_tick();
- membar_storeload();
+ membar_safe("#StoreLoad");
}
}
spin_unlock_irqrestore(&itc_sync_lock, flags);
/* Setup the initial cpu list. */
cnt = 0;
- for_each_cpu_mask_nr(i, *mask) {
+ for_each_cpu(i, mask) {
if (i == this_cpu || !cpu_online(i))
continue;
cpu_list[cnt++] = i;
smp_processor_id());
#endif
penguins_are_doing_time = 1;
- membar_storestore_loadstore();
atomic_inc(&smp_capture_registry);
smp_cross_call(&xcall_capture, 0, 0, 0);
while (atomic_read(&smp_capture_registry) != ncpus)
smp_processor_id());
#endif
penguins_are_doing_time = 0;
- membar_storeload_storestore();
+ membar_safe("#StoreLoad");
atomic_dec(&smp_capture_registry);
}
}
-/* Imprisoned penguins run with %pil == 15, but PSTATE_IE set, so they
- * can service tlb flush xcalls...
+/* Imprisoned penguins run with %pil == PIL_NORMAL_MAX, but PSTATE_IE
+ * set, so they can service tlb flush xcalls...
*/
extern void prom_world(int);
__asm__ __volatile__("flushw");
prom_world(1);
atomic_inc(&smp_capture_registry);
- membar_storeload_storestore();
+ membar_safe("#StoreLoad");
while (penguins_are_doing_time)
rmb();
atomic_dec(&smp_capture_registry);
extern void *__bzero(void *, size_t);
extern void *__memscan_zero(void *, size_t);
extern void *__memscan_generic(void *, int, size_t);
-extern int __memcmp(const void *, const void *, __kernel_size_t);
extern int __strncmp(const char *, const char *, __kernel_size_t);
extern int __ashrdi3(int, int);
#ifdef CONFIG_SMP
/* IRQ implementation. */
EXPORT_SYMBOL(synchronize_irq);
-
- /* CPU online map and active count. */
- EXPORT_SYMBOL(cpu_online_map);
- EXPORT_SYMBOL(phys_cpu_present_map);
#endif
EXPORT_SYMBOL(__udelay);
EXPORT_SYMBOL(__ndelay);
EXPORT_SYMBOL(rtc_lock);
-#ifdef CONFIG_SUN_AUXIO
EXPORT_SYMBOL(set_auxio);
EXPORT_SYMBOL(get_auxio);
-#endif
EXPORT_SYMBOL(io_remap_pfn_range);
#ifndef CONFIG_SMP
EXPORT_SYMBOL(__bzero);
EXPORT_SYMBOL(__memscan_zero);
EXPORT_SYMBOL(__memscan_generic);
-EXPORT_SYMBOL(__memcmp);
EXPORT_SYMBOL(__strncmp);
EXPORT_SYMBOL(__memmove);
sevt = &__get_cpu_var(sparc64_events);
memcpy(sevt, &sparc64_clockevent, sizeof(*sevt));
- sevt->cpumask = cpumask_of_cpu(smp_processor_id());
+ sevt->cpumask = cpumask_of(smp_processor_id());
clockevents_register_device(sevt);
}
config X86
def_bool y
select HAVE_AOUT if X86_32
+ select HAVE_READQ
+ select HAVE_WRITEQ
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
config GENERIC_BUG
def_bool y
depends on BUG
+ select GENERIC_BUG_RELATIVE_POINTERS if X86_64
+
+config GENERIC_BUG_RELATIVE_POINTERS
+ bool
config GENERIC_HWEIGHT
def_bool y
config SPARSE_IRQ
bool "Support sparse irq numbering"
depends on PCI_MSI || HT_IRQ
- default y
help
- This enables support for sparse irq, esp for msi/msi-x. You may need
- if you have lots of cards supports msi-x installed.
+ This enables support for sparse irqs. This is useful for distro
+ kernels that want to define a high CONFIG_NR_CPUS value but still
+ want to have low kernel memory footprint on smaller machines.
- If you don't know what to do here, say Y.
+ ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
+ out the irq_desc[] array in a more NUMA-friendly way. )
+
+ If you don't know what to do here, say N.
config NUMA_MIGRATE_IRQ_DESC
bool "Move irq desc when changing irq smp_affinity"
- depends on SPARSE_IRQ && SMP
+ depends on SPARSE_IRQ && NUMA
default n
help
This enables moving irq_desc to cpu/node that irq will use handled.
def_bool y
depends on X86_MPPARSE || X86_VOYAGER
-if ACPI
config X86_MPPARSE
- def_bool y
- bool "Enable MPS table"
+ bool "Enable MPS table" if ACPI
+ default y
depends on X86_LOCAL_APIC
help
For old smp systems that do not have proper acpi support. Newer systems
(esp with 64bit cpus) with acpi support, MADT and DSDT will override it
-endif
-
-if !ACPI
-config X86_MPPARSE
- def_bool y
- depends on X86_LOCAL_APIC
-endif
choice
prompt "Subarchitecture Type"
The HPET provides a stable time base on SMP
systems, unlike the TSC, but it is more expensive to access,
as it is off-chip. You can find the HPET spec at
- <http://www.intel.com/hardwaredesign/hpetspec.htm>.
+ <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
You can safely choose Y here. However, HPET will only be
activated if the platform and the BIOS support this feature.
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
- bool
+ def_bool y if X86_64
help
Support for software bounce buffers used on x86-64 systems
which don't have a hardware IOMMU (e.g. the current generation
config MAXSMP
bool "Configure Maximum number of SMP Processors and NUMA Nodes"
- depends on X86_64 && SMP && BROKEN
+ depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
+ select CPUMASK_OFFSTACK
default n
help
Configure maximum number of CPUS and NUMA Nodes for this architecture.
If unsure, say N.
config NR_CPUS
- int "Maximum number of CPUs (2-512)" if !MAXSMP
- range 2 512
- depends on SMP
+ int "Maximum number of CPUs" if SMP && !MAXSMP
+ range 2 512 if SMP && !MAXSMP
+ default "1" if !SMP
default "4096" if MAXSMP
- default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
- default "8"
+ default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
+ default "8" if SMP
help
This allows you to specify the maximum number of CPUs which this
kernel will support. The maximum supported value is 512 and the
def_bool y
depends on X86_32 && X86_VISWS
+config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
+ bool "Reroute for broken boot IRQs"
+ default n
+ depends on X86_IO_APIC
+ help
+ This option enables a workaround that fixes a source of
+ spurious interrupts. This is recommended when threaded
+ interrupt handling is used on systems where the generation of
+ superfluous "boot interrupts" cannot be disabled.
+
+ Some chipsets generate a legacy INTx "boot IRQ" when the IRQ
+ entry in the chipset's IO-APIC is masked (as, e.g. the RT
+ kernel does during interrupt handling). On chipsets where this
+ boot IRQ generation cannot be disabled, this workaround keeps
+ the original IRQ line masked so that only the equivalent "boot
+ IRQ" is delivered to the CPUs. The workaround also tells the
+ kernel to set up the IRQ handler on the boot IRQ line. In this
+ way only one interrupt is delivered to the kernel. Otherwise
+ the spurious second interrupt may cause the kernel to bring
+ down (vital) interrupt lines.
+
+ Only affects "broken" chipsets. Interrupt sharing may be
+ increased on these systems.
+
config X86_MCE
bool "Machine Check Exception"
depends on !X86_VOYAGER
config ARCH_PHYS_ADDR_T_64BIT
def_bool X86_64 || X86_PAE
+config DIRECT_GBPAGES
+ bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
+ default y
+ depends on X86_64
+ help
+ Allow the kernel linear mapping to use 1GB pages on CPUs that
+ support it. This can improve the kernel's performance a tiny bit by
+ reducing TLB pressure. If in doubt, say "Y".
+
# Common NUMA Features
config NUMA
- bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
+ bool "Numa Memory Allocation and Scheduler Support"
depends on SMP
depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL)
default n if X86_PC
default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP)
help
Enable NUMA (Non Uniform Memory Access) support.
+
The kernel will try to allocate memory used by a CPU on the
local memory controller of the CPU and add some more
NUMA awareness to the kernel.
- For 32-bit this is currently highly experimental and should be only
- used for kernel development. It might also cause boot failures.
- For 64-bit this is recommended on all multiprocessor Opteron systems.
- If the system is EM64T, you should say N unless your system is
- EM64T NUMA.
+ For 64-bit this is recommended if the system is Intel Core i7
+ (or later), AMD Opteron, or EM64T NUMA.
+
+ For 32-bit this is only needed on (rare) 32-bit-only platforms
+ that support NUMA topologies, such as NUMAQ / Summit, or if you
+ boot a 32-bit kernel on a 64-bit NUMA platform.
+
+ Otherwise, you should say N.
comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
def_bool y
depends on X86_64 || (X86_32 && HIGHMEM)
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+ def_bool y
+ depends on MEMORY_HOTPLUG
+
config HAVE_ARCH_EARLY_PFN_TO_NID
def_bool X86_64
depends on NUMA
# endif
#endif
-#ifdef CONFIG_IRQBALANCE
-extern int irqbalance_disable(char *str);
-#endif
-
#ifdef CONFIG_HOTPLUG_CPU
#include <linux/cpumask.h>
- extern void fixup_irqs(cpumask_t map);
+ extern void fixup_irqs(void);
#endif
extern unsigned int do_IRQ(struct pt_regs *regs);
/* Interrupt vector management */
extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
+ extern int vector_used_by_percpu_irq(unsigned int vector);
#endif /* _ASM_X86_IRQ_H */
#include <linux/module.h>
#include <linux/dmi.h>
#include <linux/dmar.h>
+#include <linux/ftrace.h>
#include <asm/atomic.h>
#include <asm/smp.h>
int first_system_vector = 0xfe;
- char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
-
/*
* Debug level, exported for io_apic.c
*/
struct clock_event_device *evt);
static void lapic_timer_setup(enum clock_event_mode mode,
struct clock_event_device *evt);
- static void lapic_timer_broadcast(cpumask_t mask);
+ static void lapic_timer_broadcast(const cpumask_t *mask);
static void apic_pm_activate(void);
/*
/*
* Local APIC timer broadcast function
*/
- static void lapic_timer_broadcast(cpumask_t mask)
+ static void lapic_timer_broadcast(const cpumask_t *mask)
{
#ifdef CONFIG_SMP
send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
memcpy(levt, &lapic_clockevent, sizeof(*levt));
- levt->cpumask = cpumask_of_cpu(smp_processor_id());
+ levt->cpumask = cpumask_of(smp_processor_id());
clockevents_register_device(levt);
}
/*
* the NMI deadlock-detector uses this.
*/
-#ifdef CONFIG_X86_64
- add_pda(apic_timer_irqs, 1);
-#else
- per_cpu(irq_stat, cpu).apic_timer_irqs++;
-#endif
+ inc_irq_stat(apic_timer_irqs);
evt->event_handler(evt);
}
* [ if a single-CPU system runs an SMP kernel then we call the local
* interrupt as well. Thus we cannot inline the local irq ... ]
*/
-void smp_apic_timer_interrupt(struct pt_regs *regs)
+void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
* Besides, if we don't timer interrupts ignore the global
* interrupt lock, which is the WrongThing (tm) to do.
*/
-#ifdef CONFIG_X86_64
exit_idle();
-#endif
irq_enter();
local_apic_timer_interrupt();
irq_exit();
{
u32 v;
-#ifdef CONFIG_X86_64
exit_idle();
-#endif
irq_enter();
/*
* Check if this really is a spurious interrupt and ACK it
if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
ack_APIC_irq();
-#ifdef CONFIG_X86_64
- add_pda(irq_spurious_count, 1);
-#else
+ inc_irq_stat(irq_spurious_count);
+
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
pr_info("spurious APIC interrupt on CPU#%d, "
"should never happen.\n", smp_processor_id());
- __get_cpu_var(irq_stat).irq_spurious_count++;
-#endif
irq_exit();
}
{
u32 v, v1;
-#ifdef CONFIG_X86_64
exit_idle();
-#endif
irq_enter();
/* First tickle the hardware, only then report what went on. -- REW */
v = apic_read(APIC_ESR);
void __cpuinit generic_processor_info(int apicid, int version)
{
int cpu;
- cpumask_t tmp_map;
/*
* Validate version
*/
if (version == 0x0) {
pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
- "fixing up to 0x10. (tell your hw vendor)\n",
- version);
+ "fixing up to 0x10. (tell your hw vendor)\n",
+ version);
version = 0x10;
}
apic_version[apicid] = version;
- if (num_processors >= NR_CPUS) {
- pr_warning("WARNING: NR_CPUS limit of %i reached."
- " Processor ignored.\n", NR_CPUS);
+ if (num_processors >= nr_cpu_ids) {
+ int max = nr_cpu_ids;
+ int thiscpu = max + disabled_cpus;
+
+ pr_warning(
+ "ACPI: NR_CPUS/possible_cpus limit of %i reached."
+ " Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+
+ disabled_cpus++;
return;
}
num_processors++;
- cpus_complement(tmp_map, cpu_present_map);
- cpu = first_cpu(tmp_map);
+ cpu = cpumask_next_zero(-1, cpu_present_mask);
physid_set(apicid, phys_cpu_present_map);
if (apicid == boot_cpu_physical_apicid) {
}
#endif
- cpu_set(cpu, cpu_possible_map);
- cpu_set(cpu, cpu_present_map);
+ set_cpu_possible(cpu, true);
+ set_cpu_present(cpu, true);
}
#ifdef CONFIG_X86_64
bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
- for (i = 0; i < NR_CPUS; i++) {
+ for (i = 0; i < nr_cpu_ids; i++) {
/* are we being called early in kernel startup? */
if (bios_cpu_apicid) {
id = bios_cpu_apicid[i];
per_cpu(cpuid4_info, cpu) = NULL;
}
- static int __cpuinit detect_cache_attributes(unsigned int cpu)
+ static void get_cpu_leaves(void *_retval)
{
- struct _cpuid4_info *this_leaf;
- unsigned long j;
- int retval;
- cpumask_t oldmask;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- per_cpu(cpuid4_info, cpu) = kzalloc(
- sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (per_cpu(cpuid4_info, cpu) == NULL)
- return -ENOMEM;
-
- oldmask = current->cpus_allowed;
- retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
- if (retval)
- goto out;
+ int j, *retval = _retval, cpu = smp_processor_id();
/* Do cpuid and store the results */
for (j = 0; j < num_cache_leaves; j++) {
+ struct _cpuid4_info *this_leaf;
this_leaf = CPUID4_INFO_IDX(cpu, j);
- retval = cpuid4_cache_lookup(j, this_leaf);
- if (unlikely(retval < 0)) {
+ *retval = cpuid4_cache_lookup(j, this_leaf);
+ if (unlikely(*retval < 0)) {
int i;
for (i = 0; i < j; i++)
}
cache_shared_cpu_map_setup(cpu, j);
}
- set_cpus_allowed_ptr(current, &oldmask);
+ }
+
+ static int __cpuinit detect_cache_attributes(unsigned int cpu)
+ {
+ int retval;
+
+ if (num_cache_leaves == 0)
+ return -ENOENT;
+
+ per_cpu(cpuid4_info, cpu) = kzalloc(
+ sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
+ if (per_cpu(cpuid4_info, cpu) == NULL)
+ return -ENOMEM;
- out:
+ smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
if (retval) {
kfree(per_cpu(cpuid4_info, cpu));
per_cpu(cpuid4_info, cpu) = NULL;
cpumask_t *mask = &this_leaf->shared_cpu_map;
n = type?
- cpulist_scnprintf(buf, len-2, *mask):
- cpumask_scnprintf(buf, len-2, *mask);
+ cpulist_scnprintf(buf, len-2, mask) :
+ cpumask_scnprintf(buf, len-2, mask);
buf[n++] = '\n';
buf[n] = '\0';
}
return show_shared_cpu_map_func(leaf, 1, buf);
}
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
- switch(this_leaf->eax.split.type) {
- case CACHE_TYPE_DATA:
+static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
+{
+ switch (this_leaf->eax.split.type) {
+ case CACHE_TYPE_DATA:
return sprintf(buf, "Data\n");
- break;
- case CACHE_TYPE_INST:
+ case CACHE_TYPE_INST:
return sprintf(buf, "Instruction\n");
- break;
- case CACHE_TYPE_UNIFIED:
+ case CACHE_TYPE_UNIFIED:
return sprintf(buf, "Unified\n");
- break;
- default:
+ default:
return sprintf(buf, "Unknown\n");
- break;
}
}
* CPU Initialization
*/
+ struct thresh_restart {
+ struct threshold_block *b;
+ int reset;
+ u16 old_limit;
+ };
+
/* must be called with correct cpu affinity */
- static void threshold_restart_bank(struct threshold_block *b,
- int reset, u16 old_limit)
+ static long threshold_restart_bank(void *_tr)
{
+ struct thresh_restart *tr = _tr;
u32 mci_misc_hi, mci_misc_lo;
- rdmsr(b->address, mci_misc_lo, mci_misc_hi);
+ rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
- if (b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
- reset = 1; /* limit cannot be lower than err count */
+ if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+ tr->reset = 1; /* limit cannot be lower than err count */
- if (reset) { /* reset err count and overflow bit */
+ if (tr->reset) { /* reset err count and overflow bit */
mci_misc_hi =
(mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
- (THRESHOLD_MAX - b->threshold_limit);
- } else if (old_limit) { /* change limit w/o reset */
+ (THRESHOLD_MAX - tr->b->threshold_limit);
+ } else if (tr->old_limit) { /* change limit w/o reset */
int new_count = (mci_misc_hi & THRESHOLD_MAX) +
- (old_limit - b->threshold_limit);
+ (tr->old_limit - tr->b->threshold_limit);
mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
(new_count & THRESHOLD_MAX);
}
- b->interrupt_enable ?
+ tr->b->interrupt_enable ?
(mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
(mci_misc_hi &= ~MASK_INT_TYPE_HI);
mci_misc_hi |= MASK_COUNT_EN_HI;
- wrmsr(b->address, mci_misc_lo, mci_misc_hi);
+ wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+ return 0;
}
/* cpu init entry point, called from mce.c with preempt off */
unsigned int cpu = smp_processor_id();
u8 lvt_off;
u32 low = 0, high = 0, address = 0;
+ struct thresh_restart tr;
for (bank = 0; bank < NR_BANKS; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
wrmsr(address, low, high);
threshold_defaults.address = address;
- threshold_restart_bank(&threshold_defaults, 0, 0);
+ tr.b = &threshold_defaults;
+ tr.reset = 0;
+ tr.old_limit = 0;
+ threshold_restart_bank(&tr);
}
}
}
}
}
out:
- add_pda(irq_threshold_count, 1);
+ inc_irq_stat(irq_threshold_count);
irq_exit();
}
ssize_t(*store) (struct threshold_block *, const char *, size_t count);
};
- static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
- cpumask_t *newmask)
- {
- *oldmask = current->cpus_allowed;
- cpus_clear(*newmask);
- cpu_set(cpu, *newmask);
- set_cpus_allowed_ptr(current, newmask);
- }
-
- static void affinity_restore(const cpumask_t *oldmask)
- {
- set_cpus_allowed_ptr(current, oldmask);
- }
-
#define SHOW_FIELDS(name) \
static ssize_t show_ ## name(struct threshold_block * b, char *buf) \
{ \
const char *buf, size_t count)
{
char *end;
- cpumask_t oldmask, newmask;
+ struct thresh_restart tr;
unsigned long new = simple_strtoul(buf, &end, 0);
if (end == buf)
return -EINVAL;
b->interrupt_enable = !!new;
- affinity_set(b->cpu, &oldmask, &newmask);
- threshold_restart_bank(b, 0, 0);
- affinity_restore(&oldmask);
+ tr.b = b;
+ tr.reset = 0;
+ tr.old_limit = 0;
+ work_on_cpu(b->cpu, threshold_restart_bank, &tr);
return end - buf;
}
const char *buf, size_t count)
{
char *end;
- cpumask_t oldmask, newmask;
- u16 old;
+ struct thresh_restart tr;
unsigned long new = simple_strtoul(buf, &end, 0);
if (end == buf)
return -EINVAL;
new = THRESHOLD_MAX;
if (new < 1)
new = 1;
- old = b->threshold_limit;
+ tr.old_limit = b->threshold_limit;
b->threshold_limit = new;
+ tr.b = b;
+ tr.reset = 0;
- affinity_set(b->cpu, &oldmask, &newmask);
- threshold_restart_bank(b, 0, old);
- affinity_restore(&oldmask);
+ work_on_cpu(b->cpu, threshold_restart_bank, &tr);
return end - buf;
}
- static ssize_t show_error_count(struct threshold_block *b, char *buf)
+ static long local_error_count(void *_b)
{
- u32 high, low;
- cpumask_t oldmask, newmask;
- affinity_set(b->cpu, &oldmask, &newmask);
+ struct threshold_block *b = _b;
+ u32 low, high;
+
rdmsr(b->address, low, high);
- affinity_restore(&oldmask);
- return sprintf(buf, "%x\n",
- (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
+ return (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
+ }
+
+ static ssize_t show_error_count(struct threshold_block *b, char *buf)
+ {
+ return sprintf(buf, "%lx\n", work_on_cpu(b->cpu, local_error_count, b));
}
static ssize_t store_error_count(struct threshold_block *b,
const char *buf, size_t count)
{
- cpumask_t oldmask, newmask;
- affinity_set(b->cpu, &oldmask, &newmask);
- threshold_restart_bank(b, 1, 0);
- affinity_restore(&oldmask);
+ struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
+
+ work_on_cpu(b->cpu, threshold_restart_bank, &tr);
return 1;
}
return err;
}
+ static long local_allocate_threshold_blocks(void *_bank)
+ {
+ unsigned int *bank = _bank;
+
+ return allocate_threshold_blocks(smp_processor_id(), *bank, 0,
+ MSR_IA32_MC0_MISC + *bank * 4);
+ }
+
/* symlinks sibling shared banks to first core. first core owns dir/files. */
static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
{
int i, err = 0;
struct threshold_bank *b = NULL;
- cpumask_t oldmask, newmask;
char name[32];
sprintf(name, "threshold_bank%i", bank);
per_cpu(threshold_banks, cpu)[bank] = b;
- affinity_set(cpu, &oldmask, &newmask);
- err = allocate_threshold_blocks(cpu, bank, 0,
- MSR_IA32_MC0_MISC + bank * 4);
- affinity_restore(&oldmask);
-
+ err = work_on_cpu(cpu, local_allocate_threshold_blocks, &bank);
if (err)
goto out_free;
#include <linux/kernel.h>
#include <linux/threads.h>
+#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/hardirq.h>
+#include <linux/timer.h>
+#include <linux/proc_fs.h>
+#include <asm/current.h>
#include <asm/smp.h>
#include <asm/ipi.h>
#include <asm/genapic.h>
/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
- static cpumask_t uv_target_cpus(void)
+ static const struct cpumask *uv_target_cpus(void)
{
- return cpumask_of_cpu(0);
+ return cpumask_of(0);
}
- static cpumask_t uv_vector_allocation_domain(int cpu)
+ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
{
- cpumask_t domain = CPU_MASK_NONE;
- cpu_set(cpu, domain);
- return domain;
+ cpumask_clear(retmask);
+ cpumask_set_cpu(cpu, retmask);
}
int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
- static void uv_send_IPI_mask(cpumask_t mask, int vector)
+ static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
{
unsigned int cpu;
- for_each_possible_cpu(cpu)
- if (cpu_isset(cpu, mask))
+ for_each_cpu(cpu, mask)
+ uv_send_IPI_one(cpu, vector);
+ }
+
+ static void uv_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
+ {
+ unsigned int cpu;
+ unsigned int this_cpu = smp_processor_id();
+
+ for_each_cpu(cpu, mask)
+ if (cpu != this_cpu)
uv_send_IPI_one(cpu, vector);
}
static void uv_send_IPI_allbutself(int vector)
{
- cpumask_t mask = cpu_online_map;
-
- cpu_clear(smp_processor_id(), mask);
+ unsigned int cpu;
+ unsigned int this_cpu = smp_processor_id();
- if (!cpus_empty(mask))
- uv_send_IPI_mask(mask, vector);
+ for_each_online_cpu(cpu)
+ if (cpu != this_cpu)
+ uv_send_IPI_one(cpu, vector);
}
static void uv_send_IPI_all(int vector)
{
- uv_send_IPI_mask(cpu_online_map, vector);
+ uv_send_IPI_mask(cpu_online_mask, vector);
}
static int uv_apic_id_registered(void)
{
}
- static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
+ static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
{
int cpu;
* We're using fixed IRQ delivery, can only return one phys APIC ID.
* May as well be the first.
*/
- cpu = first_cpu(cpumask);
+ cpu = cpumask_first(cpumask);
if ((unsigned)cpu < nr_cpu_ids)
return per_cpu(x86_cpu_to_apicid, cpu);
else
return BAD_APICID;
}
+ static unsigned int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
+ const struct cpumask *andmask)
+ {
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ for_each_cpu_and(cpu, cpumask, andmask)
+ if (cpumask_test_cpu(cpu, cpu_online_mask))
+ break;
+ if (cpu < nr_cpu_ids)
+ return per_cpu(x86_cpu_to_apicid, cpu);
+ return BAD_APICID;
+ }
+
static unsigned int get_apic_id(unsigned long x)
{
unsigned int id;
.send_IPI_all = uv_send_IPI_all,
.send_IPI_allbutself = uv_send_IPI_allbutself,
.send_IPI_mask = uv_send_IPI_mask,
+ .send_IPI_mask_allbutself = uv_send_IPI_mask_allbutself,
.send_IPI_self = uv_send_IPI_self,
.cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
+ .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
.phys_pkg_id = phys_pkg_id,
.get_apic_id = get_apic_id,
.set_apic_id = set_apic_id,
sn_rtc_cycles_per_second = ticks_per_sec;
}
+/*
+ * percpu heartbeat timer
+ */
+static void uv_heartbeat(unsigned long ignored)
+{
+ struct timer_list *timer = &uv_hub_info->scir.timer;
+ unsigned char bits = uv_hub_info->scir.state;
+
+ /* flip heartbeat bit */
+ bits ^= SCIR_CPU_HEARTBEAT;
+
+ /* is this cpu idle? */
+ if (idle_cpu(raw_smp_processor_id()))
+ bits &= ~SCIR_CPU_ACTIVITY;
+ else
+ bits |= SCIR_CPU_ACTIVITY;
+
+ /* update system controller interface reg */
+ uv_set_scir_bits(bits);
+
+ /* enable next timer period */
+ mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL);
+}
+
+static void __cpuinit uv_heartbeat_enable(int cpu)
+{
+ if (!uv_cpu_hub_info(cpu)->scir.enabled) {
+ struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
+
+ uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
+ setup_timer(timer, uv_heartbeat, cpu);
+ timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
+ add_timer_on(timer, cpu);
+ uv_cpu_hub_info(cpu)->scir.enabled = 1;
+ }
+
+ /* check boot cpu */
+ if (!uv_cpu_hub_info(0)->scir.enabled)
+ uv_heartbeat_enable(0);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __cpuinit uv_heartbeat_disable(int cpu)
+{
+ if (uv_cpu_hub_info(cpu)->scir.enabled) {
+ uv_cpu_hub_info(cpu)->scir.enabled = 0;
+ del_timer(&uv_cpu_hub_info(cpu)->scir.timer);
+ }
+ uv_set_cpu_scir_bits(cpu, 0xff);
+}
+
+/*
+ * cpu hotplug notifier
+ */
+static __cpuinit int uv_scir_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+
+ switch (action) {
+ case CPU_ONLINE:
+ uv_heartbeat_enable(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ uv_heartbeat_disable(cpu);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static __init void uv_scir_register_cpu_notifier(void)
+{
+ hotcpu_notifier(uv_scir_cpu_notify, 0);
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+
+static __init void uv_scir_register_cpu_notifier(void)
+{
+}
+
+static __init int uv_init_heartbeat(void)
+{
+ int cpu;
+
+ if (is_uv_system())
+ for_each_online_cpu(cpu)
+ uv_heartbeat_enable(cpu);
+ return 0;
+}
+
+late_initcall(uv_init_heartbeat);
+
+#endif /* !CONFIG_HOTPLUG_CPU */
+
/*
* Called on each cpu to initialize the per_cpu UV data area.
* ZZZ hotplug not supported yet
uv_bios_init();
uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
- &uv_coherency_id, &uv_region_size);
+ &sn_coherency_id, &sn_region_size);
uv_rtc_init();
for_each_present_cpu(cpu) {
uv_blade_info[blade].nr_possible_cpus++;
uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base;
- uv_cpu_hub_info(cpu)->lowmem_remap_top =
- lowmem_redir_base + lowmem_redir_size;
+ uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size;
uv_cpu_hub_info(cpu)->m_val = m_val;
uv_cpu_hub_info(cpu)->n_val = m_val;
uv_cpu_hub_info(cpu)->numa_blade_id = blade;
uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
- uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id;
+ uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
+ uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
max_pnode = max(pnode, max_pnode);
map_mmioh_high(max_pnode);
uv_cpu_init();
+ uv_scir_register_cpu_notifier();
+ proc_mkdir("sgi_uv", NULL);
}
* HPET address is set in acpi/boot.c, when an ACPI entry exists
*/
unsigned long hpet_address;
-unsigned long hpet_num_timers;
+#ifdef CONFIG_PCI_MSI
+static unsigned long hpet_num_timers;
+#endif
static void __iomem *hpet_virt_address;
struct hpet_dev {
* Start hpet with the boot cpu mask and make it
* global after the IO_APIC has been initialized.
*/
- hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
+ hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
clockevents_register_device(&hpet_clockevent);
global_clock_event = &hpet_clockevent;
printk(KERN_DEBUG "hpet clockevent registered\n");
struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
hpet_setup_msi_irq(hdev->irq);
disable_irq(hdev->irq);
- irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
+ irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
enable_irq(hdev->irq);
}
break;
return -1;
disable_irq(dev->irq);
- irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
+ irq_set_affinity(dev->irq, cpumask_of(dev->cpu));
enable_irq(dev->irq);
printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
/* 5 usec minimum reprogramming delta. */
evt->min_delta_ns = 5000;
- evt->cpumask = cpumask_of_cpu(hdev->cpu);
+ evt->cpumask = cpumask_of(hdev->cpu);
clockevents_register_device(evt);
}
out_nohpet:
hpet_clear_mapping();
- boot_hpet_disable = 1;
+ hpet_address = 0;
return 0;
}
hpet_address = force_hpet_address;
hpet_enable();
- if (!hpet_virt_address)
- return -ENODEV;
}
+ if (!hpet_virt_address)
+ return -ENODEV;
+
hpet_reserve_platform_timers(hpet_readl(HPET_ID));
for_each_online_cpu(cpu) {
struct irq_cfg {
struct irq_pin_list *irq_2_pin;
- cpumask_t domain;
- cpumask_t old_domain;
+ cpumask_var_t domain;
+ cpumask_var_t old_domain;
unsigned move_cleanup_count;
u8 vector;
u8 move_in_progress : 1;
#else
static struct irq_cfg irq_cfgx[NR_IRQS] = {
#endif
- [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
- [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
- [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
- [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
- [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
- [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
- [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
- [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
- [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
- [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
- [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
- [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
- [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
- [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
- [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
- [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+ [0] = { .vector = IRQ0_VECTOR, },
+ [1] = { .vector = IRQ1_VECTOR, },
+ [2] = { .vector = IRQ2_VECTOR, },
+ [3] = { .vector = IRQ3_VECTOR, },
+ [4] = { .vector = IRQ4_VECTOR, },
+ [5] = { .vector = IRQ5_VECTOR, },
+ [6] = { .vector = IRQ6_VECTOR, },
+ [7] = { .vector = IRQ7_VECTOR, },
+ [8] = { .vector = IRQ8_VECTOR, },
+ [9] = { .vector = IRQ9_VECTOR, },
+ [10] = { .vector = IRQ10_VECTOR, },
+ [11] = { .vector = IRQ11_VECTOR, },
+ [12] = { .vector = IRQ12_VECTOR, },
+ [13] = { .vector = IRQ13_VECTOR, },
+ [14] = { .vector = IRQ14_VECTOR, },
+ [15] = { .vector = IRQ15_VECTOR, },
};
-void __init arch_early_irq_init(void)
+int __init arch_early_irq_init(void)
{
struct irq_cfg *cfg;
struct irq_desc *desc;
for (i = 0; i < count; i++) {
desc = irq_to_desc(i);
desc->chip_data = &cfg[i];
+ alloc_bootmem_cpumask_var(&cfg[i].domain);
+ alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+ if (i < NR_IRQS_LEGACY)
+ cpumask_setall(cfg[i].domain);
}
+
+ return 0;
}
#ifdef CONFIG_SPARSE_IRQ
node = cpu_to_node(cpu);
cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
+ if (cfg) {
+ /* FIXME: needs alloc_cpumask_var_node() */
+ if (!alloc_cpumask_var(&cfg->domain, GFP_ATOMIC)) {
+ kfree(cfg);
+ cfg = NULL;
+ } else if (!alloc_cpumask_var(&cfg->old_domain, GFP_ATOMIC)) {
+ free_cpumask_var(cfg->domain);
+ kfree(cfg);
+ cfg = NULL;
+ } else {
+ cpumask_clear(cfg->domain);
+ cpumask_clear(cfg->old_domain);
+ }
+ }
printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node);
return cfg;
}
-void arch_init_chip_data(struct irq_desc *desc, int cpu)
+int arch_init_chip_data(struct irq_desc *desc, int cpu)
{
struct irq_cfg *cfg;
BUG_ON(1);
}
}
+
+ return 0;
}
#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
}
}
- static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+ static void
+ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg = desc->chip_data;
if (!cfg->move_in_progress) {
/* it means that domain is not changed */
- if (!cpus_intersects(desc->affinity, mask))
+ if (!cpumask_intersects(&desc->affinity, mask))
cfg->move_desc_pending = 1;
}
}
#endif
#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
- static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+ static inline void
+ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
{
}
#endif
}
#ifdef CONFIG_SMP
+ static void send_cleanup_vector(struct irq_cfg *cfg)
+ {
+ cpumask_var_t cleanup_mask;
+
+ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+ unsigned int i;
+ cfg->move_cleanup_count = 0;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ cfg->move_cleanup_count++;
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+ } else {
+ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+ send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ free_cpumask_var(cleanup_mask);
+ }
+ cfg->move_in_progress = 0;
+ }
+
static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
{
int apic, pin;
}
}
- static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
+ static int
+ assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
- static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
+ /*
+ * Either sets desc->affinity to a valid value, and returns cpu_mask_to_apicid
+ * of that, or returns BAD_APICID and leaves desc->affinity untouched.
+ */
+ static unsigned int
+ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
- unsigned long flags;
- unsigned int dest;
- cpumask_t tmp;
unsigned int irq;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- return;
+ if (!cpumask_intersects(mask, cpu_online_mask))
+ return BAD_APICID;
irq = desc->irq;
cfg = desc->chip_data;
if (assign_irq_vector(irq, cfg, mask))
- return;
+ return BAD_APICID;
+ cpumask_and(&desc->affinity, cfg->domain, mask);
set_extra_move_desc(desc, mask);
+ return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+ }
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
- /*
- * Only the high 8 bits are valid.
- */
- dest = SET_APIC_LOGICAL_ID(dest);
+ static void
+ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+ {
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ unsigned int dest;
+ unsigned int irq;
+
+ irq = desc->irq;
+ cfg = desc->chip_data;
spin_lock_irqsave(&ioapic_lock, flags);
- __target_IO_APIC_irq(irq, dest, cfg);
- desc->affinity = mask;
+ dest = set_desc_affinity(desc, mask);
+ if (dest != BAD_APICID) {
+ /* Only the high 8 bits are valid. */
+ dest = SET_APIC_LOGICAL_ID(dest);
+ __target_IO_APIC_irq(irq, dest, cfg);
+ }
spin_unlock_irqrestore(&ioapic_lock, flags);
}
- static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+ static void
+ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc;
spin_unlock(&vector_lock);
}
- static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
+ static int
+ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
/*
* NOTE! The local APIC isn't very good at handling
*/
static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
unsigned int old_vector;
- int cpu;
+ int cpu, err;
+ cpumask_var_t tmp_mask;
if ((cfg->move_in_progress) || cfg->move_cleanup_count)
return -EBUSY;
- /* Only try and allocate irqs on cpus that are present */
- cpus_and(mask, mask, cpu_online_map);
+ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+ return -ENOMEM;
old_vector = cfg->vector;
if (old_vector) {
- cpumask_t tmp;
- cpus_and(tmp, cfg->domain, mask);
- if (!cpus_empty(tmp))
+ cpumask_and(tmp_mask, mask, cpu_online_mask);
+ cpumask_and(tmp_mask, cfg->domain, tmp_mask);
+ if (!cpumask_empty(tmp_mask)) {
+ free_cpumask_var(tmp_mask);
return 0;
+ }
}
- for_each_cpu_mask_nr(cpu, mask) {
- cpumask_t domain, new_mask;
+ /* Only try and allocate irqs on cpus that are present */
+ err = -ENOSPC;
+ for_each_cpu_and(cpu, mask, cpu_online_mask) {
int new_cpu;
int vector, offset;
- domain = vector_allocation_domain(cpu);
- cpus_and(new_mask, domain, cpu_online_map);
+ vector_allocation_domain(cpu, tmp_mask);
vector = current_vector;
offset = current_offset;
next:
vector += 8;
if (vector >= first_system_vector) {
- /* If we run out of vectors on large boxen, must share them. */
+ /* If out of vectors on large boxen, must share them. */
offset = (offset + 1) % 8;
vector = FIRST_DEVICE_VECTOR + offset;
}
if (unlikely(current_vector == vector))
continue;
- #ifdef CONFIG_X86_64
- if (vector == IA32_SYSCALL_VECTOR)
- goto next;
- #else
- if (vector == SYSCALL_VECTOR)
+
+ if (test_bit(vector, used_vectors))
goto next;
- #endif
- for_each_cpu_mask_nr(new_cpu, new_mask)
+
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
if (per_cpu(vector_irq, new_cpu)[vector] != -1)
goto next;
/* Found one! */
current_offset = offset;
if (old_vector) {
cfg->move_in_progress = 1;
- cfg->old_domain = cfg->domain;
+ cpumask_copy(cfg->old_domain, cfg->domain);
}
- for_each_cpu_mask_nr(new_cpu, new_mask)
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
per_cpu(vector_irq, new_cpu)[vector] = irq;
cfg->vector = vector;
- cfg->domain = domain;
- return 0;
+ cpumask_copy(cfg->domain, tmp_mask);
+ err = 0;
+ break;
}
- return -ENOSPC;
+ free_cpumask_var(tmp_mask);
+ return err;
}
- static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
+ static int
+ assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
int err;
unsigned long flags;
static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
{
- cpumask_t mask;
int cpu, vector;
BUG_ON(!cfg->vector);
vector = cfg->vector;
- cpus_and(mask, cfg->domain, cpu_online_map);
- for_each_cpu_mask_nr(cpu, mask)
+ for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
per_cpu(vector_irq, cpu)[vector] = -1;
cfg->vector = 0;
- cpus_clear(cfg->domain);
+ cpumask_clear(cfg->domain);
if (likely(!cfg->move_in_progress))
return;
- cpus_and(mask, cfg->old_domain, cpu_online_map);
- for_each_cpu_mask_nr(cpu, mask) {
+ for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
vector++) {
if (per_cpu(vector_irq, cpu)[vector] != irq)
/* Mark the inuse vectors */
for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
cfg = desc->chip_data;
- if (!cpu_isset(cpu, cfg->domain))
+ if (!cpumask_test_cpu(cpu, cfg->domain))
continue;
vector = cfg->vector;
per_cpu(vector_irq, cpu)[vector] = irq;
continue;
cfg = irq_cfg(irq);
- if (!cpu_isset(cpu, cfg->domain))
+ if (!cpumask_test_cpu(cpu, cfg->domain))
per_cpu(vector_irq, cpu)[vector] = -1;
}
}
{
struct irq_cfg *cfg;
struct IO_APIC_route_entry entry;
- cpumask_t mask;
+ unsigned int dest;
if (!IO_APIC_IRQ(irq))
return;
cfg = desc->chip_data;
- mask = TARGET_CPUS;
- if (assign_irq_vector(irq, cfg, mask))
+ if (assign_irq_vector(irq, cfg, TARGET_CPUS))
return;
- cpus_and(mask, cfg->domain, mask);
+ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
apic_printk(APIC_VERBOSE,KERN_DEBUG
"IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
- cpu_mask_to_apicid(mask), trigger, polarity,
- cfg->vector)) {
+ dest, trigger, polarity, cfg->vector)) {
printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
mp_ioapics[apic].mp_apicid, pin);
__clear_irq_vector(irq, cfg);
for_each_irq_desc(irq, desc) {
struct irq_pin_list *entry;
- if (!desc)
- continue;
cfg = desc->chip_data;
entry = cfg->irq_2_pin;
if (!entry)
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
+ send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
return 1;
* as simple as edge triggered migration and we can do the irq migration
* with a simple atomic update to IO-APIC RTE.
*/
- static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
+ static void
+ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
{
struct irq_cfg *cfg;
- cpumask_t tmp, cleanup_mask;
struct irte irte;
int modify_ioapic_rte;
unsigned int dest;
unsigned long flags;
unsigned int irq;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ if (!cpumask_intersects(mask, cpu_online_mask))
return;
irq = desc->irq;
set_extra_move_desc(desc, mask);
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
+ dest = cpu_mask_to_apicid_and(cfg->domain, mask);
modify_ioapic_rte = desc->status & IRQ_LEVEL;
if (modify_ioapic_rte) {
*/
modify_irte(irq, &irte);
- if (cfg->move_in_progress) {
- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- cfg->move_in_progress = 0;
- }
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
- desc->affinity = mask;
+ cpumask_copy(&desc->affinity, mask);
}
static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
}
/* everthing is clear. we have right of way */
- migrate_ioapic_irq_desc(desc, desc->pending_mask);
+ migrate_ioapic_irq_desc(desc, &desc->pending_mask);
ret = 0;
desc->status &= ~IRQ_MOVE_PENDING;
- cpus_clear(desc->pending_mask);
+ cpumask_clear(&desc->pending_mask);
unmask:
unmask_IO_APIC_irq_desc(desc);
struct irq_desc *desc;
for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
-
if (desc->status & IRQ_MOVE_PENDING) {
unsigned long flags;
continue;
}
- desc->chip->set_affinity(irq, desc->pending_mask);
+ desc->chip->set_affinity(irq, &desc->pending_mask);
spin_unlock_irqrestore(&desc->lock, flags);
}
}
/*
* Migrates the IRQ destination in the process context.
*/
- static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, cpumask_t mask)
+ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+ const struct cpumask *mask)
{
if (desc->status & IRQ_LEVEL) {
desc->status |= IRQ_MOVE_PENDING;
- desc->pending_mask = mask;
+ cpumask_copy(&desc->pending_mask, mask);
migrate_irq_remapped_level_desc(desc);
return;
}
migrate_ioapic_irq_desc(desc, mask);
}
- static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+ static void set_ir_ioapic_affinity_irq(unsigned int irq,
+ const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
asmlinkage void smp_irq_move_cleanup_interrupt(void)
{
unsigned vector, me;
+
ack_APIC_irq();
-#ifdef CONFIG_X86_64
exit_idle();
-#endif
irq_enter();
me = smp_processor_id();
if (!cfg->move_cleanup_count)
goto unlock;
- if ((vector == cfg->vector) && cpu_isset(me, cfg->domain))
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
goto unlock;
__get_cpu_var(vector_irq)[vector] = -1;
if (likely(!cfg->move_desc_pending))
return;
- /* domain is not change, but affinity is changed */
+ /* domain has not changed, but affinity did */
me = smp_processor_id();
if (cpu_isset(me, desc->affinity)) {
*descp = desc = move_irq_desc(desc, me);
vector = ~get_irq_regs()->orig_ax;
me = smp_processor_id();
- if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
- cpumask_t cleanup_mask;
-
#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
*descp = desc = move_irq_desc(desc, me);
/* get the new one */
cfg = desc->chip_data;
#endif
- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- cfg->move_in_progress = 0;
- }
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+ send_cleanup_vector(cfg);
}
#else
static inline void irq_complete_move(struct irq_desc **descp) {}
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
-
cfg = desc->chip_data;
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
struct irq_cfg *cfg;
int err;
unsigned dest;
- cpumask_t tmp;
cfg = irq_cfg(irq);
- tmp = TARGET_CPUS;
- err = assign_irq_vector(irq, cfg, tmp);
+ err = assign_irq_vector(irq, cfg, TARGET_CPUS);
if (err)
return err;
- cpus_and(tmp, cfg->domain, tmp);
- dest = cpu_mask_to_apicid(tmp);
+ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
#ifdef CONFIG_INTR_REMAP
if (irq_remapped(irq)) {
}
#ifdef CONFIG_SMP
- static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
- cpumask_t tmp;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return;
-
- set_extra_move_desc(desc, mask);
-
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
read_msi_msg_desc(desc, &msg);
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
write_msi_msg_desc(desc, &msg);
- desc->affinity = mask;
}
#ifdef CONFIG_INTR_REMAP
/*
* Migrate the MSI irq to another cpumask. This migration is
* done in the process context using interrupt-remapping hardware.
*/
- static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+ static void
+ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
- struct irq_cfg *cfg;
+ struct irq_cfg *cfg = desc->chip_data;
unsigned int dest;
- cpumask_t tmp, cleanup_mask;
struct irte irte;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
- return;
-
if (get_irte(irq, &irte))
return;
- cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
- set_extra_move_desc(desc, mask);
-
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
-
irte.vector = cfg->vector;
irte.dest_id = IRTE_DEST(dest);
* at the new destination. So, time to cleanup the previous
* vector allocation.
*/
- if (cfg->move_in_progress) {
- cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
- cfg->move_cleanup_count = cpus_weight(cleanup_mask);
- send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- cfg->move_in_progress = 0;
- }
-
- desc->affinity = mask;
+ if (cfg->move_in_progress)
+ send_cleanup_vector(cfg);
}
#endif
#ifdef CONFIG_DMAR
#ifdef CONFIG_SMP
- static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
+ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
- cpumask_t tmp;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return;
-
- set_extra_move_desc(desc, mask);
-
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
dmar_msi_read(irq, &msg);
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
dmar_msi_write(irq, &msg);
- desc->affinity = mask;
}
#endif /* CONFIG_SMP */
#ifdef CONFIG_HPET_TIMER
#ifdef CONFIG_SMP
- static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
struct msi_msg msg;
unsigned int dest;
- cpumask_t tmp;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return;
-
- set_extra_move_desc(desc, mask);
-
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
hpet_msi_read(irq, &msg);
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
hpet_msi_write(irq, &msg);
- desc->affinity = mask;
}
#endif /* CONFIG_SMP */
write_ht_irq_msg(irq, &msg);
}
- static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
+ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
struct irq_cfg *cfg;
unsigned int dest;
- cpumask_t tmp;
- cpus_and(tmp, mask, cpu_online_map);
- if (cpus_empty(tmp))
+ dest = set_desc_affinity(desc, mask);
+ if (dest == BAD_APICID)
return;
cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return;
-
- set_extra_move_desc(desc, mask);
-
- cpus_and(tmp, cfg->domain, mask);
- dest = cpu_mask_to_apicid(tmp);
target_ht_irq(irq, dest, cfg->vector);
- desc->affinity = mask;
}
#endif
{
struct irq_cfg *cfg;
int err;
- cpumask_t tmp;
cfg = irq_cfg(irq);
- tmp = TARGET_CPUS;
- err = assign_irq_vector(irq, cfg, tmp);
+ err = assign_irq_vector(irq, cfg, TARGET_CPUS);
if (!err) {
struct ht_irq_msg msg;
unsigned dest;
- cpus_and(tmp, cfg->domain, tmp);
- dest = cpu_mask_to_apicid(tmp);
+ dest = cpu_mask_to_apicid_and(cfg->domain, TARGET_CPUS);
msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
unsigned long mmr_offset)
{
- const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
+ const struct cpumask *eligible_cpu = cpumask_of(cpu);
struct irq_cfg *cfg;
int mmr_pnode;
unsigned long mmr_value;
cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, *eligible_cpu);
+ err = assign_irq_vector(irq, cfg, eligible_cpu);
if (err != 0)
return err;
entry->polarity = 0;
entry->trigger = 0;
entry->mask = 0;
- entry->dest = cpu_mask_to_apicid(*eligible_cpu);
+ entry->dest = cpu_mask_to_apicid(eligible_cpu);
mmr_pnode = uv_blade_to_pnode(mmr_blade);
uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
int pin, ioapic, irq, irq_entry;
struct irq_desc *desc;
struct irq_cfg *cfg;
- cpumask_t mask;
+ const struct cpumask *mask;
if (skip_ioapic_setup == 1)
return;
*/
if (desc->status &
(IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
- mask = desc->affinity;
+ mask = &desc->affinity;
else
mask = TARGET_CPUS;
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/delay.h>
+#include <linux/ftrace.h>
#include <asm/uaccess.h>
#include <asm/io_apic.h>
#include <asm/idle.h>
#include <asm/smp.h>
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
/*
* Probabilistic stack overflow check:
*
*/
static inline void stack_overflow_check(struct pt_regs *regs)
{
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
u64 curbase = (u64)task_stack_page(current);
- static unsigned long warned = -60*HZ;
-
- if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE &&
- regs->sp < curbase + sizeof(struct thread_info) + 128 &&
- time_after(jiffies, warned + 60*HZ)) {
- printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
- current->comm, curbase, regs->sp);
- show_stack(NULL,NULL);
- warned = jiffies;
- }
-}
+
+ WARN_ONCE(regs->sp >= curbase &&
+ regs->sp <= curbase + THREAD_SIZE &&
+ regs->sp < curbase + sizeof(struct thread_info) +
+ sizeof(struct pt_regs) + 128,
+
+ "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
+ current->comm, curbase, regs->sp);
#endif
+}
/*
* do_IRQ handles all normal device IRQ's (the special
* SMP cross-CPU interrupts have their own specific
* handlers).
*/
-asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
+asmlinkage unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
struct irq_desc *desc;
irq_enter();
irq = __get_cpu_var(vector_irq)[vector];
-#ifdef CONFIG_DEBUG_STACKOVERFLOW
stack_overflow_check(regs);
-#endif
desc = irq_to_desc(irq);
if (likely(desc))
}
#ifdef CONFIG_HOTPLUG_CPU
- void fixup_irqs(cpumask_t map)
+ /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
+ void fixup_irqs(void)
{
unsigned int irq;
static int warned;
struct irq_desc *desc;
for_each_irq_desc(irq, desc) {
- cpumask_t mask;
int break_affinity = 0;
int set_affinity = 1;
+ const struct cpumask *affinity;
if (!desc)
continue;
/* interrupt's are disabled at this point */
spin_lock(&desc->lock);
+ affinity = &desc->affinity;
if (!irq_has_action(irq) ||
- cpus_equal(desc->affinity, map)) {
+ cpumask_equal(affinity, cpu_online_mask)) {
spin_unlock(&desc->lock);
continue;
}
- cpus_and(mask, desc->affinity, map);
- if (cpus_empty(mask)) {
+ if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
break_affinity = 1;
- mask = map;
+ affinity = cpu_all_mask;
}
if (desc->chip->mask)
desc->chip->mask(irq);
if (desc->chip->set_affinity)
- desc->chip->set_affinity(irq, mask);
+ desc->chip->set_affinity(irq, affinity);
else if (!(warned++))
set_affinity = 0;
[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
};
+ int vector_used_by_percpu_irq(unsigned int vector)
+ {
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (per_cpu(vector_irq, cpu)[vector] != -1)
+ return 1;
+ }
+
+ return 0;
+ }
+
/* Overridden in paravirt.c */
void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
/* SYSCALL_VECTOR was reserved in trap_init. */
if (i != SYSCALL_VECTOR)
- set_intr_gate(i, interrupt[i]);
+ set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
}
alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
/* IPI for single call function */
- set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
+ alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+ call_function_single_interrupt);
/* Low priority IPI to cleanup after moving an irq */
set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+ set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
#endif
#ifdef CONFIG_X86_LOCAL_APIC
#include <asm/apic.h>
#include <asm/i8259.h>
-/*
- * Common place to define all x86 IRQ vectors
- *
- * This builds up the IRQ handler stubs using some ugly macros in irq.h
- *
- * These macros create the low-level assembly IRQ routines that save
- * register context and call do_IRQ(). do_IRQ() then does all the
- * operations that are needed to keep the AT (or SMP IOAPIC)
- * interrupt-controller happy.
- */
-
-#define IRQ_NAME2(nr) nr##_interrupt(void)
-#define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr)
-
-/*
- * SMP has a few special interrupts for IPI messages
- */
-
-#define BUILD_IRQ(nr) \
- asmlinkage void IRQ_NAME(nr); \
- asm("\n.text\n.p2align\n" \
- "IRQ" #nr "_interrupt:\n\t" \
- "push $~(" #nr ") ; " \
- "jmp common_interrupt\n" \
- ".previous");
-
-#define BI(x,y) \
- BUILD_IRQ(x##y)
-
-#define BUILD_16_IRQS(x) \
- BI(x,0) BI(x,1) BI(x,2) BI(x,3) \
- BI(x,4) BI(x,5) BI(x,6) BI(x,7) \
- BI(x,8) BI(x,9) BI(x,a) BI(x,b) \
- BI(x,c) BI(x,d) BI(x,e) BI(x,f)
-
/*
* ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
* (these are usually mapped to vectors 0x30-0x3f)
*
* (these are usually mapped into the 0x30-0xff vector range)
*/
- BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3)
-BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7)
-BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb)
-BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf)
-
-#undef BUILD_16_IRQS
-#undef BI
-
-
-#define IRQ(x,y) \
- IRQ##x##y##_interrupt
-
-#define IRQLIST_16(x) \
- IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \
- IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \
- IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \
- IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f)
-
-/* for the irq vectors */
-static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = {
- IRQLIST_16(0x2), IRQLIST_16(0x3),
- IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7),
- IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb),
- IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf)
-};
-
-#undef IRQ
-#undef IRQLIST_16
-
-
-
/*
* IRQ2 is cascade interrupt to second interrupt controller
[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
};
+ int vector_used_by_percpu_irq(unsigned int vector)
+ {
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ if (per_cpu(vector_irq, cpu)[vector] != -1)
+ return 1;
+ }
+
+ return 0;
+ }
+
void __init init_ISA_irqs(void)
{
int i;
/* Low priority IPI to cleanup after moving an irq */
set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+ set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
#endif
}
#include <asm/proto.h>
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
+#include <asm/virtext.h>
#ifdef CONFIG_X86_32
# include <linux/dmi.h>
static int reboot_cpu = -1;
#endif
+/* This is set if we need to go through the 'emergency' path.
+ * When machine_emergency_restart() is called, we may be on
+ * an inconsistent state and won't be able to do a clean cleanup
+ */
+static int reboot_emergency;
+
/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
bool port_cf9_safe = false;
}
}
+static void vmxoff_nmi(int cpu, struct die_args *args)
+{
+ cpu_emergency_vmxoff();
+}
+
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization
+ */
+static void emergency_vmx_disable_all(void)
+{
+ /* Just make sure we won't change CPUs while doing this */
+ local_irq_disable();
+
+ /* We need to disable VMX on all CPUs before rebooting, otherwise
+ * we risk hanging up the machine, because the CPU ignore INIT
+ * signals when VMX is enabled.
+ *
+ * We can't take any locks and we may be on an inconsistent
+ * state, so we use NMIs as IPIs to tell the other CPUs to disable
+ * VMX and halt.
+ *
+ * For safety, we will avoid running the nmi_shootdown_cpus()
+ * stuff unnecessarily, but we don't have a way to check
+ * if other CPUs have VMX enabled. So we will call it only if the
+ * CPU we are running on has VMX enabled.
+ *
+ * We will miss cases where VMX is not enabled on all CPUs. This
+ * shouldn't do much harm because KVM always enable VMX on all
+ * CPUs anyway. But we can miss it on the small window where KVM
+ * is still enabling VMX.
+ */
+ if (cpu_has_vmx() && cpu_vmx_enabled()) {
+ /* Disable VMX on this CPU.
+ */
+ cpu_vmxoff();
+
+ /* Halt and disable VMX on the other CPUs */
+ nmi_shootdown_cpus(vmxoff_nmi);
+
+ }
+}
+
+
void __attribute__((weak)) mach_reboot_fixups(void)
{
}
{
int i;
+ if (reboot_emergency)
+ emergency_vmx_disable_all();
+
/* Tell the BIOS if we want cold or warm reboot */
*((unsigned short *)__va(0x472)) = reboot_mode;
#endif
}
+static void __machine_emergency_restart(int emergency)
+{
+ reboot_emergency = emergency;
+ machine_ops.emergency_restart();
+}
+
static void native_machine_restart(char *__unused)
{
printk("machine restart\n");
if (!reboot_force)
machine_shutdown();
- machine_emergency_restart();
+ __machine_emergency_restart(0);
}
static void native_machine_halt(void)
void machine_emergency_restart(void)
{
- machine_ops.emergency_restart();
+ __machine_emergency_restart(1);
}
void machine_restart(char *cmd)
static void smp_send_nmi_allbutself(void)
{
- cpumask_t mask = cpu_online_map;
- cpu_clear(safe_smp_processor_id(), mask);
- if (!cpus_empty(mask))
- send_IPI_mask(mask, NMI_VECTOR);
+ send_IPI_allbutself(NMI_VECTOR);
}
static struct notifier_block crash_nmi_nb = {
WARN_ON(1);
return;
}
- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+ send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
}
void native_send_call_func_single_ipi(int cpu)
{
- send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR);
+ send_IPI_mask(cpumask_of(cpu), CALL_FUNCTION_SINGLE_VECTOR);
}
- void native_send_call_func_ipi(cpumask_t mask)
+ void native_send_call_func_ipi(const struct cpumask *mask)
{
cpumask_t allbutself;
allbutself = cpu_online_map;
cpu_clear(smp_processor_id(), allbutself);
- if (cpus_equal(mask, allbutself) &&
+ if (cpus_equal(*mask, allbutself) &&
cpus_equal(cpu_online_map, cpu_callout_map))
send_IPI_allbutself(CALL_FUNCTION_VECTOR);
else
void smp_reschedule_interrupt(struct pt_regs *regs)
{
ack_APIC_irq();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_resched_count++;
-#else
- add_pda(irq_resched_count, 1);
-#endif
+ inc_irq_stat(irq_resched_count);
}
void smp_call_function_interrupt(struct pt_regs *regs)
ack_APIC_irq();
irq_enter();
generic_smp_call_function_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();
}
ack_APIC_irq();
irq_enter();
generic_smp_call_function_single_interrupt();
-#ifdef CONFIG_X86_32
- __get_cpu_var(irq_stat).irq_call_count++;
-#else
- add_pda(irq_call_count, 1);
-#endif
+ inc_irq_stat(irq_call_count);
irq_exit();
}
/* Last level cache ID of each logical CPU */
DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
- /* bitmap of online cpus */
- cpumask_t cpu_online_map __read_mostly;
- EXPORT_SYMBOL(cpu_online_map);
-
cpumask_t cpu_callin_map;
cpumask_t cpu_callout_map;
- cpumask_t cpu_possible_map;
- EXPORT_SYMBOL(cpu_possible_map);
/* representing HT siblings of each logical CPU */
DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
/*
* Activate a secondary processor.
*/
-static void __cpuinit start_secondary(void *unused)
+notrace static void __cpuinit start_secondary(void *unused)
{
/*
* Don't put *anything* before cpu_init(), SMP booting is too
#endif
if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
- printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
- "by the BIOS.\n", hard_smp_processor_id());
+ printk(KERN_WARNING
+ "weird, boot CPU (#%d) not listed by the BIOS.\n",
+ hard_smp_processor_id());
+
physid_set(hard_smp_processor_id(), phys_cpu_present_map);
}
check_nmi_watchdog();
}
+ static int __initdata setup_possible_cpus = -1;
+ static int __init _setup_possible_cpus(char *str)
+ {
+ get_option(&str, &setup_possible_cpus);
+ return 0;
+ }
+ early_param("possible_cpus", _setup_possible_cpus);
+
+
/*
* cpu_possible_map should be static, it cannot change as cpu's
* are onlined, or offlined. The reason is per-cpu data-structures
*
* Three ways to find out the number of additional hotplug CPUs:
* - If the BIOS specified disabled CPUs in ACPI/mptables use that.
- * - The user can overwrite it with additional_cpus=NUM
+ * - The user can overwrite it with possible_cpus=NUM
* - Otherwise don't reserve additional CPUs.
* We do this because additional CPUs waste a lot of memory.
* -AK
if (!num_processors)
num_processors = 1;
- possible = num_processors + disabled_cpus;
- if (possible > NR_CPUS)
- possible = NR_CPUS;
+ if (setup_possible_cpus == -1)
+ possible = num_processors + disabled_cpus;
+ else
+ possible = setup_possible_cpus;
+
+ if (possible > CONFIG_NR_CPUS) {
+ printk(KERN_WARNING
+ "%d Processors exceeds NR_CPUS limit of %d\n",
+ possible, CONFIG_NR_CPUS);
+ possible = CONFIG_NR_CPUS;
+ }
printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
possible, max_t(int, possible - num_processors, 0));
lock_vector_lock();
remove_cpu_from_maps(cpu);
unlock_vector_lock();
- fixup_irqs(cpu_online_map);
+ fixup_irqs();
}
int native_cpu_disable(void)
*/
void leave_mm(int cpu)
{
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
- BUG();
- cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
+ BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
+ cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
load_cr3(swapper_pg_dir);
}
EXPORT_SYMBOL_GPL(leave_mm);
* BUG();
*/
- if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
+ if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
if (flush_va == TLB_FLUSH_ALL)
local_flush_tlb();
else
smp_mb__after_clear_bit();
out:
put_cpu_no_resched();
- __get_cpu_var(irq_stat).irq_tlb_count++;
+ inc_irq_stat(irq_tlb_count);
}
void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
* We have to send the IPI only to
* CPUs affected.
*/
- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+ send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
while (!cpus_empty(flush_cpumask))
/* nothing. lockup detection does not belong here */
unsigned long cpu = smp_processor_id();
__flush_tlb_all();
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
+ if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
leave_mm(cpu);
}
out:
ack_APIC_irq();
cpu_clear(cpu, f->flush_cpumask);
- add_pda(irq_tlb_count, 1);
+ inc_irq_stat(irq_tlb_count);
}
void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
* We have to send the IPI only to
* CPUs affected.
*/
- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
+ send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
while (!cpus_empty(f->flush_cpumask))
cpu_relax();
#include "cpu/mcheck/mce.h"
- DECLARE_BITMAP(used_vectors, NR_VECTORS);
- EXPORT_SYMBOL_GPL(used_vectors);
-
asmlinkage int system_call(void);
/* Do we ignore FPU interrupts ? */
__attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
#endif
+ DECLARE_BITMAP(used_vectors, NR_VECTORS);
+ EXPORT_SYMBOL_GPL(used_vectors);
+
static int ignore_nmis;
static inline void conditional_sti(struct pt_regs *regs)
{
nmi_enter();
-#ifdef CONFIG_X86_32
- { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
-#else
- add_pda(__nmi_count, 1);
-#endif
+ inc_irq_stat(__nmi_count);
if (!ignore_nmis)
default_do_nmi(regs);
{
struct task_struct *task;
siginfo_t info;
- unsigned short cwd, swd;
+ unsigned short cwd, swd, err;
/*
* Save the info for the exception handler and clear the error.
task->thread.error_code = 0;
info.si_signo = SIGFPE;
info.si_errno = 0;
- info.si_code = __SI_FAULT;
info.si_addr = ip;
/*
* (~cwd & swd) will mask out exceptions that are not set to unmasked
*/
cwd = get_fpu_cwd(task);
swd = get_fpu_swd(task);
- switch (swd & ~cwd & 0x3f) {
- case 0x000: /* No unmasked exception */
+
+ err = swd & ~cwd & 0x3f;
+
#ifdef CONFIG_X86_32
+ if (!err)
return;
#endif
- default: /* Multiple exceptions */
- break;
- case 0x001: /* Invalid Op */
+
+ if (err & 0x001) { /* Invalid op */
/*
* swd & 0x240 == 0x040: Stack Underflow
* swd & 0x240 == 0x240: Stack Overflow
* User must clear the SF bit (0x40) if set
*/
info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
+ } else if (err & 0x004) { /* Divide by Zero */
info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
+ } else if (err & 0x008) { /* Overflow */
info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
+ } else if (err & 0x012) { /* Denormal, Underflow */
+ info.si_code = FPE_FLTUND;
+ } else if (err & 0x020) { /* Precision */
info.si_code = FPE_FLTRES;
- break;
+ } else {
+ info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */
}
force_sig_info(SIGFPE, &info, task);
}
void __init trap_init(void)
{
- #ifdef CONFIG_X86_32
int i;
- #endif
#ifdef CONFIG_EISA
void __iomem *p = early_ioremap(0x0FFFD9, 4);
}
set_system_trap_gate(SYSCALL_VECTOR, &system_call);
+ #endif
/* Reserve all the builtin and the syscall vector: */
for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
set_bit(i, used_vectors);
+ #ifdef CONFIG_X86_64
+ set_bit(IA32_SYSCALL_VECTOR, used_vectors);
+ #else
set_bit(SYSCALL_VECTOR, used_vectors);
#endif
/*
* a straightforward 1 to 1 mapping, so force that here. */
__get_cpu_var(vector_irq)[vector] = i;
if (vector != SYSCALL_VECTOR) {
- set_intr_gate(vector, interrupt[vector]);
+ set_intr_gate(vector,
+ interrupt[vector-FIRST_EXTERNAL_VECTOR]);
set_irq_chip_and_handler_name(i, &lguest_irq_controller,
handle_level_irq,
"level");
/* We can't set cpumask in the initializer: damn C limitations! Set it
* here and register our timer device. */
- lguest_clockevent.cpumask = cpumask_of_cpu(0);
+ lguest_clockevent.cpumask = cpumask_of(0);
clockevents_register_device(&lguest_clockevent);
/* Finally, we unblock the timer interrupt. */
{
unsigned pfn, idx;
- for(pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
+ for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
}
- for(idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
+ for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
}
unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
unsigned pfn;
- for(pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
+ for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
unsigned topidx = p2m_top_index(pfn);
p2m_top[topidx] = &mfn_list[pfn];
p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
BUG_ON(p == NULL);
- for(i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
+ for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
p[i] = INVALID_P2M_ENTRY;
if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
preempt_enable();
}
-pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
{
/* Just return the pte as-is. We preserve the bits on commit */
return *ptep;
if (user_pgd) {
xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
- xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
+ xen_do_pin(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa(user_pgd)));
}
}
#else /* CONFIG_X86_32 */
pgd_t *user_pgd = xen_get_user_pgd(pgd);
if (user_pgd) {
- xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
+ xen_do_pin(MMUEXT_UNPIN_TABLE,
+ PFN_DOWN(__pa(user_pgd)));
xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
}
}
static void xen_drop_mm_ref(struct mm_struct *mm)
{
- cpumask_t mask;
+ cpumask_var_t mask;
unsigned cpu;
if (current->active_mm == mm) {
}
/* Get the "official" set of cpus referring to our pagetable. */
- mask = mm->cpu_vm_mask;
+ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
+ for_each_online_cpu(cpu) {
+ if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask)
+ && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
+ continue;
+ smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
+ }
+ return;
+ }
+ cpumask_copy(mask, &mm->cpu_vm_mask);
/* It's possible that a vcpu may have a stale reference to our
cr3, because its in lazy mode, and it hasn't yet flushed
if needed. */
for_each_online_cpu(cpu) {
if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
- cpu_set(cpu, mask);
+ cpumask_set_cpu(cpu, mask);
}
- if (!cpus_empty(mask))
- smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
+ if (!cpumask_empty(mask))
+ smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
+ free_cpumask_var(mask);
}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
/* By default all event channels notify CPU#0. */
for_each_irq_desc(i, desc) {
- if (!desc)
- continue;
-
desc->affinity = cpumask_of_cpu(0);
}
#endif
static int find_unbound_irq(void)
{
int irq;
+ struct irq_desc *desc;
/* Only allocate from dynirq range */
for (irq = 0; irq < nr_irqs; irq++)
if (irq == nr_irqs)
panic("No available IRQ to bind to: increase nr_irqs!\n");
+ desc = irq_to_desc_alloc_cpu(irq, 0);
+ if (WARN_ON(desc == NULL))
+ return -1;
+
return irq;
}
spin_unlock(&irq_mapping_update_lock);
/* new event channels are always bound to cpu 0 */
- irq_set_affinity(irq, cpumask_of_cpu(0));
+ irq_set_affinity(irq, cpumask_of(0));
/* Unmask the event channel. */
enable_irq(irq);
}
- static void set_affinity_irq(unsigned irq, cpumask_t dest)
+ static void set_affinity_irq(unsigned irq, const struct cpumask *dest)
{
- unsigned tcpu = first_cpu(dest);
+ unsigned tcpu = cpumask_first(dest);
rebind_irq_to_cpu(irq, tcpu);
}
extern cpumask_t irq_default_affinity;
- extern int irq_set_affinity(unsigned int irq, cpumask_t cpumask);
+ extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
extern int irq_can_set_affinity(unsigned int irq);
extern int irq_select_affinity(unsigned int irq);
#else /* CONFIG_SMP */
- static inline int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+ static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
{
return -EINVAL;
}
BLOCK_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
-#ifdef CONFIG_HIGH_RES_TIMERS
- HRTIMER_SOFTIRQ,
-#endif
RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
NR_SOFTIRQS
int show_interrupts(struct seq_file *p, void *v);
+struct irq_desc;
+
+extern int early_irq_init(void);
+extern int arch_early_irq_init(void);
+extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
+
#endif
void (*eoi)(unsigned int irq);
void (*end)(unsigned int irq);
- void (*set_affinity)(unsigned int irq, cpumask_t dest);
+ void (*set_affinity)(unsigned int irq,
+ const struct cpumask *dest);
int (*retrigger)(unsigned int irq);
int (*set_type)(unsigned int irq, unsigned int flow_type);
int (*set_wake)(unsigned int irq, unsigned int on);
/**
* struct irq_desc - interrupt descriptor
* @irq: interrupt number for this descriptor
+ * @timer_rand_state: pointer to timer rand state struct
+ * @kstat_irqs: irq stats per cpu
+ * @irq_2_iommu: iommu with this irq
* @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()]
* @chip: low level interrupt hardware access
* @msi_desc: MSI descriptor
* @depth: disable-depth, for nested irq_disable() calls
* @wake_depth: enable depth, for multiple set_irq_wake() callers
* @irq_count: stats field to detect stalled irqs
- * @irqs_unhandled: stats field for spurious unhandled interrupts
* @last_unhandled: aging timer for unhandled count
+ * @irqs_unhandled: stats field for spurious unhandled interrupts
* @lock: locking for SMP
* @affinity: IRQ affinity on SMP
* @cpu: cpu index useful for balancing
unsigned int depth; /* nested irq disables */
unsigned int wake_depth; /* nested wake enables */
unsigned int irq_count; /* For detecting broken IRQs */
- unsigned int irqs_unhandled;
unsigned long last_unhandled; /* Aging timer for unhandled count */
+ unsigned int irqs_unhandled;
spinlock_t lock;
#ifdef CONFIG_SMP
cpumask_t affinity;
const char *name;
} ____cacheline_internodealigned_in_smp;
-extern void early_irq_init(void);
-extern void arch_early_irq_init(void);
-extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
struct irq_desc *desc, int cpu);
extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
#ifndef CONFIG_SPARSE_IRQ
extern struct irq_desc irq_desc[NR_IRQS];
-
-static inline struct irq_desc *irq_to_desc(unsigned int irq)
-{
- return (irq < NR_IRQS) ? irq_desc + irq : NULL;
-}
-static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
-{
- return irq_to_desc(irq);
-}
-
-#else
-
-extern struct irq_desc *irq_to_desc(unsigned int irq);
-extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+#else /* CONFIG_SPARSE_IRQ */
extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
-# define for_each_irq_desc(irq, desc) \
- for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; irq++, desc = irq_to_desc(irq))
-# define for_each_irq_desc_reverse(irq, desc) \
- for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; irq--, desc = irq_to_desc(irq))
-
#define kstat_irqs_this_cpu(DESC) \
((DESC)->kstat_irqs[smp_processor_id()])
#define kstat_incr_irqs_this_cpu(irqno, DESC) \
((DESC)->kstat_irqs[smp_processor_id()]++)
-#endif
+#endif /* CONFIG_SPARSE_IRQ */
+
+extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
static inline struct irq_desc *
irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
extern int runqueue_is_locked(void);
extern void task_rq_unlock_wait(struct task_struct *p);
- extern cpumask_t nohz_cpu_mask;
+ extern cpumask_var_t nohz_cpu_mask;
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
extern int select_nohz_load_balancer(int cpu);
#else
*/
struct rlimit rlim[RLIM_NLIMITS];
- /* keep the process-shared keyrings here so that they do the right
- * thing in threads created with CLONE_THREAD */
-#ifdef CONFIG_KEYS
- struct key *session_keyring; /* keyring inherited over fork */
- struct key *process_keyring; /* keyring private to this process */
-#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
struct pacct_struct pacct; /* per-process accounting information */
#endif
/* Hash table maintenance information */
struct hlist_node uidhash_node;
uid_t uid;
+ struct user_namespace *user_ns;
#ifdef CONFIG_USER_SCHED
struct task_group *tg;
extern struct user_struct root_user;
#define INIT_USER (&root_user)
+
struct backing_dev_info;
struct reclaim_state;
struct sched_info {
/* cumulative counters */
unsigned long pcount; /* # of times run on this cpu */
- unsigned long long cpu_time, /* time spent on the cpu */
- run_delay; /* time spent waiting on a runqueue */
+ unsigned long long run_delay; /* time spent waiting on a runqueue */
/* timestamps */
unsigned long long last_arrival,/* when we last ran on a cpu */
#define SD_SERIALIZE 1024 /* Only a single load balancing instance */
#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */
- #define BALANCE_FOR_MC_POWER \
- (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
+ enum powersavings_balance_level {
+ POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */
+ POWERSAVINGS_BALANCE_BASIC, /* Fill one thread/core/package
+ * first for long running threads
+ */
+ POWERSAVINGS_BALANCE_WAKEUP, /* Also bias task wakeups to semi-idle
+ * cpu package for power savings
+ */
+ MAX_POWERSAVINGS_BALANCE_LEVELS
+ };
- #define BALANCE_FOR_PKG_POWER \
- ((sched_mc_power_savings || sched_smt_power_savings) ? \
- SD_POWERSAVINGS_BALANCE : 0)
+ extern int sched_mc_power_savings, sched_smt_power_savings;
- #define test_sd_parent(sd, flag) ((sd->parent && \
- (sd->parent->flags & flag)) ? 1 : 0)
+ static inline int sd_balance_for_mc_power(void)
+ {
+ if (sched_smt_power_savings)
+ return SD_POWERSAVINGS_BALANCE;
+ return 0;
+ }
+
+ static inline int sd_balance_for_package_power(void)
+ {
+ if (sched_mc_power_savings | sched_smt_power_savings)
+ return SD_POWERSAVINGS_BALANCE;
+
+ return 0;
+ }
+
+ /*
+ * Optimise SD flags for power savings:
+ * SD_BALANCE_NEWIDLE helps agressive task consolidation and power savings.
+ * Keep default SD flags if sched_{smt,mc}_power_saving=0
+ */
+
+ static inline int sd_power_saving_flags(void)
+ {
+ if (sched_mc_power_savings | sched_smt_power_savings)
+ return SD_BALANCE_NEWIDLE;
+
+ return 0;
+ }
struct sched_group {
struct sched_group *next; /* Must be a circular list */
- cpumask_t cpumask;
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
* (see include/linux/reciprocal_div.h)
*/
u32 reciprocal_cpu_power;
+
+ unsigned long cpumask[];
};
+ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+ {
+ return to_cpumask(sg->cpumask);
+ }
+
enum sched_domain_level {
SD_LV_NONE = 0,
SD_LV_SIBLING,
struct sched_domain *parent; /* top domain must be null terminated */
struct sched_domain *child; /* bottom domain must be null terminated */
struct sched_group *groups; /* the balancing groups of the domain */
- cpumask_t span; /* span of all CPUs in this domain */
unsigned long min_interval; /* Minimum balance interval ms */
unsigned long max_interval; /* Maximum balance interval ms */
unsigned int busy_factor; /* less balancing by factor if busy */
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
+
+ /* span of all CPUs in this domain */
+ unsigned long span[];
};
- extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
+ {
+ return to_cpumask(sd->span);
+ }
+
+ extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
struct sched_domain_attr *dattr_new);
extern int arch_reinit_sched_domains(void);
+ /* Test a flag in parent sched domain */
+ static inline int test_sd_parent(struct sched_domain *sd, int flag)
+ {
+ if (sd->parent && (sd->parent->flags & flag))
+ return 1;
+
+ return 0;
+ }
+
#else /* CONFIG_SMP */
struct sched_domain_attr;
static inline void
- partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
struct sched_domain_attr *dattr_new)
{
}
#endif /* !CONFIG_SMP */
struct io_context; /* See blkdev.h */
-#define NGROUPS_SMALL 32
-#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(gid_t)))
-struct group_info {
- int ngroups;
- atomic_t usage;
- gid_t small_block[NGROUPS_SMALL];
- int nblocks;
- gid_t *blocks[0];
-};
-/*
- * get_group_info() must be called with the owning task locked (via task_lock())
- * when task != current. The reason being that the vast majority of callers are
- * looking at current->group_info, which can not be changed except by the
- * current task. Changing current->group_info requires the task lock, too.
- */
-#define get_group_info(group_info) do { \
- atomic_inc(&(group_info)->usage); \
-} while (0)
-
-#define put_group_info(group_info) do { \
- if (atomic_dec_and_test(&(group_info)->usage)) \
- groups_free(group_info); \
-} while (0)
-
-extern struct group_info *groups_alloc(int gidsetsize);
-extern void groups_free(struct group_info *group_info);
-extern int set_current_groups(struct group_info *group_info);
-extern int groups_search(struct group_info *group_info, gid_t grp);
-/* access the groups "array" with this macro */
-#define GROUP_AT(gi, i) \
- ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
extern void prefetch_stack(struct task_struct *t);
void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p,
- const cpumask_t *newmask);
+ const struct cpumask *newmask);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
* The buffer to hold the BTS data.
*/
void *bts_buffer;
+ size_t bts_size;
#endif /* CONFIG_X86_PTRACE_BTS */
/* PID/PID hash table linkage. */
struct list_head cpu_timers[3];
/* process credentials */
- uid_t uid,euid,suid,fsuid;
- gid_t gid,egid,sgid,fsgid;
- struct group_info *group_info;
- kernel_cap_t cap_effective, cap_inheritable, cap_permitted, cap_bset;
- struct user_struct *user;
- unsigned securebits;
-#ifdef CONFIG_KEYS
- unsigned char jit_keyring; /* default keyring to attach requested keys to */
- struct key *request_key_auth; /* assumed request_key authority */
- struct key *thread_keyring; /* keyring private to this thread */
-#endif
+ const struct cred *real_cred; /* objective and real subjective task
+ * credentials (COW) */
+ const struct cred *cred; /* effective (overridable) subjective task
+ * credentials (COW) */
+ struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */
+
char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
it with task_lock())
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;
-#ifdef CONFIG_SECURITY
- void *security;
-#endif
struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
uid_t loginuid;
#ifdef CONFIG_SMP
extern int set_cpus_allowed_ptr(struct task_struct *p,
- const cpumask_t *new_mask);
+ const struct cpumask *new_mask);
#else
static inline int set_cpus_allowed_ptr(struct task_struct *p,
- const cpumask_t *new_mask)
+ const struct cpumask *new_mask)
{
- if (!cpu_isset(0, *new_mask))
+ if (!cpumask_test_cpu(0, new_mask))
return -EINVAL;
return 0;
}
return u;
}
extern void free_uid(struct user_struct *);
-extern void switch_uid(struct user_struct *);
extern void release_uids(struct user_namespace *ns);
#include <asm/current.h>
extern void sched_fork(struct task_struct *p, int clone_flags);
extern void sched_dead(struct task_struct *p);
-extern int in_group_p(gid_t);
-extern int in_egroup_p(gid_t);
-
extern void proc_caches_init(void);
extern void flush_signals(struct task_struct *);
extern void ignore_signals(struct task_struct *);
#define for_each_process(p) \
for (p = &init_task ; (p = next_task(p)) != &init_task ; )
+extern bool is_single_threaded(struct task_struct *);
+
/*
* Careful: do_each_thread/while_each_thread is a double loop so
* 'break' will not work as expected - use goto instead.
}
#endif
- extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
- extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
-
- extern int sched_mc_power_savings, sched_smt_power_savings;
+ extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
extern void normalize_rt_tasks(void);
Say N.
+config KALLSYMS_STRIP_GENERATED
+ bool "Strip machine generated symbols from kallsyms"
+ depends on KALLSYMS_ALL
+ default y
+ help
+ Say N if you want kallsyms to retain even machine generated symbols.
+
config KALLSYMS_EXTRA_PASS
bool "Do an extra kallsyms pass"
depends on KALLSYMS
endif # MODULES
+ config INIT_ALL_POSSIBLE
+ bool
+ help
+ Back when each arch used to define their own cpu_online_map and
+ cpu_possible_map, some of them chose to initialize cpu_possible_map
+ with all 1s, and others with all 0s. When they were centralised,
+ it was better to provide this option than to break all the archs
+ and have several arch maintainers persuing me down dark alleys.
+
config STOP_MACHINE
bool
default y
config PREEMPT_NOTIFIERS
bool
+choice
+ prompt "RCU Implementation"
+ default CLASSIC_RCU
+
config CLASSIC_RCU
- def_bool !PREEMPT_RCU
+ bool "Classic RCU"
help
This option selects the classic RCU implementation that is
designed for best read-side performance on non-realtime
- systems. Classic RCU is the default. Note that the
- PREEMPT_RCU symbol is used to select/deselect this option.
+ systems.
+
+ Select this option if you are unsure.
+
+config TREE_RCU
+ bool "Tree-based hierarchical RCU"
+ help
+ This option selects the RCU implementation that is
+ designed for very large SMP system with hundreds or
+ thousands of CPUs.
+
+config PREEMPT_RCU
+ bool "Preemptible RCU"
+ depends on PREEMPT
+ help
+ This option reduces the latency of the kernel by making certain
+ RCU sections preemptible. Normally RCU code is non-preemptible, if
+ this option is selected then read-only RCU sections become
+ preemptible. This helps latency, but may expose bugs due to
+ now-naive assumptions about each RCU read-side critical section
+ remaining on a given CPU through its execution.
+
+endchoice
+
+config RCU_TRACE
+ bool "Enable tracing for RCU"
+ depends on TREE_RCU || PREEMPT_RCU
+ help
+ This option provides tracing in RCU which presents stats
+ in debugfs for debugging RCU implementation.
+
+ Say Y here if you want to enable RCU tracing
+ Say N if you are unsure.
+
+config RCU_FANOUT
+ int "Tree-based hierarchical RCU fanout value"
+ range 2 64 if 64BIT
+ range 2 32 if !64BIT
+ depends on TREE_RCU
+ default 64 if 64BIT
+ default 32 if !64BIT
+ help
+ This option controls the fanout of hierarchical implementations
+ of RCU, allowing RCU to work efficiently on machines with
+ large numbers of CPUs. This value must be at least the cube
+ root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+ systems and up to 262,144 for 64-bit systems.
+
+ Select a specific number if testing RCU itself.
+ Take the default if unsure.
+
+config RCU_FANOUT_EXACT
+ bool "Disable tree-based hierarchical RCU auto-balancing"
+ depends on TREE_RCU
+ default n
+ help
+ This option forces use of the exact RCU_FANOUT value specified,
+ regardless of imbalances in the hierarchy. This is useful for
+ testing RCU itself, and might one day be useful on systems with
+ strong NUMA behavior.
+
+ Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+
+ Say N if unsure.
+
+config TREE_RCU_TRACE
+ def_bool RCU_TRACE && TREE_RCU
+ select DEBUG_FS
+ help
+ This option provides tracing for the TREE_RCU implementation,
+ permitting Makefile to trivially select kernel/rcutree_trace.c.
+
+config PREEMPT_RCU_TRACE
+ def_bool RCU_TRACE && PREEMPT_RCU
+ select DEBUG_FS
+ help
+ This option provides tracing for the PREEMPT_RCU implementation,
+ permitting Makefile to trivially select kernel/rcupreempt_trace.c.
desc->irq_count = 0;
desc->irqs_unhandled = 0;
#ifdef CONFIG_SMP
- cpus_setall(desc->affinity);
+ cpumask_setall(&desc->affinity);
#endif
spin_unlock_irqrestore(&desc->lock, flags);
}
return -ENODEV;
}
+ type &= IRQ_TYPE_SENSE_MASK;
if (type == IRQ_TYPE_NONE)
return 0;
* @cpumask: cpumask
*
*/
- int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
+ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
#ifdef CONFIG_GENERIC_PENDING_IRQ
if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
- desc->affinity = cpumask;
+ cpumask_copy(&desc->affinity, cpumask);
desc->chip->set_affinity(irq, cpumask);
} else {
desc->status |= IRQ_MOVE_PENDING;
- desc->pending_mask = cpumask;
+ cpumask_copy(&desc->pending_mask, cpumask);
}
#else
- desc->affinity = cpumask;
+ cpumask_copy(&desc->affinity, cpumask);
desc->chip->set_affinity(irq, cpumask);
#endif
desc->status |= IRQ_AFFINITY_SET;
*/
int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
{
- cpumask_t mask;
-
if (!irq_can_set_affinity(irq))
return 0;
- cpus_and(mask, cpu_online_map, irq_default_affinity);
-
/*
* Preserve an userspace affinity setup, but make sure that
* one of the targets is online.
*/
if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
- if (cpus_intersects(desc->affinity, cpu_online_map))
- mask = desc->affinity;
+ if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+ < nr_cpu_ids)
+ goto set_affinity;
else
desc->status &= ~IRQ_AFFINITY_SET;
}
- desc->affinity = mask;
- desc->chip->set_affinity(irq, mask);
+ cpumask_and(&desc->affinity, cpu_online_mask, &irq_default_affinity);
+ set_affinity:
+ desc->chip->set_affinity(irq, &desc->affinity);
return 0;
}
return 0;
}
- ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK);
+ /* caller masked out all except trigger mode flags */
+ ret = chip->set_type(irq, flags);
if (ret)
pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
- (int)(flags & IRQF_TRIGGER_MASK),
- irq, chip->set_type);
+ (int)flags, irq, chip->set_type);
else {
+ if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
+ flags |= IRQ_LEVEL;
/* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
- desc->status &= ~IRQ_TYPE_SENSE_MASK;
- desc->status |= flags & IRQ_TYPE_SENSE_MASK;
+ desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
+ desc->status |= flags;
}
return ret;
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
- ret = __irq_set_trigger(desc, irq, new->flags);
+ ret = __irq_set_trigger(desc, irq,
+ new->flags & IRQF_TRIGGER_MASK);
if (ret) {
spin_unlock_irqrestore(&desc->lock, flags);
struct irq_desc *desc;
int retval;
+ /*
+ * handle_IRQ_event() always ignores IRQF_DISABLED except for
+ * the _first_ irqaction (sigh). That can cause oopsing, but
+ * the behavior is classified as "will not fix" so we need to
+ * start nudging drivers away from using that idiom.
+ */
+ if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
+ == (IRQF_SHARED|IRQF_DISABLED))
+ pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
+ "guaranteed on shared IRQs\n",
+ irq, devname);
+
#ifdef CONFIG_LOCKDEP
/*
* Lockdep wants atomic interrupt handlers:
hrtimer_init(&rt_b->rt_period_timer,
CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rt_b->rt_period_timer.function = sched_rt_period_timer;
- rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
}
static inline int rt_bandwidth_enabled(void)
struct task_group *tg;
#ifdef CONFIG_USER_SCHED
- tg = p->user->tg;
+ rcu_read_lock();
+ tg = __task_cred(p)->user->tg;
+ rcu_read_unlock();
#elif defined(CONFIG_CGROUP_SCHED)
tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
struct task_group, css);
*/
struct root_domain {
atomic_t refcount;
- cpumask_t span;
- cpumask_t online;
+ cpumask_var_t span;
+ cpumask_var_t online;
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
*/
- cpumask_t rto_mask;
+ cpumask_var_t rto_mask;
atomic_t rto_count;
#ifdef CONFIG_SMP
struct cpupri cpupri;
#endif
+ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ /*
+ * Preferred wake up cpu nominated by sched_mc balance that will be
+ * used when most cpus are idle in the system indicating overall very
+ * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
+ */
+ unsigned int sched_mc_preferred_wakeup_cpu;
+ #endif
};
/*
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
+ unsigned long long rq_cpu_time;
+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned int yld_exp_empty;
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
rq->hrtick_timer.function = hrtick;
- rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
struct sched_domain *sd = data;
int i;
- for_each_cpu_mask(i, sd->span) {
+ for_each_cpu(i, sched_domain_span(sd)) {
/*
* If there are currently no tasks on the cpu pretend there
* is one of average load so that when a new task gets to
if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
shares = tg->shares;
- for_each_cpu_mask(i, sd->span)
+ for_each_cpu(i, sched_domain_span(sd))
update_group_shares_cpu(tg, i, shares, rq_weight);
return 0;
clock_offset = old_rq->clock - new_rq->clock;
+ trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+
#ifdef CONFIG_SCHEDSTATS
if (p->se.wait_start)
p->se.wait_start -= clock_offset;
int i;
/* Skip over this group if it has no CPUs allowed */
- if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+ if (!cpumask_intersects(sched_group_cpus(group),
+ &p->cpus_allowed))
continue;
- local_group = cpu_isset(this_cpu, group->cpumask);
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_cpus(group));
/* Tally up the load of all CPUs in the group */
avg_load = 0;
- for_each_cpu_mask_nr(i, group->cpumask) {
+ for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
* find_idlest_cpu - find the idlest cpu among the cpus in group.
*/
static int
- find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
- cpumask_t *tmp)
+ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
int idlest = -1;
int i;
/* Traverse only the allowed CPUs */
- cpus_and(*tmp, group->cpumask, p->cpus_allowed);
-
- for_each_cpu_mask_nr(i, *tmp) {
+ for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
update_shares(sd);
while (sd) {
- cpumask_t span, tmpmask;
struct sched_group *group;
int new_cpu, weight;
continue;
}
- span = sd->span;
group = find_idlest_group(sd, t, cpu);
if (!group) {
sd = sd->child;
continue;
}
- new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+ new_cpu = find_idlest_cpu(group, t, cpu);
if (new_cpu == -1 || new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
+ weight = cpumask_weight(sched_domain_span(sd));
sd = NULL;
- weight = cpus_weight(span);
for_each_domain(cpu, tmp) {
- if (weight <= cpus_weight(tmp->span))
+ if (weight <= cpumask_weight(sched_domain_span(tmp)))
break;
if (tmp->flags & flag)
sd = tmp;
cpu = task_cpu(p);
for_each_domain(this_cpu, sd) {
- if (cpu_isset(cpu, sd->span)) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
update_shares(sd);
break;
}
smp_wmb();
rq = task_rq_lock(p, &flags);
+ update_rq_clock(rq);
old_state = p->state;
if (!(old_state & state))
goto out;
else {
struct sched_domain *sd;
for_each_domain(this_cpu, sd) {
- if (cpu_isset(cpu, sd->span)) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
schedstat_inc(sd, ttwu_wake_remote);
break;
}
schedstat_inc(p, se.nr_wakeups_local);
else
schedstat_inc(p, se.nr_wakeups_remote);
- update_rq_clock(rq);
activate_task(rq, p, 1);
success = 1;
out_running:
- trace_sched_wakeup(rq, p);
+ trace_sched_wakeup(rq, p, success);
check_preempt_curr(rq, p, sync);
p->state = TASK_RUNNING;
p->sched_class->task_new(rq, p);
inc_nr_running(rq);
}
- trace_sched_wakeup_new(rq, p);
+ trace_sched_wakeup_new(rq, p, 1);
check_preempt_curr(rq, p, 0);
#ifdef CONFIG_SMP
if (p->sched_class->task_wake_up)
struct rq *rq;
rq = task_rq_lock(p, &flags);
- if (!cpu_isset(dest_cpu, p->cpus_allowed)
+ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
|| unlikely(!cpu_active(dest_cpu)))
goto out;
- trace_sched_migrate_task(rq, p, dest_cpu);
/* force the process onto the specified CPU */
if (migrate_task(p, dest_cpu, &req)) {
/* Need to wait for migration thread (might exit: take ref). */
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
*/
- if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+ if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
schedstat_inc(p, se.nr_failed_migrations_affine);
return 0;
}
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
- int *sd_idle, const cpumask_t *cpus, int *balance)
+ int *sd_idle, const struct cpumask *cpus, int *balance)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
unsigned long sum_avg_load_per_task;
unsigned long avg_load_per_task;
- local_group = cpu_isset(this_cpu, group->cpumask);
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_cpus(group));
if (local_group)
- balance_cpu = first_cpu(group->cpumask);
+ balance_cpu = cpumask_first(sched_group_cpus(group));
/* Tally up the load of all CPUs in the group */
sum_weighted_load = sum_nr_running = avg_load = 0;
max_cpu_load = 0;
min_cpu_load = ~0UL;
- for_each_cpu_mask_nr(i, group->cpumask) {
- struct rq *rq;
-
- if (!cpu_isset(i, *cpus))
- continue;
-
- rq = cpu_rq(i);
+ for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ struct rq *rq = cpu_rq(i);
if (*sd_idle && rq->nr_running)
*sd_idle = 0;
*/
if ((sum_nr_running < min_nr_running) ||
(sum_nr_running == min_nr_running &&
- first_cpu(group->cpumask) <
- first_cpu(group_min->cpumask))) {
+ cpumask_first(sched_group_cpus(group)) >
+ cpumask_first(sched_group_cpus(group_min)))) {
group_min = group;
min_nr_running = sum_nr_running;
min_load_per_task = sum_weighted_load /
if (sum_nr_running <= group_capacity - 1) {
if (sum_nr_running > leader_nr_running ||
(sum_nr_running == leader_nr_running &&
- first_cpu(group->cpumask) >
- first_cpu(group_leader->cpumask))) {
+ cpumask_first(sched_group_cpus(group)) <
+ cpumask_first(sched_group_cpus(group_leader)))) {
group_leader = group;
leader_nr_running = sum_nr_running;
}
if (this == group_leader && group_leader != group_min) {
*imbalance = min_load_per_task;
+ if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+ cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+ cpumask_first(sched_group_cpus(group_leader));
+ }
return group_min;
}
#endif
*/
static struct rq *
find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
- unsigned long imbalance, const cpumask_t *cpus)
+ unsigned long imbalance, const struct cpumask *cpus)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
int i;
- for_each_cpu_mask_nr(i, group->cpumask) {
+ for_each_cpu(i, sched_group_cpus(group)) {
unsigned long wl;
- if (!cpu_isset(i, *cpus))
+ if (!cpumask_test_cpu(i, cpus))
continue;
rq = cpu_rq(i);
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *balance, cpumask_t *cpus)
+ int *balance, struct cpumask *cpus)
{
int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
- cpus_setall(*cpus);
+ cpumask_setall(cpus);
/*
* When power savings policy is enabled for the parent domain, idle
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned)) {
- cpu_clear(cpu_of(busiest), *cpus);
- if (!cpus_empty(*cpus))
+ cpumask_clear_cpu(cpu_of(busiest), cpus);
+ if (!cpumask_empty(cpus))
goto redo;
goto out_balanced;
}
/* don't kick the migration_thread, if the curr
* task on busiest cpu can't be moved to this_cpu
*/
- if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+ if (!cpumask_test_cpu(this_cpu,
+ &busiest->curr->cpus_allowed)) {
spin_unlock_irqrestore(&busiest->lock, flags);
all_pinned = 1;
goto out_one_pinned;
*/
static int
load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
- cpumask_t *cpus)
+ struct cpumask *cpus)
{
struct sched_group *group;
struct rq *busiest = NULL;
int sd_idle = 0;
int all_pinned = 0;
- cpus_setall(*cpus);
+ cpumask_setall(cpus);
/*
* When power savings policy is enabled for the parent domain, idle
double_unlock_balance(this_rq, busiest);
if (unlikely(all_pinned)) {
- cpu_clear(cpu_of(busiest), *cpus);
- if (!cpus_empty(*cpus))
+ cpumask_clear_cpu(cpu_of(busiest), cpus);
+ if (!cpumask_empty(cpus))
goto redo;
}
}
if (!ld_moved) {
+ int active_balance = 0;
+
schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
+
+ if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+ return -1;
+
+ if (sd->nr_balance_failed++ < 2)
+ return -1;
+
+ /*
+ * The only task running in a non-idle cpu can be moved to this
+ * cpu in an attempt to completely freeup the other CPU
+ * package. The same method used to move task in load_balance()
+ * have been extended for load_balance_newidle() to speedup
+ * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
+ *
+ * The package power saving logic comes from
+ * find_busiest_group(). If there are no imbalance, then
+ * f_b_g() will return NULL. However when sched_mc={1,2} then
+ * f_b_g() will select a group from which a running task may be
+ * pulled to this cpu in order to make the other package idle.
+ * If there is no opportunity to make a package idle and if
+ * there are no imbalance, then f_b_g() will return NULL and no
+ * action will be taken in load_balance_newidle().
+ *
+ * Under normal task pull operation due to imbalance, there
+ * will be more than one task in the source run queue and
+ * move_tasks() will succeed. ld_moved will be true and this
+ * active balance code will not be triggered.
+ */
+
+ /* Lock busiest in correct order while this_rq is held */
+ double_lock_balance(this_rq, busiest);
+
+ /*
+ * don't kick the migration_thread, if the curr
+ * task on busiest cpu can't be moved to this_cpu
+ */
+ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+ double_unlock_balance(this_rq, busiest);
+ all_pinned = 1;
+ return ld_moved;
+ }
+
+ if (!busiest->active_balance) {
+ busiest->active_balance = 1;
+ busiest->push_cpu = this_cpu;
+ active_balance = 1;
+ }
+
+ double_unlock_balance(this_rq, busiest);
+ if (active_balance)
+ wake_up_process(busiest->migration_thread);
+
} else
sd->nr_balance_failed = 0;
struct sched_domain *sd;
int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
- cpumask_t tmpmask;
+ cpumask_var_t tmpmask;
+
+ if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+ return;
for_each_domain(this_cpu, sd) {
unsigned long interval;
if (sd->flags & SD_BALANCE_NEWIDLE)
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance_newidle(this_cpu, this_rq,
- sd, &tmpmask);
+ sd, tmpmask);
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
*/
this_rq->next_balance = next_balance;
}
+ free_cpumask_var(tmpmask);
}
/*
/* Search for an sd spanning us and the target CPU. */
for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
- cpu_isset(busiest_cpu, sd->span))
+ cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
break;
}
#ifdef CONFIG_NO_HZ
static struct {
atomic_t load_balancer;
- cpumask_t cpu_mask;
+ cpumask_var_t cpu_mask;
} nohz ____cacheline_aligned = {
.load_balancer = ATOMIC_INIT(-1),
- .cpu_mask = CPU_MASK_NONE,
};
/*
int cpu = smp_processor_id();
if (stop_tick) {
- cpu_set(cpu, nohz.cpu_mask);
+ cpumask_set_cpu(cpu, nohz.cpu_mask);
cpu_rq(cpu)->in_nohz_recently = 1;
/*
}
/* time for ilb owner also to sleep */
- if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
if (atomic_read(&nohz.load_balancer) == cpu)
atomic_set(&nohz.load_balancer, -1);
return 0;
} else if (atomic_read(&nohz.load_balancer) == cpu)
return 1;
} else {
- if (!cpu_isset(cpu, nohz.cpu_mask))
+ if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
return 0;
- cpu_clear(cpu, nohz.cpu_mask);
+ cpumask_clear_cpu(cpu, nohz.cpu_mask);
if (atomic_read(&nohz.load_balancer) == cpu)
if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize;
- cpumask_t tmp;
+ cpumask_var_t tmp;
+
+ /* Fails alloc? Rebalancing probably not a priority right now. */
+ if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
+ return;
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+ if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
*/
if (likely(update_next_balance))
rq->next_balance = next_balance;
+
+ free_cpumask_var(tmp);
}
/*
*/
if (this_rq->idle_at_tick &&
atomic_read(&nohz.load_balancer) == this_cpu) {
- cpumask_t cpus = nohz.cpu_mask;
struct rq *rq;
int balance_cpu;
- cpu_clear(this_cpu, cpus);
- for_each_cpu_mask_nr(balance_cpu, cpus) {
+ for_each_cpu(balance_cpu, nohz.cpu_mask) {
+ if (balance_cpu == this_cpu)
+ continue;
+
/*
* If this cpu gets work to do, stop the load balancing
* work being done for other cpus. Next load
rq->in_nohz_recently = 0;
if (atomic_read(&nohz.load_balancer) == cpu) {
- cpu_clear(cpu, nohz.cpu_mask);
+ cpumask_clear_cpu(cpu, nohz.cpu_mask);
atomic_set(&nohz.load_balancer, -1);
}
* TBD: Traverse the sched domains and nominate
* the nearest cpu in the nohz.cpu_mask.
*/
- int ilb = first_cpu(nohz.cpu_mask);
+ int ilb = cpumask_first(nohz.cpu_mask);
if (ilb < nr_cpu_ids)
resched_cpu(ilb);
* cpus with ticks stopped, is it time for that to stop?
*/
if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
- cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
resched_cpu(cpu);
return;
}
* someone else, then no need raise the SCHED_SOFTIRQ
*/
if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
- cpu_isset(cpu, nohz.cpu_mask))
+ cpumask_test_cpu(cpu, nohz.cpu_mask))
return;
#endif
if (time_after_eq(jiffies, rq->next_balance))
set_load_weight(p);
}
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred;
+ bool match;
+
+ rcu_read_lock();
+ pcred = __task_cred(p);
+ match = (cred->euid == pcred->euid ||
+ cred->euid == pcred->uid);
+ rcu_read_unlock();
+ return match;
+}
+
static int __sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param, bool user)
{
return -EPERM;
/* can't change other user's priorities */
- if ((current->euid != p->euid) &&
- (current->euid != p->uid))
+ if (!check_same_owner(p))
return -EPERM;
}
return retval;
}
- long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
- cpumask_t cpus_allowed;
- cpumask_t new_mask = *in_mask;
+ cpumask_var_t cpus_allowed, new_mask;
struct task_struct *p;
int retval;
get_task_struct(p);
read_unlock(&tasklist_lock);
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_put_task;
+ }
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
retval = -EPERM;
- if ((current->euid != p->euid) && (current->euid != p->uid) &&
- !capable(CAP_SYS_NICE))
+ if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
goto out_unlock;
retval = security_task_setscheduler(p, 0, NULL);
if (retval)
goto out_unlock;
- cpuset_cpus_allowed(p, &cpus_allowed);
- cpus_and(new_mask, new_mask, cpus_allowed);
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, in_mask, cpus_allowed);
again:
- retval = set_cpus_allowed_ptr(p, &new_mask);
+ retval = set_cpus_allowed_ptr(p, new_mask);
if (!retval) {
- cpuset_cpus_allowed(p, &cpus_allowed);
- if (!cpus_subset(new_mask, cpus_allowed)) {
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
/*
* We must have raced with a concurrent cpuset
* update. Just reset the cpus_allowed to the
* cpuset's cpus_allowed
*/
- new_mask = cpus_allowed;
+ cpumask_copy(new_mask, cpus_allowed);
goto again;
}
}
out_unlock:
+ free_cpumask_var(new_mask);
+ out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+ out_put_task:
put_task_struct(p);
put_online_cpus();
return retval;
}
static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
- cpumask_t *new_mask)
+ struct cpumask *new_mask)
{
- if (len < sizeof(cpumask_t)) {
- memset(new_mask, 0, sizeof(cpumask_t));
- } else if (len > sizeof(cpumask_t)) {
- len = sizeof(cpumask_t);
- }
+ if (len < cpumask_size())
+ cpumask_clear(new_mask);
+ else if (len > cpumask_size())
+ len = cpumask_size();
+
return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
}
asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
unsigned long __user *user_mask_ptr)
{
- cpumask_t new_mask;
+ cpumask_var_t new_mask;
int retval;
- retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
- if (retval)
- return retval;
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+ return -ENOMEM;
- return sched_setaffinity(pid, &new_mask);
+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+ if (retval == 0)
+ retval = sched_setaffinity(pid, new_mask);
+ free_cpumask_var(new_mask);
+ return retval;
}
- long sched_getaffinity(pid_t pid, cpumask_t *mask)
+ long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
struct task_struct *p;
int retval;
if (retval)
goto out_unlock;
- cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+ cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
out_unlock:
read_unlock(&tasklist_lock);
unsigned long __user *user_mask_ptr)
{
int ret;
- cpumask_t mask;
+ cpumask_var_t mask;
- if (len < sizeof(cpumask_t))
+ if (len < cpumask_size())
return -EINVAL;
- ret = sched_getaffinity(pid, &mask);
- if (ret < 0)
- return ret;
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
- if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
- return -EFAULT;
+ ret = sched_getaffinity(pid, mask);
+ if (ret == 0) {
+ if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+ ret = -EFAULT;
+ else
+ ret = cpumask_size();
+ }
+ free_cpumask_var(mask);
- return sizeof(cpumask_t);
+ return ret;
}
/**
idle->se.exec_start = sched_clock();
idle->prio = idle->normal_prio = MAX_PRIO;
- idle->cpus_allowed = cpumask_of_cpu(cpu);
+ cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
__set_task_cpu(idle, cpu);
rq->curr = rq->idle = idle;
* indicates which cpus entered this state. This is used
* in the rcu update to wait only for active cpus. For system
* which do not switch off the HZ timer nohz_cpu_mask should
- * always be CPU_MASK_NONE.
+ * always be CPU_BITS_NONE.
*/
- cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+ cpumask_var_t nohz_cpu_mask;
/*
* Increase the granularity value when there are more CPUs,
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
- int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
+ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
struct migration_req req;
unsigned long flags;
int ret = 0;
rq = task_rq_lock(p, &flags);
- if (!cpus_intersects(*new_mask, cpu_online_map)) {
+ if (!cpumask_intersects(new_mask, cpu_online_mask)) {
ret = -EINVAL;
goto out;
}
if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
- !cpus_equal(p->cpus_allowed, *new_mask))) {
+ !cpumask_equal(&p->cpus_allowed, new_mask))) {
ret = -EINVAL;
goto out;
}
if (p->sched_class->set_cpus_allowed)
p->sched_class->set_cpus_allowed(p, new_mask);
else {
- p->cpus_allowed = *new_mask;
- p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
}
/* Can the task run on the task's current CPU? If so, we're done */
- if (cpu_isset(task_cpu(p), *new_mask))
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
- if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
+ if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
/* Need help from migration thread: drop lock and wait. */
task_rq_unlock(rq, &flags);
wake_up_process(rq->migration_thread);
if (task_cpu(p) != src_cpu)
goto done;
/* Affinity changed (again). */
- if (!cpu_isset(dest_cpu, p->cpus_allowed))
+ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
goto fail;
on_rq = p->se.on_rq;
*/
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
- unsigned long flags;
- cpumask_t mask;
- struct rq *rq;
int dest_cpu;
+ /* FIXME: Use cpumask_of_node here. */
+ cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
+ const struct cpumask *nodemask = &_nodemask;
+
+ again:
+ /* Look for allowed, online CPU in same node. */
+ for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+ goto move;
+
+ /* Any allowed, online CPU? */
+ dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+ if (dest_cpu < nr_cpu_ids)
+ goto move;
+
+ /* No more Mr. Nice Guy. */
+ if (dest_cpu >= nr_cpu_ids) {
+ cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+ dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
- do {
- /* On same node? */
- mask = node_to_cpumask(cpu_to_node(dead_cpu));
- cpus_and(mask, mask, p->cpus_allowed);
- dest_cpu = any_online_cpu(mask);
-
- /* On any allowed CPU? */
- if (dest_cpu >= nr_cpu_ids)
- dest_cpu = any_online_cpu(p->cpus_allowed);
-
- /* No more Mr. Nice Guy. */
- if (dest_cpu >= nr_cpu_ids) {
- cpumask_t cpus_allowed;
-
- cpuset_cpus_allowed_locked(p, &cpus_allowed);
- /*
- * Try to stay on the same cpuset, where the
- * current cpuset may be a subset of all cpus.
- * The cpuset_cpus_allowed_locked() variant of
- * cpuset_cpus_allowed() will not block. It must be
- * called within calls to cpuset_lock/cpuset_unlock.
- */
- rq = task_rq_lock(p, &flags);
- p->cpus_allowed = cpus_allowed;
- dest_cpu = any_online_cpu(p->cpus_allowed);
- task_rq_unlock(rq, &flags);
-
- /*
- * Don't tell them about moving exiting tasks or
- * kernel threads (both mm NULL), since they never
- * leave kernel.
- */
- if (p->mm && printk_ratelimit()) {
- printk(KERN_INFO "process %d (%s) no "
- "longer affine to cpu%d\n",
- task_pid_nr(p), p->comm, dead_cpu);
- }
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk(KERN_INFO "process %d (%s) no "
+ "longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, dead_cpu);
}
- } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
+ }
+
+ move:
+ /* It can have affinity changed while we were choosing. */
+ if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
+ goto again;
}
/*
*/
static void migrate_nr_uninterruptible(struct rq *rq_src)
{
- struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
+ struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
unsigned long flags;
local_irq_save(flags);
if (!rq->online) {
const struct sched_class *class;
- cpu_set(rq->cpu, rq->rd->online);
+ cpumask_set_cpu(rq->cpu, rq->rd->online);
rq->online = 1;
for_each_class(class) {
class->rq_offline(rq);
}
- cpu_clear(rq->cpu, rq->rd->online);
+ cpumask_clear_cpu(rq->cpu, rq->rd->online);
rq->online = 0;
}
}
rq = cpu_rq(cpu);
spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
- BUG_ON(!cpu_isset(cpu, rq->rd->span));
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_online(rq);
}
break;
/* Unbind it from offline cpu so it can run. Fall thru. */
kthread_bind(cpu_rq(cpu)->migration_thread,
- any_online_cpu(cpu_online_map));
+ cpumask_any(cpu_online_mask));
kthread_stop(cpu_rq(cpu)->migration_thread);
cpu_rq(cpu)->migration_thread = NULL;
break;
rq = cpu_rq(cpu);
spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
- BUG_ON(!cpu_isset(cpu, rq->rd->span));
+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
spin_unlock_irqrestore(&rq->lock, flags);
#ifdef CONFIG_SCHED_DEBUG
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
- cpumask_t *groupmask)
+ struct cpumask *groupmask)
{
struct sched_group *group = sd->groups;
char str[256];
- cpulist_scnprintf(str, sizeof(str), sd->span);
- cpus_clear(*groupmask);
+ cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
+ cpumask_clear(groupmask);
printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
printk(KERN_CONT "span %s level %s\n", str, sd->name);
- if (!cpu_isset(cpu, sd->span)) {
+ if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain "
"CPU%d\n", cpu);
}
- if (!cpu_isset(cpu, group->cpumask)) {
+ if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
printk(KERN_ERR "ERROR: domain->groups does not contain"
" CPU%d\n", cpu);
}
break;
}
- if (!cpus_weight(group->cpumask)) {
+ if (!cpumask_weight(sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
break;
}
- if (cpus_intersects(*groupmask, group->cpumask)) {
+ if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: repeated CPUs\n");
break;
}
- cpus_or(*groupmask, *groupmask, group->cpumask);
+ cpumask_or(groupmask, groupmask, sched_group_cpus(group));
- cpulist_scnprintf(str, sizeof(str), group->cpumask);
+ cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
group = group->next;
} while (group != sd->groups);
printk(KERN_CONT "\n");
- if (!cpus_equal(sd->span, *groupmask))
+ if (!cpumask_equal(sched_domain_span(sd), groupmask))
printk(KERN_ERR "ERROR: groups don't span domain->span\n");
- if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
+ if (sd->parent &&
+ !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
printk(KERN_ERR "ERROR: parent span is not a superset "
"of domain->span\n");
return 0;
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
- cpumask_t *groupmask;
+ cpumask_var_t groupmask;
int level = 0;
if (!sd) {
printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
- groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
- if (!groupmask) {
+ if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
return;
}
if (!sd)
break;
}
- kfree(groupmask);
+ free_cpumask_var(groupmask);
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
static int sd_degenerate(struct sched_domain *sd)
{
- if (cpus_weight(sd->span) == 1)
+ if (cpumask_weight(sched_domain_span(sd)) == 1)
return 1;
/* Following flags need at least 2 groups */
if (sd_degenerate(parent))
return 1;
- if (!cpus_equal(sd->span, parent->span))
+ if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
return 0;
/* Does parent contain flags not in child? */
return 1;
}
+ static void free_rootdomain(struct root_domain *rd)
+ {
+ cpupri_cleanup(&rd->cpupri);
+
+ free_cpumask_var(rd->rto_mask);
+ free_cpumask_var(rd->online);
+ free_cpumask_var(rd->span);
+ kfree(rd);
+ }
+
static void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
unsigned long flags;
if (rq->rd) {
struct root_domain *old_rd = rq->rd;
- if (cpu_isset(rq->cpu, old_rd->online))
+ if (cpumask_test_cpu(rq->cpu, old_rd->online))
set_rq_offline(rq);
- cpu_clear(rq->cpu, old_rd->span);
+ cpumask_clear_cpu(rq->cpu, old_rd->span);
if (atomic_dec_and_test(&old_rd->refcount))
- kfree(old_rd);
+ free_rootdomain(old_rd);
}
atomic_inc(&rd->refcount);
rq->rd = rd;
- cpu_set(rq->cpu, rd->span);
- if (cpu_isset(rq->cpu, cpu_online_map))
+ cpumask_set_cpu(rq->cpu, rd->span);
+ if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
set_rq_online(rq);
spin_unlock_irqrestore(&rq->lock, flags);
}
- static void init_rootdomain(struct root_domain *rd)
+ static int init_rootdomain(struct root_domain *rd, bool bootmem)
{
memset(rd, 0, sizeof(*rd));
- cpus_clear(rd->span);
- cpus_clear(rd->online);
+ if (bootmem) {
+ alloc_bootmem_cpumask_var(&def_root_domain.span);
+ alloc_bootmem_cpumask_var(&def_root_domain.online);
+ alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
+ cpupri_init(&rd->cpupri, true);
+ return 0;
+ }
+
+ if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+ goto free_rd;
+ if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+ goto free_span;
+ if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+ goto free_online;
+
+ if (cpupri_init(&rd->cpupri, false) != 0)
+ goto free_rto_mask;
+ return 0;
- cpupri_init(&rd->cpupri);
+ free_rto_mask:
+ free_cpumask_var(rd->rto_mask);
+ free_online:
+ free_cpumask_var(rd->online);
+ free_span:
+ free_cpumask_var(rd->span);
+ free_rd:
+ kfree(rd);
+ return -ENOMEM;
}
static void init_defrootdomain(void)
{
- init_rootdomain(&def_root_domain);
+ init_rootdomain(&def_root_domain, true);
+
atomic_set(&def_root_domain.refcount, 1);
}
if (!rd)
return NULL;
- init_rootdomain(rd);
+ if (init_rootdomain(rd, false) != 0) {
+ kfree(rd);
+ return NULL;
+ }
return rd;
}
}
/* cpus with isolated domains */
- static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
+ static cpumask_var_t cpu_isolated_map;
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
- static int __initdata ints[NR_CPUS];
- int i;
-
- str = get_options(str, ARRAY_SIZE(ints), ints);
- cpus_clear(cpu_isolated_map);
- for (i = 1; i <= ints[0]; i++)
- if (ints[i] < NR_CPUS)
- cpu_set(ints[i], cpu_isolated_map);
+ cpulist_parse(str, cpu_isolated_map);
return 1;
}
/*
* init_sched_build_groups takes the cpumask we wish to span, and a pointer
* to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
- * (due to the fact that we keep track of groups covered with a cpumask_t).
+ * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
+ * (due to the fact that we keep track of groups covered with a struct cpumask).
*
* init_sched_build_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
static void
- init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
- int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+ init_sched_build_groups(const struct cpumask *span,
+ const struct cpumask *cpu_map,
+ int (*group_fn)(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg,
- cpumask_t *tmpmask),
- cpumask_t *covered, cpumask_t *tmpmask)
+ struct cpumask *tmpmask),
+ struct cpumask *covered, struct cpumask *tmpmask)
{
struct sched_group *first = NULL, *last = NULL;
int i;
- cpus_clear(*covered);
+ cpumask_clear(covered);
- for_each_cpu_mask_nr(i, *span) {
+ for_each_cpu(i, span) {
struct sched_group *sg;
int group = group_fn(i, cpu_map, &sg, tmpmask);
int j;
- if (cpu_isset(i, *covered))
+ if (cpumask_test_cpu(i, covered))
continue;
- cpus_clear(sg->cpumask);
+ cpumask_clear(sched_group_cpus(sg));
sg->__cpu_power = 0;
- for_each_cpu_mask_nr(j, *span) {
+ for_each_cpu(j, span) {
if (group_fn(j, cpu_map, NULL, tmpmask) != group)
continue;
- cpu_set(j, *covered);
- cpu_set(j, sg->cpumask);
+ cpumask_set_cpu(j, covered);
+ cpumask_set_cpu(j, sched_group_cpus(sg));
}
if (!first)
first = sg;
* should be one that prevents unnecessary balancing, but also spreads tasks
* out optimally.
*/
- static void sched_domain_node_span(int node, cpumask_t *span)
+ static void sched_domain_node_span(int node, struct cpumask *span)
{
nodemask_t used_nodes;
+ /* FIXME: use cpumask_of_node() */
node_to_cpumask_ptr(nodemask, node);
int i;
int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+ /*
+ * The cpus mask in sched_group and sched_domain hangs off the end.
+ * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
+ * for nr_cpu_ids < CONFIG_NR_CPUS.
+ */
+ struct static_sched_group {
+ struct sched_group sg;
+ DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
+ };
+
+ struct static_sched_domain {
+ struct sched_domain sd;
+ DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+ };
+
/*
* SMT sched-domains:
*/
#ifdef CONFIG_SCHED_SMT
- static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+ static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
static int
- cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
- cpumask_t *unused)
+ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
+ struct sched_group **sg, struct cpumask *unused)
{
if (sg)
- *sg = &per_cpu(sched_group_cpus, cpu);
+ *sg = &per_cpu(sched_group_cpus, cpu).sg;
return cpu;
}
#endif /* CONFIG_SCHED_SMT */
* multi-core sched-domains:
*/
#ifdef CONFIG_SCHED_MC
- static DEFINE_PER_CPU(struct sched_domain, core_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+ static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
#endif /* CONFIG_SCHED_MC */
#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
static int
- cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
- cpumask_t *mask)
+ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+ struct sched_group **sg, struct cpumask *mask)
{
int group;
- *mask = per_cpu(cpu_sibling_map, cpu);
- cpus_and(*mask, *mask, *cpu_map);
- group = first_cpu(*mask);
+ cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+ group = cpumask_first(mask);
if (sg)
- *sg = &per_cpu(sched_group_core, group);
+ *sg = &per_cpu(sched_group_core, group).sg;
return group;
}
#elif defined(CONFIG_SCHED_MC)
static int
- cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
- cpumask_t *unused)
+ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+ struct sched_group **sg, struct cpumask *unused)
{
if (sg)
- *sg = &per_cpu(sched_group_core, cpu);
+ *sg = &per_cpu(sched_group_core, cpu).sg;
return cpu;
}
#endif
- static DEFINE_PER_CPU(struct sched_domain, phys_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
static int
- cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
- cpumask_t *mask)
+ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
+ struct sched_group **sg, struct cpumask *mask)
{
int group;
#ifdef CONFIG_SCHED_MC
+ /* FIXME: Use cpu_coregroup_mask. */
*mask = cpu_coregroup_map(cpu);
cpus_and(*mask, *mask, *cpu_map);
- group = first_cpu(*mask);
+ group = cpumask_first(mask);
#elif defined(CONFIG_SCHED_SMT)
- *mask = per_cpu(cpu_sibling_map, cpu);
- cpus_and(*mask, *mask, *cpu_map);
- group = first_cpu(*mask);
+ cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+ group = cpumask_first(mask);
#else
group = cpu;
#endif
if (sg)
- *sg = &per_cpu(sched_group_phys, group);
+ *sg = &per_cpu(sched_group_phys, group).sg;
return group;
}
static struct sched_group ***sched_group_nodes_bycpu;
static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
- static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
- struct sched_group **sg, cpumask_t *nodemask)
+ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+ struct sched_group **sg,
+ struct cpumask *nodemask)
{
int group;
+ /* FIXME: use cpumask_of_node */
+ node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
- *nodemask = node_to_cpumask(cpu_to_node(cpu));
- cpus_and(*nodemask, *nodemask, *cpu_map);
- group = first_cpu(*nodemask);
+ cpumask_and(nodemask, pnodemask, cpu_map);
+ group = cpumask_first(nodemask);
if (sg)
- *sg = &per_cpu(sched_group_allnodes, group);
+ *sg = &per_cpu(sched_group_allnodes, group).sg;
return group;
}
if (!sg)
return;
do {
- for_each_cpu_mask_nr(j, sg->cpumask) {
+ for_each_cpu(j, sched_group_cpus(sg)) {
struct sched_domain *sd;
- sd = &per_cpu(phys_domains, j);
- if (j != first_cpu(sd->groups->cpumask)) {
+ sd = &per_cpu(phys_domains, j).sd;
+ if (j != cpumask_first(sched_group_cpus(sd->groups))) {
/*
* Only add "power" once for each
* physical package.
#ifdef CONFIG_NUMA
/* Free memory allocated for various sched_group structures */
- static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ static void free_sched_groups(const struct cpumask *cpu_map,
+ struct cpumask *nodemask)
{
int cpu, i;
- for_each_cpu_mask_nr(cpu, *cpu_map) {
+ for_each_cpu(cpu, cpu_map) {
struct sched_group **sched_group_nodes
= sched_group_nodes_bycpu[cpu];
for (i = 0; i < nr_node_ids; i++) {
struct sched_group *oldsg, *sg = sched_group_nodes[i];
+ /* FIXME: Use cpumask_of_node */
+ node_to_cpumask_ptr(pnodemask, i);
- *nodemask = node_to_cpumask(i);
- cpus_and(*nodemask, *nodemask, *cpu_map);
- if (cpus_empty(*nodemask))
+ cpus_and(*nodemask, *pnodemask, *cpu_map);
+ if (cpumask_empty(nodemask))
continue;
if (sg == NULL)
}
}
#else /* !CONFIG_NUMA */
- static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ static void free_sched_groups(const struct cpumask *cpu_map,
+ struct cpumask *nodemask)
{
}
#endif /* CONFIG_NUMA */
WARN_ON(!sd || !sd->groups);
- if (cpu != first_cpu(sd->groups->cpumask))
+ if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
return;
child = sd->child;
SD_INIT_FUNC(MC)
#endif
- /*
- * To minimize stack usage kmalloc room for cpumasks and share the
- * space as the usage in build_sched_domains() dictates. Used only
- * if the amount of space is significant.
- */
- struct allmasks {
- cpumask_t tmpmask; /* make this one first */
- union {
- cpumask_t nodemask;
- cpumask_t this_sibling_map;
- cpumask_t this_core_map;
- };
- cpumask_t send_covered;
-
- #ifdef CONFIG_NUMA
- cpumask_t domainspan;
- cpumask_t covered;
- cpumask_t notcovered;
- #endif
- };
-
- #if NR_CPUS > 128
- #define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
- static inline void sched_cpumask_alloc(struct allmasks **masks)
- {
- *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
- }
- static inline void sched_cpumask_free(struct allmasks *masks)
- {
- kfree(masks);
- }
- #else
- #define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
- static inline void sched_cpumask_alloc(struct allmasks **masks)
- { }
- static inline void sched_cpumask_free(struct allmasks *masks)
- { }
- #endif
-
- #define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
- ((unsigned long)(a) + offsetof(struct allmasks, v))
-
static int default_relax_domain_level = -1;
static int __init setup_relax_domain_level(char *str)
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
*/
- static int __build_sched_domains(const cpumask_t *cpu_map,
+ static int __build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_attr *attr)
{
- int i;
+ int i, err = -ENOMEM;
struct root_domain *rd;
- SCHED_CPUMASK_DECLARE(allmasks);
- cpumask_t *tmpmask;
+ cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
+ tmpmask;
#ifdef CONFIG_NUMA
+ cpumask_var_t domainspan, covered, notcovered;
struct sched_group **sched_group_nodes = NULL;
int sd_allnodes = 0;
+ if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
+ goto out;
+ if (!alloc_cpumask_var(&covered, GFP_KERNEL))
+ goto free_domainspan;
+ if (!alloc_cpumask_var(¬covered, GFP_KERNEL))
+ goto free_covered;
+ #endif
+
+ if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
+ goto free_notcovered;
+ if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
+ goto free_nodemask;
+ if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
+ goto free_this_sibling_map;
+ if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
+ goto free_this_core_map;
+ if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ goto free_send_covered;
+
+ #ifdef CONFIG_NUMA
/*
* Allocate the per-node list of sched groups
*/
GFP_KERNEL);
if (!sched_group_nodes) {
printk(KERN_WARNING "Can not alloc sched group node list\n");
- return -ENOMEM;
+ goto free_tmpmask;
}
#endif
rd = alloc_rootdomain();
if (!rd) {
printk(KERN_WARNING "Cannot alloc root domain\n");
- #ifdef CONFIG_NUMA
- kfree(sched_group_nodes);
- #endif
- return -ENOMEM;
+ goto free_sched_groups;
}
- /* get space for all scratch cpumask variables */
- sched_cpumask_alloc(&allmasks);
- if (!allmasks) {
- printk(KERN_WARNING "Cannot alloc cpumask array\n");
- kfree(rd);
#ifdef CONFIG_NUMA
- kfree(sched_group_nodes);
- #endif
- return -ENOMEM;
- }
-
- tmpmask = (cpumask_t *)allmasks;
-
-
- #ifdef CONFIG_NUMA
- sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+ sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
#endif
/*
* Set up domains for cpus specified by the cpu_map.
*/
- for_each_cpu_mask_nr(i, *cpu_map) {
+ for_each_cpu(i, cpu_map) {
struct sched_domain *sd = NULL, *p;
- SCHED_CPUMASK_VAR(nodemask, allmasks);
+ /* FIXME: use cpumask_of_node */
*nodemask = node_to_cpumask(cpu_to_node(i));
cpus_and(*nodemask, *nodemask, *cpu_map);
#ifdef CONFIG_NUMA
- if (cpus_weight(*cpu_map) >
- SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
+ if (cpumask_weight(cpu_map) >
+ SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
sd = &per_cpu(allnodes_domains, i);
SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr);
- sd->span = *cpu_map;
+ cpumask_copy(sched_domain_span(sd), cpu_map);
cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
p = sd;
sd_allnodes = 1;
sd = &per_cpu(node_domains, i);
SD_INIT(sd, NODE);
set_domain_attribute(sd, attr);
- sched_domain_node_span(cpu_to_node(i), &sd->span);
+ sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
sd->parent = p;
if (p)
p->child = sd;
- cpus_and(sd->span, sd->span, *cpu_map);
+ cpumask_and(sched_domain_span(sd),
+ sched_domain_span(sd), cpu_map);
#endif
p = sd;
- sd = &per_cpu(phys_domains, i);
+ sd = &per_cpu(phys_domains, i).sd;
SD_INIT(sd, CPU);
set_domain_attribute(sd, attr);
- sd->span = *nodemask;
+ cpumask_copy(sched_domain_span(sd), nodemask);
sd->parent = p;
if (p)
p->child = sd;
#ifdef CONFIG_SCHED_MC
p = sd;
- sd = &per_cpu(core_domains, i);
+ sd = &per_cpu(core_domains, i).sd;
SD_INIT(sd, MC);
set_domain_attribute(sd, attr);
- sd->span = cpu_coregroup_map(i);
- cpus_and(sd->span, sd->span, *cpu_map);
+ *sched_domain_span(sd) = cpu_coregroup_map(i);
+ cpumask_and(sched_domain_span(sd),
+ sched_domain_span(sd), cpu_map);
sd->parent = p;
p->child = sd;
cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
#ifdef CONFIG_SCHED_SMT
p = sd;
- sd = &per_cpu(cpu_domains, i);
+ sd = &per_cpu(cpu_domains, i).sd;
SD_INIT(sd, SIBLING);
set_domain_attribute(sd, attr);
- sd->span = per_cpu(cpu_sibling_map, i);
- cpus_and(sd->span, sd->span, *cpu_map);
+ cpumask_and(sched_domain_span(sd),
+ &per_cpu(cpu_sibling_map, i), cpu_map);
sd->parent = p;
p->child = sd;
cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
#ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */
- for_each_cpu_mask_nr(i, *cpu_map) {
- SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
- SCHED_CPUMASK_VAR(send_covered, allmasks);
-
- *this_sibling_map = per_cpu(cpu_sibling_map, i);
- cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
- if (i != first_cpu(*this_sibling_map))
+ for_each_cpu(i, cpu_map) {
+ cpumask_and(this_sibling_map,
+ &per_cpu(cpu_sibling_map, i), cpu_map);
+ if (i != cpumask_first(this_sibling_map))
continue;
init_sched_build_groups(this_sibling_map, cpu_map,
#ifdef CONFIG_SCHED_MC
/* Set up multi-core groups */
- for_each_cpu_mask_nr(i, *cpu_map) {
- SCHED_CPUMASK_VAR(this_core_map, allmasks);
- SCHED_CPUMASK_VAR(send_covered, allmasks);
-
+ for_each_cpu(i, cpu_map) {
+ /* FIXME: Use cpu_coregroup_mask */
*this_core_map = cpu_coregroup_map(i);
cpus_and(*this_core_map, *this_core_map, *cpu_map);
- if (i != first_cpu(*this_core_map))
+ if (i != cpumask_first(this_core_map))
continue;
init_sched_build_groups(this_core_map, cpu_map,
/* Set up physical groups */
for (i = 0; i < nr_node_ids; i++) {
- SCHED_CPUMASK_VAR(nodemask, allmasks);
- SCHED_CPUMASK_VAR(send_covered, allmasks);
-
+ /* FIXME: Use cpumask_of_node */
*nodemask = node_to_cpumask(i);
cpus_and(*nodemask, *nodemask, *cpu_map);
- if (cpus_empty(*nodemask))
+ if (cpumask_empty(nodemask))
continue;
init_sched_build_groups(nodemask, cpu_map,
#ifdef CONFIG_NUMA
/* Set up node groups */
if (sd_allnodes) {
- SCHED_CPUMASK_VAR(send_covered, allmasks);
-
init_sched_build_groups(cpu_map, cpu_map,
&cpu_to_allnodes_group,
send_covered, tmpmask);
for (i = 0; i < nr_node_ids; i++) {
/* Set up node groups */
struct sched_group *sg, *prev;
- SCHED_CPUMASK_VAR(nodemask, allmasks);
- SCHED_CPUMASK_VAR(domainspan, allmasks);
- SCHED_CPUMASK_VAR(covered, allmasks);
int j;
+ /* FIXME: Use cpumask_of_node */
*nodemask = node_to_cpumask(i);
- cpus_clear(*covered);
+ cpumask_clear(covered);
cpus_and(*nodemask, *nodemask, *cpu_map);
- if (cpus_empty(*nodemask)) {
+ if (cpumask_empty(nodemask)) {
sched_group_nodes[i] = NULL;
continue;
}
sched_domain_node_span(i, domainspan);
- cpus_and(*domainspan, *domainspan, *cpu_map);
+ cpumask_and(domainspan, domainspan, cpu_map);
- sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+ sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, i);
if (!sg) {
printk(KERN_WARNING "Can not alloc domain group for "
"node %d\n", i);
goto error;
}
sched_group_nodes[i] = sg;
- for_each_cpu_mask_nr(j, *nodemask) {
+ for_each_cpu(j, nodemask) {
struct sched_domain *sd;
sd = &per_cpu(node_domains, j);
sd->groups = sg;
}
sg->__cpu_power = 0;
- sg->cpumask = *nodemask;
+ cpumask_copy(sched_group_cpus(sg), nodemask);
sg->next = sg;
- cpus_or(*covered, *covered, *nodemask);
+ cpumask_or(covered, covered, nodemask);
prev = sg;
for (j = 0; j < nr_node_ids; j++) {
- SCHED_CPUMASK_VAR(notcovered, allmasks);
int n = (i + j) % nr_node_ids;
+ /* FIXME: Use cpumask_of_node */
node_to_cpumask_ptr(pnodemask, n);
- cpus_complement(*notcovered, *covered);
- cpus_and(*tmpmask, *notcovered, *cpu_map);
- cpus_and(*tmpmask, *tmpmask, *domainspan);
- if (cpus_empty(*tmpmask))
+ cpumask_complement(notcovered, covered);
+ cpumask_and(tmpmask, notcovered, cpu_map);
+ cpumask_and(tmpmask, tmpmask, domainspan);
+ if (cpumask_empty(tmpmask))
break;
- cpus_and(*tmpmask, *tmpmask, *pnodemask);
- if (cpus_empty(*tmpmask))
+ cpumask_and(tmpmask, tmpmask, pnodemask);
+ if (cpumask_empty(tmpmask))
continue;
- sg = kmalloc_node(sizeof(struct sched_group),
+ sg = kmalloc_node(sizeof(struct sched_group) +
+ cpumask_size(),
GFP_KERNEL, i);
if (!sg) {
printk(KERN_WARNING
goto error;
}
sg->__cpu_power = 0;
- sg->cpumask = *tmpmask;
+ cpumask_copy(sched_group_cpus(sg), tmpmask);
sg->next = prev->next;
- cpus_or(*covered, *covered, *tmpmask);
+ cpumask_or(covered, covered, tmpmask);
prev->next = sg;
prev = sg;
}
/* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT
- for_each_cpu_mask_nr(i, *cpu_map) {
- struct sched_domain *sd = &per_cpu(cpu_domains, i);
+ for_each_cpu(i, cpu_map) {
+ struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
init_sched_groups_power(i, sd);
}
#endif
#ifdef CONFIG_SCHED_MC
- for_each_cpu_mask_nr(i, *cpu_map) {
- struct sched_domain *sd = &per_cpu(core_domains, i);
+ for_each_cpu(i, cpu_map) {
+ struct sched_domain *sd = &per_cpu(core_domains, i).sd;
init_sched_groups_power(i, sd);
}
#endif
- for_each_cpu_mask_nr(i, *cpu_map) {
- struct sched_domain *sd = &per_cpu(phys_domains, i);
+ for_each_cpu(i, cpu_map) {
+ struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
init_sched_groups_power(i, sd);
}
if (sd_allnodes) {
struct sched_group *sg;
- cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
+ cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
tmpmask);
init_numa_sched_groups_power(sg);
}
#endif
/* Attach the domains */
- for_each_cpu_mask_nr(i, *cpu_map) {
+ for_each_cpu(i, cpu_map) {
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i);
+ sd = &per_cpu(cpu_domains, i).sd;
#elif defined(CONFIG_SCHED_MC)
- sd = &per_cpu(core_domains, i);
+ sd = &per_cpu(core_domains, i).sd;
#else
- sd = &per_cpu(phys_domains, i);
+ sd = &per_cpu(phys_domains, i).sd;
#endif
cpu_attach_domain(sd, rd, i);
}
- sched_cpumask_free(allmasks);
- return 0;
+ err = 0;
+
+ free_tmpmask:
+ free_cpumask_var(tmpmask);
+ free_send_covered:
+ free_cpumask_var(send_covered);
+ free_this_core_map:
+ free_cpumask_var(this_core_map);
+ free_this_sibling_map:
+ free_cpumask_var(this_sibling_map);
+ free_nodemask:
+ free_cpumask_var(nodemask);
+ free_notcovered:
+ #ifdef CONFIG_NUMA
+ free_cpumask_var(notcovered);
+ free_covered:
+ free_cpumask_var(covered);
+ free_domainspan:
+ free_cpumask_var(domainspan);
+ out:
+ #endif
+ return err;
+
+ free_sched_groups:
+ #ifdef CONFIG_NUMA
+ kfree(sched_group_nodes);
+ #endif
+ goto free_tmpmask;
#ifdef CONFIG_NUMA
error:
free_sched_groups(cpu_map, tmpmask);
- sched_cpumask_free(allmasks);
- kfree(rd);
- return -ENOMEM;
+ free_rootdomain(rd);
+ goto free_tmpmask;
#endif
}
- static int build_sched_domains(const cpumask_t *cpu_map)
+ static int build_sched_domains(const struct cpumask *cpu_map)
{
return __build_sched_domains(cpu_map, NULL);
}
- static cpumask_t *doms_cur; /* current sched domains */
+ static struct cpumask *doms_cur; /* current sched domains */
static int ndoms_cur; /* number of sched domains in 'doms_cur' */
static struct sched_domain_attr *dattr_cur;
/* attribues of custom domains in 'doms_cur' */
/*
* Special case: If a kmalloc of a doms_cur partition (array of
- * cpumask_t) fails, then fallback to a single sched domain,
- * as determined by the single cpumask_t fallback_doms.
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
*/
- static cpumask_t fallback_doms;
+ static cpumask_var_t fallback_doms;
/*
* arch_update_cpu_topology lets virtualized architectures update the
* For now this just excludes isolated cpus, but could be used to
* exclude other special cases in the future.
*/
- static int arch_init_sched_domains(const cpumask_t *cpu_map)
+ static int arch_init_sched_domains(const struct cpumask *cpu_map)
{
int err;
arch_update_cpu_topology();
ndoms_cur = 1;
- doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+ doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
if (!doms_cur)
- doms_cur = &fallback_doms;
- cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+ doms_cur = fallback_doms;
+ cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
dattr_cur = NULL;
err = build_sched_domains(doms_cur);
register_sched_domain_sysctl();
return err;
}
- static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
- cpumask_t *tmpmask)
+ static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
+ struct cpumask *tmpmask)
{
free_sched_groups(cpu_map, tmpmask);
}
* Detach sched domains from a group of cpus specified in cpu_map
* These cpus will now be attached to the NULL domain
*/
- static void detach_destroy_domains(const cpumask_t *cpu_map)
+ static void detach_destroy_domains(const struct cpumask *cpu_map)
{
- cpumask_t tmpmask;
+ /* Save because hotplug lock held. */
+ static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
int i;
- for_each_cpu_mask_nr(i, *cpu_map)
+ for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
synchronize_sched();
- arch_destroy_sched_domains(cpu_map, &tmpmask);
+ arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
}
/* handle null as "default" */
* doms_new[] to the current sched domain partitioning, doms_cur[].
* It destroys each deleted domain and builds each new domain.
*
- * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+ * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
* The masks don't intersect (don't overlap.) We should setup one
* sched domain for each mask. CPUs not in any of the cpumasks will
* not be load balanced. If the same cpumask appears both in the
* the single partition 'fallback_doms', it also forces the domains
* to be rebuilt.
*
- * If doms_new == NULL it will be replaced with cpu_online_map.
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
* ndoms_new == 0 is a special case for destroying existing domains,
* and it will not create the default domain.
*
* Call with hotplug lock held
*/
- void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ /* FIXME: Change to struct cpumask *doms_new[] */
+ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
struct sched_domain_attr *dattr_new)
{
int i, j, n;
/* Destroy deleted domains */
for (i = 0; i < ndoms_cur; i++) {
for (j = 0; j < n && !new_topology; j++) {
- if (cpus_equal(doms_cur[i], doms_new[j])
+ if (cpumask_equal(&doms_cur[i], &doms_new[j])
&& dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
}
if (doms_new == NULL) {
ndoms_cur = 0;
- doms_new = &fallback_doms;
- cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+ doms_new = fallback_doms;
+ cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
WARN_ON_ONCE(dattr_new);
}
/* Build new domains */
for (i = 0; i < ndoms_new; i++) {
for (j = 0; j < ndoms_cur && !new_topology; j++) {
- if (cpus_equal(doms_new[i], doms_cur[j])
+ if (cpumask_equal(&doms_new[i], &doms_cur[j])
&& dattrs_equal(dattr_new, i, dattr_cur, j))
goto match2;
}
}
/* Remember the new sched domains */
- if (doms_cur != &fallback_doms)
+ if (doms_cur != fallback_doms)
kfree(doms_cur);
kfree(dattr_cur); /* kfree(NULL) is safe */
doms_cur = doms_new;
static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
{
int ret;
+ unsigned int level = 0;
- if (buf[0] != '0' && buf[0] != '1')
+ if (sscanf(buf, "%u", &level) != 1)
+ return -EINVAL;
+
+ /*
+ * level is always be positive so don't check for
+ * level < POWERSAVINGS_BALANCE_NONE which is 0
+ * What happens on 0 or 1 byte write,
+ * need to check for count as well?
+ */
+
+ if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
return -EINVAL;
if (smt)
- sched_smt_power_savings = (buf[0] == '1');
+ sched_smt_power_savings = level;
else
- sched_mc_power_savings = (buf[0] == '1');
+ sched_mc_power_savings = level;
ret = arch_reinit_sched_domains();
void __init sched_init_smp(void)
{
- cpumask_t non_isolated_cpus;
+ cpumask_var_t non_isolated_cpus;
+
+ alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
#if defined(CONFIG_NUMA)
sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
#endif
get_online_cpus();
mutex_lock(&sched_domains_mutex);
- arch_init_sched_domains(&cpu_online_map);
- cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
- if (cpus_empty(non_isolated_cpus))
- cpu_set(smp_processor_id(), non_isolated_cpus);
+ arch_init_sched_domains(cpu_online_mask);
+ cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+ if (cpumask_empty(non_isolated_cpus))
+ cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
put_online_cpus();
init_hrtick();
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
+ if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
BUG();
sched_init_granularity();
+ free_cpumask_var(non_isolated_cpus);
+
+ alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+ init_sched_rt_class();
}
#else
void __init sched_init_smp(void)
*/
current->sched_class = &fair_sched_class;
+ /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
+ alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ #ifdef CONFIG_SMP
+ #ifdef CONFIG_NO_HZ
+ alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+ #endif
+ alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ #endif /* SMP */
+
scheduler_running = 1;
}
kfree(ca);
}
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+ u64 data;
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+ */
+ spin_lock_irq(&cpu_rq(cpu)->lock);
+ data = *cpuusage;
+ spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ data = *cpuusage;
+#endif
+
+ return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+ u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+ */
+ spin_lock_irq(&cpu_rq(cpu)->lock);
+ *cpuusage = val;
+ spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ *cpuusage = val;
+#endif
+}
+
/* return total cpu usage (in nanoseconds) of a group */
static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
{
u64 totalcpuusage = 0;
int i;
- for_each_possible_cpu(i) {
- u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
- /*
- * Take rq->lock to make 64-bit addition safe on 32-bit
- * platforms.
- */
- spin_lock_irq(&cpu_rq(i)->lock);
- totalcpuusage += *cpuusage;
- spin_unlock_irq(&cpu_rq(i)->lock);
- }
+ for_each_present_cpu(i)
+ totalcpuusage += cpuacct_cpuusage_read(ca, i);
return totalcpuusage;
}
goto out;
}
- for_each_possible_cpu(i) {
- u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+ for_each_present_cpu(i)
+ cpuacct_cpuusage_write(ca, i, 0);
- spin_lock_irq(&cpu_rq(i)->lock);
- *cpuusage = 0;
- spin_unlock_irq(&cpu_rq(i)->lock);
- }
out:
return err;
}
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct cpuacct *ca = cgroup_ca(cgroup);
+ u64 percpu;
+ int i;
+
+ for_each_present_cpu(i) {
+ percpu = cpuacct_cpuusage_read(ca, i);
+ seq_printf(m, "%llu ", (unsigned long long) percpu);
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
static struct cftype files[] = {
{
.name = "usage",
.read_u64 = cpuusage_read,
.write_u64 = cpuusage_write,
},
+ {
+ .name = "usage_percpu",
+ .read_seq_string = cpuacct_percpu_seq_read,
+ },
+
};
static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
* overflow on 32 bits):
*/
delta_exec = (unsigned long)(now - curr->exec_start);
+ if (!delta_exec)
+ return;
__update_curr(cfs_rq, curr, delta_exec);
curr->exec_start = now;
* search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu.
* Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (cpu_active_map)
+ * hence we need to mask them out (cpu_active_mask)
*
* Returns the CPU we should wake onto.
*/
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
- cpumask_t tmp;
struct sched_domain *sd;
int i;
+ unsigned int chosen_wakeup_cpu;
+ int this_cpu;
+
+ /*
+ * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
+ * are idle and this is not a kernel thread and this task's affinity
+ * allows it to be moved to preferred cpu, then just move!
+ */
+
+ this_cpu = smp_processor_id();
+ chosen_wakeup_cpu =
+ cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
+
+ if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
+ idle_cpu(cpu) && idle_cpu(this_cpu) &&
+ p->mm && !(p->flags & PF_KTHREAD) &&
+ cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
+ return chosen_wakeup_cpu;
/*
* If it is idle, then it is the best cpu to run this task.
if ((sd->flags & SD_WAKE_IDLE)
|| ((sd->flags & SD_WAKE_IDLE_FAR)
&& !task_hot(p, task_rq(p)->clock, sd))) {
- cpus_and(tmp, sd->span, p->cpus_allowed);
- cpus_and(tmp, tmp, cpu_active_map);
- for_each_cpu_mask_nr(i, tmp) {
- if (idle_cpu(i)) {
+ for_each_cpu_and(i, sched_domain_span(sd),
+ &p->cpus_allowed) {
+ if (cpu_active(i) && idle_cpu(i)) {
if (i != task_cpu(p)) {
schedstat_inc(p,
se.nr_wakeups_idle);
* this_cpu and prev_cpu are present in:
*/
for_each_domain(this_cpu, sd) {
- if (cpu_isset(prev_cpu, sd->span)) {
+ if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
this_sd = sd;
break;
}
}
- if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+ if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
goto out;
/*
{
struct task_struct *curr = rq->curr;
struct sched_entity *se = &curr->se, *pse = &p->se;
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- if (unlikely(rt_prio(p->prio))) {
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ update_curr(cfs_rq);
- update_rq_clock(rq);
- update_curr(cfs_rq);
+ if (unlikely(rt_prio(p->prio))) {
resched_task(curr);
return;
}
if (!rq->online)
return;
- cpu_set(rq->cpu, rq->rd->rto_mask);
+ cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
/*
* Make sure the mask is visible before we set
* the overload count. That is checked to determine
/* the order here really doesn't matter */
atomic_dec(&rq->rd->rto_count);
- cpu_clear(rq->cpu, rq->rd->rto_mask);
+ cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
static void update_rt_migration(struct rq *rq)
}
#define for_each_leaf_rt_rq(rt_rq, rq) \
- list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
+ list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
}
#ifdef CONFIG_SMP
- static inline cpumask_t sched_rt_period_mask(void)
+ static inline const struct cpumask *sched_rt_period_mask(void)
{
return cpu_rq(smp_processor_id())->rd->span;
}
#else
- static inline cpumask_t sched_rt_period_mask(void)
+ static inline const struct cpumask *sched_rt_period_mask(void)
{
- return cpu_online_map;
+ return cpu_online_mask;
}
#endif
return rt_rq->rt_throttled;
}
- static inline cpumask_t sched_rt_period_mask(void)
+ static inline const struct cpumask *sched_rt_period_mask(void)
{
- return cpu_online_map;
+ return cpu_online_mask;
}
static inline
int i, weight, more = 0;
u64 rt_period;
- weight = cpus_weight(rd->span);
+ weight = cpumask_weight(rd->span);
spin_lock(&rt_b->rt_runtime_lock);
rt_period = ktime_to_ns(rt_b->rt_period);
- for_each_cpu_mask_nr(i, rd->span) {
+ for_each_cpu(i, rd->span) {
struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
s64 diff;
/*
* Greedy reclaim, take back as much as we can.
*/
- for_each_cpu_mask(i, rd->span) {
+ for_each_cpu(i, rd->span) {
struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
s64 diff;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
int i, idle = 1;
- cpumask_t span;
+ const struct cpumask *span;
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return 1;
span = sched_rt_period_mask();
- for_each_cpu_mask(i, span) {
+ for_each_cpu(i, span) {
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- cpumask_t mask;
+ cpumask_var_t mask;
if (rq->curr->rt.nr_cpus_allowed == 1)
return;
- if (p->rt.nr_cpus_allowed != 1
- && cpupri_find(&rq->rd->cpupri, p, &mask))
+ if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
return;
- if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
- return;
+ if (p->rt.nr_cpus_allowed != 1
+ && cpupri_find(&rq->rd->cpupri, p, mask))
+ goto free;
+
+ if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
+ goto free;
/*
* There appears to be other cpus that can accept
*/
requeue_task_rt(rq, p, 1);
resched_task(rq->curr);
+ free:
+ free_cpumask_var(mask);
}
#endif /* CONFIG_SMP */
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
- (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
+ (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
(p->rt.nr_cpus_allowed > 1))
return 1;
return 0;
return next;
}
- static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
{
static int find_lowest_rq(struct task_struct *task)
{
struct sched_domain *sd;
- cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
+ struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
* I guess we might want to change cpupri_find() to ignore those
* in the first place.
*/
- cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+ cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
/*
* At this point we have built a mask of cpus representing the
* We prioritize the last cpu that the task executed on since
* it is most likely cache-hot in that location.
*/
- if (cpu_isset(cpu, *lowest_mask))
+ if (cpumask_test_cpu(cpu, lowest_mask))
return cpu;
/*
cpumask_t domain_mask;
int best_cpu;
- cpus_and(domain_mask, sd->span, *lowest_mask);
+ cpumask_and(&domain_mask, sched_domain_span(sd),
+ lowest_mask);
best_cpu = pick_optimal_cpu(this_cpu,
&domain_mask);
* Also make sure that it wasn't scheduled on its rq.
*/
if (unlikely(task_rq(task) != rq ||
- !cpu_isset(lowest_rq->cpu,
- task->cpus_allowed) ||
+ !cpumask_test_cpu(lowest_rq->cpu,
+ &task->cpus_allowed) ||
task_running(rq, task) ||
!task->se.on_rq)) {
next = pick_next_task_rt(this_rq);
- for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
+ for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
}
static void set_cpus_allowed_rt(struct task_struct *p,
- const cpumask_t *new_mask)
+ const struct cpumask *new_mask)
{
- int weight = cpus_weight(*new_mask);
+ int weight = cpumask_weight(new_mask);
BUG_ON(!rt_task(p));
update_rt_migration(rq);
}
- p->cpus_allowed = *new_mask;
+ cpumask_copy(&p->cpus_allowed, new_mask);
p->rt.nr_cpus_allowed = weight;
}
if (!rq->rt.rt_nr_running)
pull_rt_task(rq);
}
+
+ static inline void init_sched_rt_class(void)
+ {
+ unsigned int i;
+
+ for_each_possible_cpu(i)
+ alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
+ }
#endif /* CONFIG_SMP */
/*
rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */
+
rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
rq->sched_switch, rq->sched_count, rq->sched_goidle,
rq->ttwu_count, rq->ttwu_local,
- rq->rq_sched_info.cpu_time,
+ rq->rq_cpu_time,
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
seq_printf(seq, "\n");
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
- cpumask_scnprintf(mask_str, mask_len, sd->span);
+ cpumask_scnprintf(mask_str, mask_len,
+ sched_domain_span(sd));
seq_printf(seq, "domain%d %s", dcount++, mask_str);
for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
itype++) {
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{
if (rq)
- rq->rq_sched_info.cpu_time += delta;
+ rq->rq_cpu_time += delta;
}
static inline void
unsigned long long delta = task_rq(t)->clock -
t->sched_info.last_arrival;
- t->sched_info.cpu_time += delta;
rq_sched_info_depart(task_rq(t), delta);
if (t->state == TASK_RUNNING)
if (!ts->tick_stopped)
return;
- cpu_clear(cpu, nohz_cpu_mask);
+ cpumask_clear_cpu(cpu, nohz_cpu_mask);
now = ktime_get();
ts->idle_waketime = now;
if (need_resched())
goto end;
- if (unlikely(local_softirq_pending())) {
+ if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
static int ratelimit;
if (ratelimit < 10) {
/* Schedule the tick, if we are at least one jiffie off */
if ((long)delta_jiffies >= 1) {
+ /*
+ * calculate the expiry time for the next timer wheel
+ * timer
+ */
+ expires = ktime_add_ns(last_update, tick_period.tv64 *
+ delta_jiffies);
+
+ /*
+ * If this cpu is the one which updates jiffies, then
+ * give up the assignment and let it be taken by the
+ * cpu which runs the tick timer next, which might be
+ * this cpu as well. If we don't drop this here the
+ * jiffies might be stale and do_timer() never
+ * invoked.
+ */
+ if (cpu == tick_do_timer_cpu)
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+
if (delta_jiffies > 1)
- cpu_set(cpu, nohz_cpu_mask);
+ cpumask_set_cpu(cpu, nohz_cpu_mask);
+
+ /* Skip reprogram of event if its not changed */
+ if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
+ goto out;
+
/*
* nohz_stop_sched_tick can be called several times before
* the nohz_restart_sched_tick is called. This happens when
/*
* sched tick not stopped!
*/
- cpu_clear(cpu, nohz_cpu_mask);
+ cpumask_clear_cpu(cpu, nohz_cpu_mask);
goto out;
}
rcu_enter_nohz();
}
- /*
- * If this cpu is the one which updates jiffies, then
- * give up the assignment and let it be taken by the
- * cpu which runs the tick timer next, which might be
- * this cpu as well. If we don't drop this here the
- * jiffies might be stale and do_timer() never
- * invoked.
- */
- if (cpu == tick_do_timer_cpu)
- tick_do_timer_cpu = TICK_DO_TIMER_NONE;
-
ts->idle_sleeps++;
/*
goto out;
}
- /*
- * calculate the expiry time for the next timer wheel
- * timer
- */
- expires = ktime_add_ns(last_update, tick_period.tv64 *
- delta_jiffies);
+ /* Mark expiries */
ts->idle_expires = expires;
if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
* softirq.
*/
tick_do_update_jiffies64(ktime_get());
- cpu_clear(cpu, nohz_cpu_mask);
+ cpumask_clear_cpu(cpu, nohz_cpu_mask);
}
raise_softirq_irqoff(TIMER_SOFTIRQ);
out:
select_nohz_load_balancer(0);
now = ktime_get();
tick_do_update_jiffies64(now);
- cpu_clear(cpu, nohz_cpu_mask);
+ cpumask_clear_cpu(cpu, nohz_cpu_mask);
/*
* We stopped the tick in idle. Update process times would miss the
*/
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
ts->sched_timer.function = tick_sched_timer;
- ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
/* Get the next period (per cpu) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
#include <linux/gfp.h>
#include <linux/fs.h>
#include <linux/kprobes.h>
-#include <linux/seq_file.h>
#include <linux/writeback.h>
#include <linux/stacktrace.h>
"annotate",
"userstacktrace",
"sym-userobj",
+ "printk-msg-only",
NULL
};
memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
data->pid = tsk->pid;
- data->uid = tsk->uid;
+ data->uid = task_uid(tsk);
data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
data->policy = tsk->policy;
data->rt_priority = tsk->rt_priority;
ftrace_enable_cpu();
}
+void tracing_reset_online_cpus(struct trace_array *tr)
+{
+ int cpu;
+
+ tr->time_start = ftrace_now(tr->cpu);
+
+ for_each_online_cpu(cpu)
+ tracing_reset(tr, cpu);
+}
+
#define SAVED_CMDLINES 128
static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
TRACE_FILE_ANNOTATE = 2,
};
-static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
+static void trace_iterator_increment(struct trace_iterator *iter)
{
/* Don't allow ftrace to trace into the ring buffers */
ftrace_disable_cpu();
iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
if (iter->ent)
- trace_iterator_increment(iter, iter->cpu);
+ trace_iterator_increment(iter);
return iter->ent ? iter : NULL;
}
static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+static int task_state_char(unsigned long state)
+{
+ int bit = state ? __ffs(state) + 1 : 0;
+
+ return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
+}
+
/*
* The message is supposed to contain an ending newline.
* If the printing stops prematurely, try to add a newline of our own.
char *comm;
int S, T;
int i;
- unsigned state;
if (entry->type == TRACE_CONT)
return TRACE_TYPE_HANDLED;
trace_assign_type(field, entry);
- T = field->next_state < sizeof(state_to_char) ?
- state_to_char[field->next_state] : 'X';
-
- state = field->prev_state ?
- __ffs(field->prev_state) + 1 : 0;
- S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
+ T = task_state_char(field->next_state);
+ S = task_state_char(field->prev_state);
comm = trace_find_cmdline(field->next_pid);
trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
field->prev_pid,
trace_assign_type(field, entry);
- S = field->prev_state < sizeof(state_to_char) ?
- state_to_char[field->prev_state] : 'X';
- T = field->next_state < sizeof(state_to_char) ?
- state_to_char[field->next_state] : 'X';
+ T = task_state_char(field->next_state);
+ S = task_state_char(field->prev_state);
ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
field->prev_pid,
field->prev_prio,
trace_assign_type(field, entry);
- S = field->prev_state < sizeof(state_to_char) ?
- state_to_char[field->prev_state] : 'X';
- T = field->next_state < sizeof(state_to_char) ?
- state_to_char[field->next_state] : 'X';
- if (entry->type == TRACE_WAKE)
- S = '+';
+ T = task_state_char(field->next_state);
+ S = entry->type == TRACE_WAKE ? '+' :
+ task_state_char(field->prev_state);
ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
field->prev_pid,
field->prev_prio,
trace_assign_type(field, entry);
- S = field->prev_state < sizeof(state_to_char) ?
- state_to_char[field->prev_state] : 'X';
- T = field->next_state < sizeof(state_to_char) ?
- state_to_char[field->next_state] : 'X';
- if (entry->type == TRACE_WAKE)
- S = '+';
+ T = task_state_char(field->next_state);
+ S = entry->type == TRACE_WAKE ? '+' :
+ task_state_char(field->prev_state);
SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
SEQ_PUT_HEX_FIELD_RET(s, S);
return TRACE_TYPE_HANDLED;
}
+static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct print_entry *field;
+ int ret;
+
+ trace_assign_type(field, entry);
+
+ ret = trace_seq_printf(s, field->buf);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (entry->flags & TRACE_FLAG_CONT)
+ trace_seq_print_cont(s, iter);
+
+ return TRACE_TYPE_HANDLED;
+}
+
static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
return ret;
}
+ if (iter->ent->type == TRACE_PRINT &&
+ trace_flags & TRACE_ITER_PRINTK &&
+ trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ return print_printk_msg_only(iter);
+
if (trace_flags & TRACE_ITER_BIN)
return print_bin_fmt(iter);
/* Notify the tracer early; before we stop tracing. */
if (iter->trace && iter->trace->open)
- iter->trace->open(iter);
+ iter->trace->open(iter);
/* Annotate start of buffers if we had overruns */
if (ring_buffer_overruns(iter->tr->buffer))
mutex_lock(&tracing_cpumask_update_lock);
- len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
+ len = cpumask_scnprintf(mask_str, count, &tracing_cpumask);
if (count - len < 2) {
count = -EINVAL;
goto out_err;
int err, cpu;
mutex_lock(&tracing_cpumask_update_lock);
- err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
+ err = cpumask_parse_user(ubuf, count, &tracing_cpumask_new);
if (err)
goto err_unlock;
config LIBCRC32C
tristate "CRC32c (Castagnoli, et al) Cyclic Redundancy-Check"
+ select CRYPTO
+ select CRYPTO_CRC32C
help
This option is provided for the case where no in-kernel-tree
modules require CRC32c functions, but a module built outside the
config HAVE_LMB
boolean
+ config CPUMASK_OFFSTACK
+ bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
+ help
+ Use dynamic allocation for cpumask_var_t, instead of putting
+ them on the stack. This is a bit more expensive, but avoids
+ stack overflow.
+
endmenu
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/math64.h>
+#include <linux/fault-inject.h>
/*
* Lock order:
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif
+#define OO_SHIFT 16
+#define OO_MASK ((1 << OO_SHIFT) - 1)
+#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */
+
/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000 /* Poison object */
#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
* Tracking user of a slab.
*/
struct track {
- void *addr; /* Called from address */
+ unsigned long addr; /* Called from address */
int cpu; /* Was running on cpu */
int pid; /* Pid context */
unsigned long when; /* When did the operation occur */
unsigned long size)
{
struct kmem_cache_order_objects x = {
- (order << 16) + (PAGE_SIZE << order) / size
+ (order << OO_SHIFT) + (PAGE_SIZE << order) / size
};
return x;
static inline int oo_order(struct kmem_cache_order_objects x)
{
- return x.x >> 16;
+ return x.x >> OO_SHIFT;
}
static inline int oo_objects(struct kmem_cache_order_objects x)
{
- return x.x & ((1 << 16) - 1);
+ return x.x & OO_MASK;
}
#ifdef CONFIG_SLUB_DEBUG
}
static void set_track(struct kmem_cache *s, void *object,
- enum track_item alloc, void *addr)
+ enum track_item alloc, unsigned long addr)
{
struct track *p;
if (!(s->flags & SLAB_STORE_USER))
return;
- set_track(s, object, TRACK_FREE, NULL);
- set_track(s, object, TRACK_ALLOC, NULL);
+ set_track(s, object, TRACK_FREE, 0UL);
+ set_track(s, object, TRACK_ALLOC, 0UL);
}
static void print_track(const char *s, struct track *t)
return;
printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
- s, t->addr, jiffies - t->when, t->cpu, t->pid);
+ s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
}
static void print_tracking(struct kmem_cache *s, void *object)
if (!check_valid_pointer(s, page, get_freepointer(s, p))) {
object_err(s, page, p, "Freepointer corrupt");
/*
- * No choice but to zap it and thus loose the remainder
+ * No choice but to zap it and thus lose the remainder
* of the free objects in this slab. May cause
* another error because the object count is now wrong.
*/
}
max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
- if (max_objects > 65535)
- max_objects = 65535;
+ if (max_objects > MAX_OBJS_PER_PAGE)
+ max_objects = MAX_OBJS_PER_PAGE;
if (page->objects != max_objects) {
slab_err(s, page, "Wrong number of objects. Found %d but "
}
static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
- void *object, void *addr)
+ void *object, unsigned long addr)
{
if (!check_slab(s, page))
goto bad;
}
static int free_debug_processing(struct kmem_cache *s, struct page *page,
- void *object, void *addr)
+ void *object, unsigned long addr)
{
if (!check_slab(s, page))
goto fail;
struct page *page, void *object) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
- struct page *page, void *object, void *addr) { return 0; }
+ struct page *page, void *object, unsigned long addr) { return 0; }
static inline int free_debug_processing(struct kmem_cache *s,
- struct page *page, void *object, void *addr) { return 0; }
+ struct page *page, void *object, unsigned long addr) { return 0; }
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{ return 1; }
* we need to allocate a new slab. This is the slowest path since it involves
* a call to the page allocator and the setup of a new slab.
*/
-static void *__slab_alloc(struct kmem_cache *s,
- gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
+static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
+ unsigned long addr, struct kmem_cache_cpu *c)
{
void **object;
struct page *new;
* Otherwise we can simply pick the next object from the lockless free list.
*/
static __always_inline void *slab_alloc(struct kmem_cache *s,
- gfp_t gfpflags, int node, void *addr)
+ gfp_t gfpflags, int node, unsigned long addr)
{
void **object;
struct kmem_cache_cpu *c;
unsigned long flags;
unsigned int objsize;
+ might_sleep_if(gfpflags & __GFP_WAIT);
+
+ if (should_failslab(s->objsize, gfpflags))
+ return NULL;
+
local_irq_save(flags);
c = get_cpu_slab(s, smp_processor_id());
objsize = c->objsize;
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
- return slab_alloc(s, gfpflags, -1, __builtin_return_address(0));
+ return slab_alloc(s, gfpflags, -1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
- return slab_alloc(s, gfpflags, node, __builtin_return_address(0));
+ return slab_alloc(s, gfpflags, node, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
#endif
* handling required then we can return immediately.
*/
static void __slab_free(struct kmem_cache *s, struct page *page,
- void *x, void *addr, unsigned int offset)
+ void *x, unsigned long addr, unsigned int offset)
{
void *prior;
void **object = (void *)x;
* with all sorts of special processing.
*/
static __always_inline void slab_free(struct kmem_cache *s,
- struct page *page, void *x, void *addr)
+ struct page *page, void *x, unsigned long addr)
{
void **object = (void *)x;
struct kmem_cache_cpu *c;
page = virt_to_head_page(x);
- slab_free(s, page, x, __builtin_return_address(0));
+ slab_free(s, page, x, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
-/* Figure out on which slab object the object resides */
+/* Figure out on which slab page the object resides */
static struct page *get_object_page(const void *x)
{
struct page *page = virt_to_head_page(x);
int rem;
int min_order = slub_min_order;
- if ((PAGE_SIZE << min_order) / size > 65535)
- return get_order(size * 65535) - 1;
+ if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE)
+ return get_order(size * MAX_OBJS_PER_PAGE) - 1;
for (order = max(min_order,
fls(min_objects * size - 1) - PAGE_SHIFT);
* when allocating for the kmalloc_node_cache. This is used for bootstrapping
* memory on a fresh node that has no slab structures yet.
*/
-static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
- int node)
+static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
{
struct page *page;
struct kmem_cache_node *n;
local_irq_save(flags);
add_partial(n, page, 0);
local_irq_restore(flags);
- return n;
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
n = &s->local_node;
else {
if (slab_state == DOWN) {
- n = early_kmem_cache_node_alloc(gfpflags,
- node);
+ early_kmem_cache_node_alloc(gfpflags, node);
continue;
}
n = kmem_cache_alloc_node(kmalloc_caches,
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- return slab_alloc(s, flags, -1, __builtin_return_address(0));
+ return slab_alloc(s, flags, -1, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc);
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- return slab_alloc(s, flags, node, __builtin_return_address(0));
+ return slab_alloc(s, flags, node, _RET_IP_);
}
EXPORT_SYMBOL(__kmalloc_node);
#endif
put_page(page);
return;
}
- slab_free(page->slab, page, object, __builtin_return_address(0));
+ slab_free(page->slab, page, object, _RET_IP_);
}
EXPORT_SYMBOL(kfree);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
up_write(&slub_lock);
- if (sysfs_slab_alias(s, name))
+ if (sysfs_slab_alias(s, name)) {
+ down_write(&slub_lock);
+ s->refcount--;
+ up_write(&slub_lock);
goto err;
+ }
return s;
}
size, align, flags, ctor)) {
list_add(&s->list, &slab_caches);
up_write(&slub_lock);
- if (sysfs_slab_add(s))
+ if (sysfs_slab_add(s)) {
+ down_write(&slub_lock);
+ list_del(&s->list);
+ up_write(&slub_lock);
+ kfree(s);
goto err;
+ }
return s;
}
kfree(s);
#endif
-void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
+void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
{
struct kmem_cache *s;
}
void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
- int node, void *caller)
+ int node, unsigned long caller)
{
struct kmem_cache *s;
struct location {
unsigned long count;
- void *addr;
+ unsigned long addr;
long long sum_time;
long min_time;
long max_time;
{
long start, end, pos;
struct location *l;
- void *caddr;
+ unsigned long caddr;
unsigned long age = jiffies - track->when;
start = -1;
len < PAGE_SIZE - 60) {
len += sprintf(buf + len, " cpus=");
len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
- l->cpus);
+ &l->cpus);
}
if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
/*
* Need to buffer aliases during bootup until sysfs becomes
- * available lest we loose that information.
+ * available lest we lose that information.
*/
struct saved_alias {
struct kmem_cache *s;