Pileus Git - ~andy/linux/commitdiff
Merge branch 'linus' into timers/core
author Thomas Gleixner <tglx@linutronix.de>
Wed, 24 Apr 2013 18:33:46 +0000 (20:33 +0200)
committer Thomas Gleixner <tglx@linutronix.de>
Wed, 24 Apr 2013 18:33:54 +0000 (20:33 +0200)
Reason: Get upstream fixes before adding conflicting code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
arch/x86/Kconfig
arch/x86/platform/efi/efi.c
include/linux/sched.h
kernel/hrtimer.c
kernel/time/tick-broadcast.c

diff --combined arch/x86/Kconfig
index 26bd7926153219b01581e932eab565d63c630114,15b5cef4aa3857a386cb77cffd2ba74243b532c7..9f74f523dfc66c84c097351f59d0d0bbe2fdad8f
@@@ -112,7 -112,7 +112,7 @@@ config X8
        select GENERIC_STRNLEN_USER
        select HAVE_CONTEXT_TRACKING if X86_64
        select HAVE_IRQ_TIME_ACCOUNTING
-       select HAVE_VIRT_TO_BUS
+       select VIRT_TO_BUS
        select MODULES_USE_ELF_REL if X86_32
        select MODULES_USE_ELF_RELA if X86_64
        select CLONE_BACKWARDS if X86_32
        select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
        select OLD_SIGACTION if X86_32
        select COMPAT_OLD_SIGACTION if IA32_EMULATION
 +      select RTC_LIB
  
  config INSTRUCTION_DECODER
        def_bool y
@@@ -1550,6 -1549,7 +1550,7 @@@ config X86_SMA
  config EFI
        bool "EFI runtime service support"
        depends on ACPI
+       select UCS2_STRING
        ---help---
          This enables the kernel to use EFI runtime services that are
          available (such as the EFI variable services).
diff --combined arch/x86/platform/efi/efi.c
index 28d9efacc9b682999a5463a6dffa9f28fca0580c,e4a86a677ce163ec5f911fb14db65590d813aee1..b55d174e503446fe6cc47ea34df463f5ce171c5f
@@@ -41,6 -41,7 +41,7 @@@
  #include <linux/io.h>
  #include <linux/reboot.h>
  #include <linux/bcd.h>
+ #include <linux/ucs2_string.h>
  
  #include <asm/setup.h>
  #include <asm/efi.h>
  #include <asm/cacheflush.h>
  #include <asm/tlbflush.h>
  #include <asm/x86_init.h>
 +#include <asm/rtc.h>
  
  #define EFI_DEBUG     1
  
+ /*
+  * There's some additional metadata associated with each
+  * variable. Intel's reference implementation is 60 bytes - bump that
+  * to account for potential alignment constraints
+  */
+ #define VAR_METADATA_SIZE 64
  struct efi __read_mostly efi = {
        .mps        = EFI_INVALID_TABLE_ADDR,
        .acpi       = EFI_INVALID_TABLE_ADDR,
@@@ -70,6 -77,13 +78,13 @@@ struct efi_memory_map memmap
  static struct efi efi_phys __initdata;
  static efi_system_table_t efi_systab __initdata;
  
+ static u64 efi_var_store_size;
+ static u64 efi_var_remaining_size;
+ static u64 efi_var_max_var_size;
+ static u64 boot_used_size;
+ static u64 boot_var_size;
+ static u64 active_size;
  unsigned long x86_efi_facility;
  
  /*
@@@ -99,6 -113,15 +114,15 @@@ static int __init setup_add_efi_memmap(
  }
  early_param("add_efi_memmap", setup_add_efi_memmap);
  
+ static bool efi_no_storage_paranoia;
+ static int __init setup_storage_paranoia(char *arg)
+ {
+       efi_no_storage_paranoia = true;
+       return 0;
+ }
+ early_param("efi_no_storage_paranoia", setup_storage_paranoia);
  
  static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
  {
@@@ -163,8 -186,53 +187,53 @@@ static efi_status_t virt_efi_get_next_v
                                               efi_char16_t *name,
                                               efi_guid_t *vendor)
  {
-       return efi_call_virt3(get_next_variable,
-                             name_size, name, vendor);
+       efi_status_t status;
+       static bool finished = false;
+       static u64 var_size;
+       status = efi_call_virt3(get_next_variable,
+                               name_size, name, vendor);
+       if (status == EFI_NOT_FOUND) {
+               finished = true;
+               if (var_size < boot_used_size) {
+                       boot_var_size = boot_used_size - var_size;
+                       active_size += boot_var_size;
+               } else {
+                       printk(KERN_WARNING FW_BUG  "efi: Inconsistent initial sizes\n");
+               }
+       }
+       if (boot_used_size && !finished) {
+               unsigned long size;
+               u32 attr;
+               efi_status_t s;
+               void *tmp;
+               s = virt_efi_get_variable(name, vendor, &attr, &size, NULL);
+               if (s != EFI_BUFFER_TOO_SMALL || !size)
+                       return status;
+               tmp = kmalloc(size, GFP_ATOMIC);
+               if (!tmp)
+                       return status;
+               s = virt_efi_get_variable(name, vendor, &attr, &size, tmp);
+               if (s == EFI_SUCCESS && (attr & EFI_VARIABLE_NON_VOLATILE)) {
+                       var_size += size;
+                       var_size += ucs2_strsize(name, 1024);
+                       active_size += size;
+                       active_size += VAR_METADATA_SIZE;
+                       active_size += ucs2_strsize(name, 1024);
+               }
+               kfree(tmp);
+       }
+       return status;
  }
  
  static efi_status_t virt_efi_set_variable(efi_char16_t *name,
                                          efi_guid_t *vendor,
                                          u32 attr,
                                          unsigned long data_size,
                                          void *data)
  {
-       return efi_call_virt5(set_variable,
-                             name, vendor, attr,
-                             data_size, data);
+       efi_status_t status;
+       u32 orig_attr = 0;
+       unsigned long orig_size = 0;
+       status = virt_efi_get_variable(name, vendor, &orig_attr, &orig_size,
+                                      NULL);
+       if (status != EFI_BUFFER_TOO_SMALL)
+               orig_size = 0;
+       status = efi_call_virt5(set_variable,
+                               name, vendor, attr,
+                               data_size, data);
+       if (status == EFI_SUCCESS) {
+               if (orig_size) {
+                       active_size -= orig_size;
+                       active_size -= ucs2_strsize(name, 1024);
+                       active_size -= VAR_METADATA_SIZE;
+               }
+               if (data_size) {
+                       active_size += data_size;
+                       active_size += ucs2_strsize(name, 1024);
+                       active_size += VAR_METADATA_SIZE;
+               }
+       }
+       return status;
  }
  
  static efi_status_t virt_efi_query_variable_info(u32 attr,
@@@ -259,10 -352,10 +353,10 @@@ static efi_status_t __init phys_efi_get
  
  int efi_set_rtc_mmss(unsigned long nowtime)
  {
 -      int real_seconds, real_minutes;
        efi_status_t    status;
        efi_time_t      eft;
        efi_time_cap_t  cap;
 +      struct rtc_time tm;
  
        status = efi.get_time(&eft, &cap);
        if (status != EFI_SUCCESS) {
                return -1;
        }
  
 -      real_seconds = nowtime % 60;
 -      real_minutes = nowtime / 60;
 -      if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
 -              real_minutes += 30;
 -      real_minutes %= 60;
 -      eft.minute = real_minutes;
 -      eft.second = real_seconds;
 +      rtc_time_to_tm(nowtime, &tm);
 +      if (!rtc_valid_tm(&tm)) {
 +              eft.year = tm.tm_year + 1900;
 +              eft.month = tm.tm_mon + 1;
 +              eft.day = tm.tm_mday;
 +              eft.minute = tm.tm_min;
 +              eft.second = tm.tm_sec;
 +              eft.nanosecond = 0;
 +      } else {
 +              printk(KERN_ERR
 +                     "%s: Invalid EFI RTC value: write of %lx to EFI RTC failed\n",
 +                     __FUNCTION__, nowtime);
 +              return -1;
 +      }
  
        status = efi.set_time(&eft);
        if (status != EFI_SUCCESS) {
@@@ -690,6 -776,9 +784,9 @@@ void __init efi_init(void
        char vendor[100] = "unknown";
        int i = 0;
        void *tmp;
+       struct setup_data *data;
+       struct efi_var_bootdata *efi_var_data;
+       u64 pa_data;
  
  #ifdef CONFIG_X86_32
        if (boot_params.efi_info.efi_systab_hi ||
        if (efi_systab_init(efi_phys.systab))
                return;
  
+       pa_data = boot_params.hdr.setup_data;
+       while (pa_data) {
+               data = early_ioremap(pa_data, sizeof(*efi_var_data));
+               if (data->type == SETUP_EFI_VARS) {
+                       efi_var_data = (struct efi_var_bootdata *)data;
+                       efi_var_store_size = efi_var_data->store_size;
+                       efi_var_remaining_size = efi_var_data->remaining_size;
+                       efi_var_max_var_size = efi_var_data->max_var_size;
+               }
+               pa_data = data->next;
+               early_iounmap(data, sizeof(*efi_var_data));
+       }
+       boot_used_size = efi_var_store_size - efi_var_remaining_size;
        set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility);
  
        /*
@@@ -1007,3 -1112,48 +1120,48 @@@ u64 efi_mem_attributes(unsigned long ph
        }
        return 0;
  }
+ /*
+  * Some firmware has serious problems when using more than 50% of the EFI
+  * variable store, i.e. it triggers bugs that can brick machines. Ensure that
+  * we never use more than this safe limit.
+  *
+  * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable
+  * store.
+  */
+ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
+ {
+       efi_status_t status;
+       u64 storage_size, remaining_size, max_size;
+       status = efi.query_variable_info(attributes, &storage_size,
+                                        &remaining_size, &max_size);
+       if (status != EFI_SUCCESS)
+               return status;
+       if (!max_size && remaining_size > size)
+               printk_once(KERN_ERR FW_BUG "Broken EFI implementation"
+                           " is returning MaxVariableSize=0\n");
+       /*
+        * Some firmware implementations refuse to boot if there's insufficient
+        * space in the variable store. We account for that by refusing the
+        * write if permitting it would reduce the available space to under
+        * 50%. However, some firmware won't reclaim variable space until
+        * after the used (not merely the actively used) space drops below
+        * a threshold. We can approximate that case with the value calculated
+        * above. If both the firmware and our calculations indicate that the
+        * available space would drop below 50%, refuse the write.
+        */
+       if (!storage_size || size > remaining_size ||
+           (max_size && size > max_size))
+               return EFI_OUT_OF_RESOURCES;
+       if (!efi_no_storage_paranoia &&
+           ((active_size + size + VAR_METADATA_SIZE > storage_size / 2) &&
+            (remaining_size - size < storage_size / 2)))
+               return EFI_OUT_OF_RESOURCES;
+       return EFI_SUCCESS;
+ }
+ EXPORT_SYMBOL_GPL(efi_query_variable_store);
diff --combined include/linux/sched.h
index d13341b550967a0e7fa43663921cf23136fd4685,e692a022527bdaaace8b388268b0c280946fc5c2..78694315c1b4301c89a69d6e1bd74d3f908207e5
@@@ -163,9 -163,10 +163,10 @@@ print_cfs_rq(struct seq_file *m, int cp
  #define TASK_DEAD             64
  #define TASK_WAKEKILL         128
  #define TASK_WAKING           256
- #define TASK_STATE_MAX                512
+ #define TASK_PARKED           512
+ #define TASK_STATE_MAX                1024
  
- #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
+ #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
  
  extern char ___assert_task_state[1 - 2*!!(
                sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
@@@ -526,8 -527,7 +527,8 @@@ struct signal_struct 
        unsigned int            has_child_subreaper:1;
  
        /* POSIX.1b Interval Timers */
 -      struct list_head posix_timers;
 +      int                     posix_timer_id;
 +      struct list_head        posix_timers;
  
        /* ITIMER_REAL timer for the process */
        struct hrtimer real_timer;
diff --combined kernel/hrtimer.c
index c0875ae0de1791239fdc6fadcc1a7f6c8ab09729,14be27feda491da1c3dc9990a5ae80ce649570aa..609d8ff38b745c7d02b76b69be09b82176050bf6
@@@ -63,6 -63,7 +63,7 @@@
  DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
  {
  
+       .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
        .clock_base =
        {
                {
                        .get_time = &ktime_get_boottime,
                        .resolution = KTIME_LOW_RES,
                },
 +              {
 +                      .index = HRTIMER_BASE_TAI,
 +                      .clockid = CLOCK_TAI,
 +                      .get_time = &ktime_get_clocktai,
 +                      .resolution = KTIME_LOW_RES,
 +              },
        }
  };
  
@@@ -96,7 -91,6 +97,7 @@@ static const int hrtimer_clock_to_base_
        [CLOCK_REALTIME]        = HRTIMER_BASE_REALTIME,
        [CLOCK_MONOTONIC]       = HRTIMER_BASE_MONOTONIC,
        [CLOCK_BOOTTIME]        = HRTIMER_BASE_BOOTTIME,
 +      [CLOCK_TAI]             = HRTIMER_BASE_TAI,
  };
  
  static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@@ -113,10 -107,8 +114,10 @@@ static void hrtimer_get_softirq_time(st
  {
        ktime_t xtim, mono, boot;
        struct timespec xts, tom, slp;
 +      s32 tai_offset;
  
        get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
 +      tai_offset = timekeeping_get_tai_offset();
  
        xtim = timespec_to_ktime(xts);
        mono = ktime_add(xtim, timespec_to_ktime(tom));
        base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
        base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
        base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
 +      base->clock_base[HRTIMER_BASE_TAI].softirq_time =
 +                              ktime_add(xtim, ktime_set(tai_offset, 0));
  }
  
  /*
@@@ -286,10 -276,6 +287,10 @@@ ktime_t ktime_add_ns(const ktime_t kt, 
        } else {
                unsigned long rem = do_div(nsec, NSEC_PER_SEC);
  
 +              /* Make sure nsec fits into long */
 +              if (unlikely(nsec > KTIME_SEC_MAX))
 +                      return (ktime_t){ .tv64 = KTIME_MAX };
 +
                tmp = ktime_set((long)nsec, rem);
        }
  
@@@ -666,9 -652,8 +667,9 @@@ static inline ktime_t hrtimer_update_ba
  {
        ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
        ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
 +      ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
  
 -      return ktime_get_update_offsets(offs_real, offs_boot);
 +      return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
  }
  
  /*
@@@ -1026,8 -1011,7 +1027,8 @@@ int __hrtimer_start_range_ns(struct hrt
   * @timer:    the timer to be added
   * @tim:      expiry time
   * @delta_ns: "slack" range for the timer
 - * @mode:     expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
 + * @mode:     expiry mode: absolute (HRTIMER_MODE_ABS) or
 + *            relative (HRTIMER_MODE_REL)
   *
   * Returns:
   *  0 on success
@@@ -1044,8 -1028,7 +1045,8 @@@ EXPORT_SYMBOL_GPL(hrtimer_start_range_n
   * hrtimer_start - (re)start an hrtimer on the current CPU
   * @timer:    the timer to be added
   * @tim:      expiry time
 - * @mode:     expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
 + * @mode:     expiry mode: absolute (HRTIMER_MODE_ABS) or
 + *            relative (HRTIMER_MODE_REL)
   *
   * Returns:
   *  0 on success
@@@ -1327,8 -1310,6 +1328,8 @@@ retry
  
                                expires = ktime_sub(hrtimer_get_expires(timer),
                                                    base->offset);
 +                              if (expires.tv64 < 0)
 +                                      expires.tv64 = KTIME_MAX;
                                if (expires.tv64 < expires_next.tv64)
                                        expires_next = expires;
                                break;
@@@ -1662,8 -1643,6 +1663,6 @@@ static void __cpuinit init_hrtimers_cpu
        struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
        int i;
  
-       raw_spin_lock_init(&cpu_base->lock);
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                cpu_base->clock_base[i].cpu_base = cpu_base;
                timerqueue_init_head(&cpu_base->clock_base[i].active);
diff --combined kernel/time/tick-broadcast.c
index f8d2109ef0a25db2a3d4152b4d301e3534ce58ff,7f32fe0e52cd46489c8d90e4b85f9d74204aab16..6e23fde83dbeb21faeca1d5500003cd7e1a373e0
@@@ -28,8 -28,9 +28,8 @@@
   */
  
  static struct tick_device tick_broadcast_device;
 -/* FIXME: Use cpumask_var_t. */
 -static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
 -static DECLARE_BITMAP(tmpmask, NR_CPUS);
 +static cpumask_var_t tick_broadcast_mask;
 +static cpumask_var_t tmpmask;
  static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
  static int tick_broadcast_force;
  
@@@ -49,7 -50,7 +49,7 @@@ struct tick_device *tick_get_broadcast_
  
  struct cpumask *tick_get_broadcast_mask(void)
  {
 -      return to_cpumask(tick_broadcast_mask);
 +      return tick_broadcast_mask;
  }
  
  /*
@@@ -66,25 -67,16 +66,26 @@@ static void tick_broadcast_start_period
   */
  int tick_check_broadcast_device(struct clock_event_device *dev)
  {
-       if ((tick_broadcast_device.evtdev &&
+       if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
+           (tick_broadcast_device.evtdev &&
             tick_broadcast_device.evtdev->rating >= dev->rating) ||
             (dev->features & CLOCK_EVT_FEAT_C3STOP))
                return 0;
  
        clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
        tick_broadcast_device.evtdev = dev;
 -      if (!cpumask_empty(tick_get_broadcast_mask()))
 +      if (!cpumask_empty(tick_broadcast_mask))
                tick_broadcast_start_periodic(dev);
 +      /*
 +       * Inform all cpus about this. We might be in a situation
 +       * where we did not switch to oneshot mode because the per cpu
 +       * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack
 +       * of a oneshot capable broadcast device. Without that
 +       * notification the system stays stuck in periodic mode
 +       * forever.
 +       */
 +      if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
 +              tick_clock_notify();
        return 1;
  }
  
@@@ -132,7 -124,7 +133,7 @@@ int tick_device_uses_broadcast(struct c
        if (!tick_device_is_functional(dev)) {
                dev->event_handler = tick_handle_periodic;
                tick_device_setup_broadcast_func(dev);
 -              cpumask_set_cpu(cpu, tick_get_broadcast_mask());
 +              cpumask_set_cpu(cpu, tick_broadcast_mask);
                tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
                ret = 1;
        } else {
                 */
                if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
                        int cpu = smp_processor_id();
 -                      cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
 +                      cpumask_clear_cpu(cpu, tick_broadcast_mask);
                        tick_broadcast_clear_oneshot(cpu);
                } else {
                        tick_device_setup_broadcast_func(dev);
@@@ -207,8 -199,9 +208,8 @@@ static void tick_do_periodic_broadcast(
  {
        raw_spin_lock(&tick_broadcast_lock);
  
 -      cpumask_and(to_cpumask(tmpmask),
 -                  cpu_online_mask, tick_get_broadcast_mask());
 -      tick_do_broadcast(to_cpumask(tmpmask));
 +      cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
 +      tick_do_broadcast(tmpmask);
  
        raw_spin_unlock(&tick_broadcast_lock);
  }
@@@ -271,12 -264,13 +272,12 @@@ static void tick_do_broadcast_on_off(un
        if (!tick_device_is_functional(dev))
                goto out;
  
 -      bc_stopped = cpumask_empty(tick_get_broadcast_mask());
 +      bc_stopped = cpumask_empty(tick_broadcast_mask);
  
        switch (*reason) {
        case CLOCK_EVT_NOTIFY_BROADCAST_ON:
        case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
 -              if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
 -                      cpumask_set_cpu(cpu, tick_get_broadcast_mask());
 +              if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
                        if (tick_broadcast_device.mode ==
                            TICKDEV_MODE_PERIODIC)
                                clockevents_shutdown(dev);
                break;
        case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
                if (!tick_broadcast_force &&
 -                  cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
 -                      cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
 +                  cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
                        if (tick_broadcast_device.mode ==
                            TICKDEV_MODE_PERIODIC)
                                tick_setup_periodic(dev, 0);
                break;
        }
  
 -      if (cpumask_empty(tick_get_broadcast_mask())) {
 +      if (cpumask_empty(tick_broadcast_mask)) {
                if (!bc_stopped)
                        clockevents_shutdown(bc);
        } else if (bc_stopped) {
@@@ -343,10 -338,10 +344,10 @@@ void tick_shutdown_broadcast(unsigned i
        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
  
        bc = tick_broadcast_device.evtdev;
 -      cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
 +      cpumask_clear_cpu(cpu, tick_broadcast_mask);
  
        if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
 -              if (bc && cpumask_empty(tick_get_broadcast_mask()))
 +              if (bc && cpumask_empty(tick_broadcast_mask))
                        clockevents_shutdown(bc);
        }
  
@@@ -382,13 -377,13 +383,13 @@@ int tick_resume_broadcast(void
  
                switch (tick_broadcast_device.mode) {
                case TICKDEV_MODE_PERIODIC:
 -                      if (!cpumask_empty(tick_get_broadcast_mask()))
 +                      if (!cpumask_empty(tick_broadcast_mask))
                                tick_broadcast_start_periodic(bc);
                        broadcast = cpumask_test_cpu(smp_processor_id(),
 -                                                   tick_get_broadcast_mask());
 +                                                   tick_broadcast_mask);
                        break;
                case TICKDEV_MODE_ONESHOT:
 -                      if (!cpumask_empty(tick_get_broadcast_mask()))
 +                      if (!cpumask_empty(tick_broadcast_mask))
                                broadcast = tick_resume_broadcast_oneshot(bc);
                        break;
                }
  
  #ifdef CONFIG_TICK_ONESHOT
  
 -/* FIXME: use cpumask_var_t. */
 -static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
 +static cpumask_var_t tick_broadcast_oneshot_mask;
 +static cpumask_var_t tick_broadcast_pending_mask;
 +static cpumask_var_t tick_broadcast_force_mask;
  
  /*
   * Exposed for debugging: see timer_list.c
   */
  struct cpumask *tick_get_broadcast_oneshot_mask(void)
  {
 -      return to_cpumask(tick_broadcast_oneshot_mask);
 +      return tick_broadcast_oneshot_mask;
  }
  
 -static int tick_broadcast_set_event(ktime_t expires, int force)
 +/*
 + * Called before going idle with interrupts disabled. Checks whether a
 + * broadcast event from the other core is about to happen. We detected
 + * that in tick_broadcast_oneshot_control(). The callsite can use this
 + * to avoid a deep idle transition as we are about to get the
 + * broadcast IPI right away.
 + */
 +int tick_check_broadcast_expired(void)
  {
 -      struct clock_event_device *bc = tick_broadcast_device.evtdev;
 +      return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
 +}
 +
 +/*
 + * Set broadcast interrupt affinity
 + */
 +static void tick_broadcast_set_affinity(struct clock_event_device *bc,
 +                                      const struct cpumask *cpumask)
 +{
 +      if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ))
 +              return;
 +
 +      if (cpumask_equal(bc->cpumask, cpumask))
 +              return;
 +
 +      bc->cpumask = cpumask;
 +      irq_set_affinity(bc->irq, bc->cpumask);
 +}
 +
 +static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
 +                                  ktime_t expires, int force)
 +{
 +      int ret;
  
        if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
                clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
  
 -      return clockevents_program_event(bc, expires, force);
 +      ret = clockevents_program_event(bc, expires, force);
 +      if (!ret)
 +              tick_broadcast_set_affinity(bc, cpumask_of(cpu));
 +      return ret;
  }
  
  int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
   */
  void tick_check_oneshot_broadcast(int cpu)
  {
 -      if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
 +      if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
                struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
  
                clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@@ -481,39 -443,27 +482,39 @@@ static void tick_handle_oneshot_broadca
  {
        struct tick_device *td;
        ktime_t now, next_event;
 -      int cpu;
 +      int cpu, next_cpu = 0;
  
        raw_spin_lock(&tick_broadcast_lock);
  again:
        dev->next_event.tv64 = KTIME_MAX;
        next_event.tv64 = KTIME_MAX;
 -      cpumask_clear(to_cpumask(tmpmask));
 +      cpumask_clear(tmpmask);
        now = ktime_get();
        /* Find all expired events */
 -      for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
 +      for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
                td = &per_cpu(tick_cpu_device, cpu);
 -              if (td->evtdev->next_event.tv64 <= now.tv64)
 -                      cpumask_set_cpu(cpu, to_cpumask(tmpmask));
 -              else if (td->evtdev->next_event.tv64 < next_event.tv64)
 +              if (td->evtdev->next_event.tv64 <= now.tv64) {
 +                      cpumask_set_cpu(cpu, tmpmask);
 +                      /*
 +                       * Mark the remote cpu in the pending mask, so
 +                       * it can avoid reprogramming the cpu local
 +                       * timer in tick_broadcast_oneshot_control().
 +                       */
 +                      cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
 +              } else if (td->evtdev->next_event.tv64 < next_event.tv64) {
                        next_event.tv64 = td->evtdev->next_event.tv64;
 +                      next_cpu = cpu;
 +              }
        }
  
 +      /* Take care of enforced broadcast requests */
 +      cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
 +      cpumask_clear(tick_broadcast_force_mask);
 +
        /*
         * Wakeup the cpus which have an expired event.
         */
 -      tick_do_broadcast(to_cpumask(tmpmask));
 +      tick_do_broadcast(tmpmask);
  
        /*
         * Two reasons for reprogram:
                 * Rearm the broadcast device. If event expired,
                 * repeat the above
                 */
 -              if (tick_broadcast_set_event(next_event, 0))
 +              if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
                        goto again;
        }
        raw_spin_unlock(&tick_broadcast_lock);
@@@ -545,7 -495,6 +546,7 @@@ void tick_broadcast_oneshot_control(uns
        struct clock_event_device *bc, *dev;
        struct tick_device *td;
        unsigned long flags;
 +      ktime_t now;
        int cpu;
  
        /*
  
        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
        if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
 -              if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
 -                      cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
 +              WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
 +              if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
                        clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
 -                      if (dev->next_event.tv64 < bc->next_event.tv64)
 -                              tick_broadcast_set_event(dev->next_event, 1);
 +                      /*
 +                       * We only reprogram the broadcast timer if we
 +                       * did not mark ourself in the force mask and
 +                       * if the cpu local event is earlier than the
 +                       * broadcast event. If the current CPU is in
 +                       * the force mask, then we are going to be
 +                       * woken by the IPI right away.
 +                       */
 +                      if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
 +                          dev->next_event.tv64 < bc->next_event.tv64)
 +                              tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
                }
        } else {
 -              if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
 -                      cpumask_clear_cpu(cpu,
 -                                        tick_get_broadcast_oneshot_mask());
 +              if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
                        clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
 -                      if (dev->next_event.tv64 != KTIME_MAX)
 -                              tick_program_event(dev->next_event, 1);
 +                      if (dev->next_event.tv64 == KTIME_MAX)
 +                              goto out;
 +                      /*
 +                       * The cpu which was handling the broadcast
 +                       * timer marked this cpu in the broadcast
 +                       * pending mask and fired the broadcast
 +                       * IPI. So we are going to handle the expired
 +                       * event anyway via the broadcast IPI
 +                       * handler. No need to reprogram the timer
 +                       * with an already expired event.
 +                       */
 +                      if (cpumask_test_and_clear_cpu(cpu,
 +                                     tick_broadcast_pending_mask))
 +                              goto out;
 +
 +                      /*
 +                       * If the pending bit is not set, then we are
 +                       * either the CPU handling the broadcast
 +                       * interrupt or we got woken by something else.
 +                       *
 +                       * We are no longer in the broadcast mask, so
 +                       * if the cpu local expiry time is already
 +                       * reached, we would reprogram the cpu local
 +                       * timer with an already expired event.
 +                       *
 +                       * This can lead to a ping-pong when we return
 +                       * to idle and therefore rearm the broadcast
 +                       * timer before the cpu local timer was able
 +                       * to fire. This happens because the forced
 +                       * reprogramming makes sure that the event
 +                       * will happen in the future and depending on
 +                       * the min_delta setting this might be far
 +                       * enough out that the ping-pong starts.
 +                       *
 +                       * If the cpu local next_event has expired
 +                       * then we know that the broadcast timer
 +                       * next_event has expired as well and
 +                       * broadcast is about to be handled. So we
 +                       * avoid reprogramming and enforce that the
 +                       * broadcast handler, which did not run yet,
 +                       * will invoke the cpu local handler.
 +                       *
 +                       * We cannot call the handler directly from
 +                       * here, because we might be in a NOHZ phase
 +                       * and we did not go through the irq_enter()
 +                       * nohz fixups.
 +                       */
 +                      now = ktime_get();
 +                      if (dev->next_event.tv64 <= now.tv64) {
 +                              cpumask_set_cpu(cpu, tick_broadcast_force_mask);
 +                              goto out;
 +                      }
 +                      /*
 +                       * We got woken by something else. Reprogram
 +                       * the cpu local timer device.
 +                       */
 +                      tick_program_event(dev->next_event, 1);
                }
        }
 +out:
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
  }
  
   */
  static void tick_broadcast_clear_oneshot(int cpu)
  {
 -      cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
 +      cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
  }
  
  static void tick_broadcast_init_next_event(struct cpumask *mask,
@@@ -696,16 -582,17 +697,16 @@@ void tick_broadcast_setup_oneshot(struc
                 * oneshot_mask bits for those and program the
                 * broadcast device to fire.
                 */
 -              cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
 -              cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
 -              cpumask_or(tick_get_broadcast_oneshot_mask(),
 -                         tick_get_broadcast_oneshot_mask(),
 -                         to_cpumask(tmpmask));
 +              cpumask_copy(tmpmask, tick_broadcast_mask);
 +              cpumask_clear_cpu(cpu, tmpmask);
 +              cpumask_or(tick_broadcast_oneshot_mask,
 +                         tick_broadcast_oneshot_mask, tmpmask);
  
 -              if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
 +              if (was_periodic && !cpumask_empty(tmpmask)) {
                        clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
 -                      tick_broadcast_init_next_event(to_cpumask(tmpmask),
 +                      tick_broadcast_init_next_event(tmpmask,
                                                       tick_next_period);
 -                      tick_broadcast_set_event(tick_next_period, 1);
 +                      tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
                } else
                        bc->next_event.tv64 = KTIME_MAX;
        } else {
@@@ -753,7 -640,7 +754,7 @@@ void tick_shutdown_broadcast_oneshot(un
         * Clear the broadcast mask flag for the dead cpu, but do not
         * stop the broadcast device!
         */
 -      cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
 +      cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
  
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
  }
@@@ -777,14 -664,3 +778,14 @@@ bool tick_broadcast_oneshot_available(v
  }
  
  #endif
 +
 +void __init tick_broadcast_init(void)
 +{
 +      alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
 +      alloc_cpumask_var(&tmpmask, GFP_NOWAIT);
 +#ifdef CONFIG_TICK_ONESHOT
 +      alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
 +      alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
 +      alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
 +#endif
 +}