Pileus Git - ~andy/linux/commitdiff
Merge branch 'linus' into timers/core
author Thomas Gleixner <tglx@linutronix.de>
Wed, 24 Apr 2013 18:33:46 +0000 (20:33 +0200)
committer Thomas Gleixner <tglx@linutronix.de>
Wed, 24 Apr 2013 18:33:54 +0000 (20:33 +0200)
Reason: Get upstream fixes before adding conflicting code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
arch/x86/Kconfig
arch/x86/platform/efi/efi.c
include/linux/sched.h
kernel/hrtimer.c
kernel/time/tick-broadcast.c

diff --combined arch/x86/Kconfig
index 26bd7926153219b01581e932eab565d63c630114,15b5cef4aa3857a386cb77cffd2ba74243b532c7..9f74f523dfc66c84c097351f59d0d0bbe2fdad8f
@@@ -112,7 -112,7 +112,7 @@@ config X8
        select GENERIC_STRNLEN_USER
        select HAVE_CONTEXT_TRACKING if X86_64
        select HAVE_IRQ_TIME_ACCOUNTING
-       select HAVE_VIRT_TO_BUS
+       select VIRT_TO_BUS
        select MODULES_USE_ELF_REL if X86_32
        select MODULES_USE_ELF_RELA if X86_64
        select CLONE_BACKWARDS if X86_32
        select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
        select OLD_SIGACTION if X86_32
        select COMPAT_OLD_SIGACTION if IA32_EMULATION
 +      select RTC_LIB
  
  config INSTRUCTION_DECODER
        def_bool y
@@@ -1550,6 -1549,7 +1550,7 @@@ config X86_SMA
  config EFI
        bool "EFI runtime service support"
        depends on ACPI
+       select UCS2_STRING
        ---help---
          This enables the kernel to use EFI runtime services that are
          available (such as the EFI variable services).
diff --combined arch/x86/platform/efi/efi.c
index 28d9efacc9b682999a5463a6dffa9f28fca0580c,e4a86a677ce163ec5f911fb14db65590d813aee1..b55d174e503446fe6cc47ea34df463f5ce171c5f
@@@ -41,6 -41,7 +41,7 @@@
  #include <linux/io.h>
  #include <linux/reboot.h>
  #include <linux/bcd.h>
+ #include <linux/ucs2_string.h>
  
  #include <asm/setup.h>
  #include <asm/efi.h>
  #include <asm/cacheflush.h>
  #include <asm/tlbflush.h>
  #include <asm/x86_init.h>
 +#include <asm/rtc.h>
  
  #define EFI_DEBUG     1
  
+ /*
+  * There's some additional metadata associated with each
+  * variable. Intel's reference implementation is 60 bytes - bump that
+  * to account for potential alignment constraints
+  */
+ #define VAR_METADATA_SIZE 64
  struct efi __read_mostly efi = {
        .mps        = EFI_INVALID_TABLE_ADDR,
        .acpi       = EFI_INVALID_TABLE_ADDR,
@@@ -70,6 -77,13 +78,13 @@@ struct efi_memory_map memmap
  static struct efi efi_phys __initdata;
  static efi_system_table_t efi_systab __initdata;
  
+ static u64 efi_var_store_size;
+ static u64 efi_var_remaining_size;
+ static u64 efi_var_max_var_size;
+ static u64 boot_used_size;
+ static u64 boot_var_size;
+ static u64 active_size;
  unsigned long x86_efi_facility;
  
  /*
@@@ -99,6 -113,15 +114,15 @@@ static int __init setup_add_efi_memmap(
  }
  early_param("add_efi_memmap", setup_add_efi_memmap);
  
+ static bool efi_no_storage_paranoia;
+ static int __init setup_storage_paranoia(char *arg)
+ {
+       efi_no_storage_paranoia = true;
+       return 0;
+ }
+ early_param("efi_no_storage_paranoia", setup_storage_paranoia);
  
  static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
  {
@@@ -163,8 -186,53 +187,53 @@@ static efi_status_t virt_efi_get_next_v
                                               efi_char16_t *name,
                                               efi_guid_t *vendor)
  {
-       return efi_call_virt3(get_next_variable,
-                             name_size, name, vendor);
+       efi_status_t status;
+       static bool finished = false;
+       static u64 var_size;
+       status = efi_call_virt3(get_next_variable,
+                               name_size, name, vendor);
+       if (status == EFI_NOT_FOUND) {
+               finished = true;
+               if (var_size < boot_used_size) {
+                       boot_var_size = boot_used_size - var_size;
+                       active_size += boot_var_size;
+               } else {
+                       printk(KERN_WARNING FW_BUG  "efi: Inconsistent initial sizes\n");
+               }
+       }
+       if (boot_used_size && !finished) {
+               unsigned long size;
+               u32 attr;
+               efi_status_t s;
+               void *tmp;
+               s = virt_efi_get_variable(name, vendor, &attr, &size, NULL);
+               if (s != EFI_BUFFER_TOO_SMALL || !size)
+                       return status;
+               tmp = kmalloc(size, GFP_ATOMIC);
+               if (!tmp)
+                       return status;
+               s = virt_efi_get_variable(name, vendor, &attr, &size, tmp);
+               if (s == EFI_SUCCESS && (attr & EFI_VARIABLE_NON_VOLATILE)) {
+                       var_size += size;
+                       var_size += ucs2_strsize(name, 1024);
+                       active_size += size;
+                       active_size += VAR_METADATA_SIZE;
+                       active_size += ucs2_strsize(name, 1024);
+               }
+               kfree(tmp);
+       }
+       return status;
  }
  
  static efi_status_t virt_efi_set_variable(efi_char16_t *name,
                                          efi_guid_t *vendor,
                                          u32 attr,
                                          unsigned long data_size,
                                          void *data)
  {
-       return efi_call_virt5(set_variable,
-                             name, vendor, attr,
-                             data_size, data);
+       efi_status_t status;
+       u32 orig_attr = 0;
+       unsigned long orig_size = 0;
+       status = virt_efi_get_variable(name, vendor, &orig_attr, &orig_size,
+                                      NULL);
+       if (status != EFI_BUFFER_TOO_SMALL)
+               orig_size = 0;
+       status = efi_call_virt5(set_variable,
+                               name, vendor, attr,
+                               data_size, data);
+       if (status == EFI_SUCCESS) {
+               if (orig_size) {
+                       active_size -= orig_size;
+                       active_size -= ucs2_strsize(name, 1024);
+                       active_size -= VAR_METADATA_SIZE;
+               }
+               if (data_size) {
+                       active_size += data_size;
+                       active_size += ucs2_strsize(name, 1024);
+                       active_size += VAR_METADATA_SIZE;
+               }
+       }
+       return status;
  }
  
  static efi_status_t virt_efi_query_variable_info(u32 attr,
@@@ -259,10 -352,10 +353,10 @@@ static efi_status_t __init phys_efi_get
  
  int efi_set_rtc_mmss(unsigned long nowtime)
  {
 -      int real_seconds, real_minutes;
        efi_status_t    status;
        efi_time_t      eft;
        efi_time_cap_t  cap;
 +      struct rtc_time tm;
  
        status = efi.get_time(&eft, &cap);
        if (status != EFI_SUCCESS) {
                return -1;
        }
  
 -      real_seconds = nowtime % 60;
 -      real_minutes = nowtime / 60;
 -      if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
 -              real_minutes += 30;
 -      real_minutes %= 60;
 -      eft.minute = real_minutes;
 -      eft.second = real_seconds;
 +      rtc_time_to_tm(nowtime, &tm);
 +      if (!rtc_valid_tm(&tm)) {
 +              eft.year = tm.tm_year + 1900;
 +              eft.month = tm.tm_mon + 1;
 +              eft.day = tm.tm_mday;
 +              eft.minute = tm.tm_min;
 +              eft.second = tm.tm_sec;
 +              eft.nanosecond = 0;
 +      } else {
 +              printk(KERN_ERR
 +                     "%s: Invalid EFI RTC value: write of %lx to EFI RTC failed\n",
 +                     __FUNCTION__, nowtime);
 +              return -1;
 +      }
  
        status = efi.set_time(&eft);
        if (status != EFI_SUCCESS) {
@@@ -690,6 -776,9 +784,9 @@@ void __init efi_init(void
        char vendor[100] = "unknown";
        int i = 0;
        void *tmp;
+       struct setup_data *data;
+       struct efi_var_bootdata *efi_var_data;
+       u64 pa_data;
  
  #ifdef CONFIG_X86_32
        if (boot_params.efi_info.efi_systab_hi ||
        if (efi_systab_init(efi_phys.systab))
                return;
  
+       pa_data = boot_params.hdr.setup_data;
+       while (pa_data) {
+               data = early_ioremap(pa_data, sizeof(*efi_var_data));
+               if (data->type == SETUP_EFI_VARS) {
+                       efi_var_data = (struct efi_var_bootdata *)data;
+                       efi_var_store_size = efi_var_data->store_size;
+                       efi_var_remaining_size = efi_var_data->remaining_size;
+                       efi_var_max_var_size = efi_var_data->max_var_size;
+               }
+               pa_data = data->next;
+               early_iounmap(data, sizeof(*efi_var_data));
+       }
+       boot_used_size = efi_var_store_size - efi_var_remaining_size;
        set_bit(EFI_SYSTEM_TABLES, &x86_efi_facility);
  
        /*
@@@ -1007,3 -1112,48 +1120,48 @@@ u64 efi_mem_attributes(unsigned long ph
        }
        return 0;
  }
+ /*
+  * Some firmware has serious problems when using more than 50% of the EFI
+  * variable store, i.e. it triggers bugs that can brick machines. Ensure that
+  * we never use more than this safe limit.
+  *
+  * Return EFI_SUCCESS if it is safe to write 'size' bytes to the variable
+  * store.
+  */
+ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
+ {
+       efi_status_t status;
+       u64 storage_size, remaining_size, max_size;
+       status = efi.query_variable_info(attributes, &storage_size,
+                                        &remaining_size, &max_size);
+       if (status != EFI_SUCCESS)
+               return status;
+       if (!max_size && remaining_size > size)
+               printk_once(KERN_ERR FW_BUG "Broken EFI implementation"
+                           " is returning MaxVariableSize=0\n");
+       /*
+        * Some firmware implementations refuse to boot if there's insufficient
+        * space in the variable store. We account for that by refusing the
+        * write if permitting it would reduce the available space to under
+        * 50%. However, some firmware won't reclaim variable space until
+        * after the used (not merely the actively used) space drops below
+        * a threshold. We can approximate that case with the value calculated
+        * above. If both the firmware and our calculations indicate that the
+        * available space would drop below 50%, refuse the write.
+        */
+       if (!storage_size || size > remaining_size ||
+           (max_size && size > max_size))
+               return EFI_OUT_OF_RESOURCES;
+       if (!efi_no_storage_paranoia &&
+           ((active_size + size + VAR_METADATA_SIZE > storage_size / 2) &&
+            (remaining_size - size < storage_size / 2)))
+               return EFI_OUT_OF_RESOURCES;
+       return EFI_SUCCESS;
+ }
+ EXPORT_SYMBOL_GPL(efi_query_variable_store);
diff --combined include/linux/sched.h
index d13341b550967a0e7fa43663921cf23136fd4685,e692a022527bdaaace8b388268b0c280946fc5c2..78694315c1b4301c89a69d6e1bd74d3f908207e5
@@@ -163,9 -163,10 +163,10 @@@ print_cfs_rq(struct seq_file *m, int cp
  #define TASK_DEAD             64
  #define TASK_WAKEKILL         128
  #define TASK_WAKING           256
- #define TASK_STATE_MAX                512
+ #define TASK_PARKED           512
+ #define TASK_STATE_MAX                1024
  
- #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
+ #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
  
  extern char ___assert_task_state[1 - 2*!!(
                sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
@@@ -526,8 -527,7 +527,8 @@@ struct signal_struct 
        unsigned int            has_child_subreaper:1;
  
        /* POSIX.1b Interval Timers */
 -      struct list_head posix_timers;
 +      int                     posix_timer_id;
 +      struct list_head        posix_timers;
  
        /* ITIMER_REAL timer for the process */
        struct hrtimer real_timer;
diff --combined kernel/hrtimer.c
index c0875ae0de1791239fdc6fadcc1a7f6c8ab09729,14be27feda491da1c3dc9990a5ae80ce649570aa..609d8ff38b745c7d02b76b69be09b82176050bf6
@@@ -63,6 -63,7 +63,7 @@@
  DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
  {
  
+       .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
        .clock_base =
        {
                {
                        .get_time = &ktime_get_boottime,
                        .resolution = KTIME_LOW_RES,
                },
 +              {
 +                      .index = HRTIMER_BASE_TAI,
 +                      .clockid = CLOCK_TAI,
 +                      .get_time = &ktime_get_clocktai,
 +                      .resolution = KTIME_LOW_RES,
 +              },
        }
  };
  
@@@ -96,7 -91,6 +97,7 @@@ static const int hrtimer_clock_to_base_
        [CLOCK_REALTIME]        = HRTIMER_BASE_REALTIME,
        [CLOCK_MONOTONIC]       = HRTIMER_BASE_MONOTONIC,
        [CLOCK_BOOTTIME]        = HRTIMER_BASE_BOOTTIME,
 +      [CLOCK_TAI]             = HRTIMER_BASE_TAI,
  };
  
  static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@@ -113,10 -107,8 +114,10 @@@ static void hrtimer_get_softirq_time(st
  {
        ktime_t xtim, mono, boot;
        struct timespec xts, tom, slp;
 +      s32 tai_offset;
  
        get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
 +      tai_offset = timekeeping_get_tai_offset();
  
        xtim = timespec_to_ktime(xts);
        mono = ktime_add(xtim, timespec_to_ktime(tom));
        base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
        base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
        base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
 +      base->clock_base[HRTIMER_BASE_TAI].softirq_time =
 +                              ktime_add(xtim, ktime_set(tai_offset, 0));
  }
  
  /*
@@@ -286,10 -276,6 +287,10 @@@ ktime_t ktime_add_ns(const ktime_t kt, 
        } else {
                unsigned long rem = do_div(nsec, NSEC_PER_SEC);
  
 +              /* Make sure nsec fits into long */
 +              if (unlikely(nsec > KTIME_SEC_MAX))
 +                      return (ktime_t){ .tv64 = KTIME_MAX };
 +
                tmp = ktime_set((long)nsec, rem);
        }
  
@@@ -666,9 -652,8 +667,9 @@@ static inline ktime_t hrtimer_update_ba
  {
        ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
        ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
 +      ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
  
 -      return ktime_get_update_offsets(offs_real, offs_boot);
 +      return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
  }
  
  /*
@@@ -1026,8 -1011,7 +1027,8 @@@ int __hrtimer_start_range_ns(struct hrt
   * @timer:    the timer to be added
   * @tim:      expiry time
   * @delta_ns: "slack" range for the timer
 - * @mode:     expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
 + * @mode:     expiry mode: absolute (HRTIMER_MODE_ABS) or
 + *            relative (HRTIMER_MODE_REL)
   *
   * Returns:
   *  0 on success
@@@ -1044,8 -1028,7 +1045,8 @@@ EXPORT_SYMBOL_GPL(hrtimer_start_range_n
   * hrtimer_start - (re)start an hrtimer on the current CPU
   * @timer:    the timer to be added
   * @tim:      expiry time
 - * @mode:     expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
 + * @mode:     expiry mode: absolute (HRTIMER_MODE_ABS) or
 + *            relative (HRTIMER_MODE_REL)
   *
   * Returns:
   *  0 on success
@@@ -1327,8 -1310,6 +1328,8 @@@ retry
  
                                expires = ktime_sub(hrtimer_get_expires(timer),
                                                    base->offset);
 +                              if (expires.tv64 < 0)
 +                                      expires.tv64 = KTIME_MAX;
                                if (expires.tv64 < expires_next.tv64)
                                        expires_next = expires;
                                break;
@@@ -1662,8 -1643,6 +1663,6 @@@ static void __cpuinit init_hrtimers_cpu
        struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
        int i;
  
-       raw_spin_lock_init(&cpu_base->lock);
        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                cpu_base->clock_base[i].cpu_base = cpu_base;
                timerqueue_init_head(&cpu_base->clock_base[i].active);
diff --combined kernel/time/tick-broadcast.c
index f8d2109ef0a25db2a3d4152b4d301e3534ce58ff,7f32fe0e52cd46489c8d90e4b85f9d74204aab16..6e23fde83dbeb21faeca1d5500003cd7e1a373e0
@@@ -28,8 -28,9 +28,8 @@@
   */
  
  static struct tick_device tick_broadcast_device;
 -/* FIXME: Use cpumask_var_t. */
 -static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
 -static DECLARE_BITMAP(tmpmask, NR_CPUS);
 +static cpumask_var_t tick_broadcast_mask;
 +static cpumask_var_t tmpmask;
  static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
  static int tick_broadcast_force;
  
@@@ -49,7 -50,7 +49,7 @@@ struct tick_device *tick_get_broadcast_
  
  struct cpumask *tick_get_broadcast_mask(void)
  {
 -      return to_cpumask(tick_broadcast_mask);
 +      return tick_broadcast_mask;
  }
  
  /*
@@@ -66,25 -67,16 +66,26 @@@ static void tick_broadcast_start_period
   */
  int tick_check_broadcast_device(struct clock_event_device *dev)
  {
-       if ((tick_broadcast_device.evtdev &&
+       if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
+           (tick_broadcast_device.evtdev &&
             tick_broadcast_device.evtdev->rating >= dev->rating) ||
             (dev->features & CLOCK_EVT_FEAT_C3STOP))
                return 0;
  
        clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
        tick_broadcast_device.evtdev = dev;
 -      if (!cpumask_empty(tick_get_broadcast_mask()))
 +      if (!cpumask_empty(tick_broadcast_mask))
                tick_broadcast_start_periodic(dev);
 +      /*
 +       * Inform all cpus about this. We might be in a situation
 +       * where we did not switch to oneshot mode because the per cpu
 +       * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack
 +       * of a oneshot capable broadcast device. Without that
 +       * notification the system stays stuck in periodic mode
 +       * forever.
 +       */
 +      if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
 +              tick_clock_notify();
        return 1;
  }
  
@@@ -132,7 -124,7 +133,7 @@@ int tick_device_uses_broadcast(struct c
        if (!tick_device_is_functional(dev)) {
                dev->event_handler = tick_handle_periodic;
                tick_device_setup_broadcast_func(dev);
 -              cpumask_set_cpu(cpu, tick_get_broadcast_mask());
 +              cpumask_set_cpu(cpu, tick_broadcast_mask);
                tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
                ret = 1;
        } else {
                 */
                if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
                        int cpu = smp_processor_id();
 -                      cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
 +                      cpumask_clear_cpu(cpu, tick_broadcast_mask);
                        tick_broadcast_clear_oneshot(cpu);
                } else {
                        tick_device_setup_broadcast_func(dev);
@@@ -207,8 -199,9 +208,8 @@@ static void tick_do_periodic_broadcast(
  {
        raw_spin_lock(&tick_broadcast_lock);
  
 -      cpumask_and(to_cpumask(tmpmask),
 -                  cpu_online_mask, tick_get_broadcast_mask());
 -      tick_do_broadcast(to_cpumask(tmpmask));
 +      cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
 +      tick_do_broadcast(tmpmask);
  
        raw_spin_unlock(&tick_broadcast_lock);
  }
@@@ -271,12 -264,13 +272,12 @@@ static void tick_do_broadcast_on_off(un
        if (!tick_device_is_functional(dev))
                goto out;
  
 -      bc_stopped = cpumask_empty(tick_get_broadcast_mask());
 +      bc_stopped = cpumask_empty(tick_broadcast_mask);
  
        switch (*reason) {
        case CLOCK_EVT_NOTIFY_BROADCAST_ON:
        case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
 -              if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
 -                      cpumask_set_cpu(cpu, tick_get_broadcast_mask());
 +              if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
                        if (tick_broadcast_device.mode ==
                            TICKDEV_MODE_PERIODIC)
                                clockevents_shutdown(dev);
                break;
        case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
                if (!tick_broadcast_force &&
 -                  cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
 -                      cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
 +                  cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
                        if (tick_broadcast_device.mode ==
                            TICKDEV_MODE_PERIODIC)
                                tick_setup_periodic(dev, 0);
                break;
        }
  
 -      if (cpumask_empty(tick_get_broadcast_mask())) {
 +      if (cpumask_empty(tick_broadcast_mask)) {
                if (!bc_stopped)
                        clockevents_shutdown(bc);
        } else if (bc_stopped) {
@@@ -343,10 -338,10 +344,10 @@@ void tick_shutdown_broadcast(unsigned i
        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
  
        bc = tick_broadcast_device.evtdev;
 -      cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
 +      cpumask_clear_cpu(cpu, tick_broadcast_mask);
  
        if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
 -              if (bc && cpumask_empty(tick_get_broadcast_mask()))
 +              if (bc && cpumask_empty(tick_broadcast_mask))
                        clockevents_shutdown(bc);
        }
  
@@@ -382,13 -377,13 +383,13 @@@ int tick_resume_broadcast(void
  
                switch (tick_broadcast_device.mode) {
                case TICKDEV_MODE_PERIODIC:
 -                      if (!cpumask_empty(tick_get_broadcast_mask()))
 +                      if (!cpumask_empty(tick_broadcast_mask))
                                tick_broadcast_start_periodic(bc);
                        broadcast = cpumask_test_cpu(smp_processor_id(),
 -                                                   tick_get_broadcast_mask());
 +                                                   tick_broadcast_mask);
                        break;
                case TICKDEV_MODE_ONESHOT:
 -                      if (!cpumask_empty(tick_get_broadcast_mask()))
 +                      if (!cpumask_empty(tick_broadcast_mask))
                                broadcast = tick_resume_broadcast_oneshot(bc);
                        break;
                }
  
  #ifdef CONFIG_TICK_ONESHOT
  
 -/* FIXME: use cpumask_var_t. */
 -static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
 +static cpumask_var_t tick_broadcast_oneshot_mask;
 +static cpumask_var_t tick_broadcast_pending_mask;
 +static cpumask_var_t tick_broadcast_force_mask;
  
  /*
   * Exposed for debugging: see timer_list.c
   */
  struct cpumask *tick_get_broadcast_oneshot_mask(void)
  {
 -      return to_cpumask(tick_broadcast_oneshot_mask);
 +      return tick_broadcast_oneshot_mask;
  }
  
 -static int tick_broadcast_set_event(ktime_t expires, int force)
 +/*
 + * Called before going idle with interrupts disabled. Checks whether a
 + * broadcast event from the other core is about to happen. We detected
 + * that in tick_broadcast_oneshot_control(). The callsite can use this
 + * to avoid a deep idle transition as we are about to get the
 + * broadcast IPI right away.
 + */
 +int tick_check_broadcast_expired(void)
  {
 -      struct clock_event_device *bc = tick_broadcast_device.evtdev;
 +      return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
 +}
 +
 +/*
 + * Set broadcast interrupt affinity
 + */
 +static void tick_broadcast_set_affinity(struct clock_event_device *bc,
 +                                      const struct cpumask *cpumask)
 +{
 +      if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ))
 +              return;
 +
 +      if (cpumask_equal(bc->cpumask, cpumask))
 +              return;
 +
 +      bc->cpumask = cpumask;
 +      irq_set_affinity(bc->irq, bc->cpumask);
 +}
 +
 +static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
 +                                  ktime_t expires, int force)
 +{
 +      int ret;
  
        if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
                clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
  
 -      return clockevents_program_event(bc, expires, force);
 +      ret = clockevents_program_event(bc, expires, force);
 +      if (!ret)
 +              tick_broadcast_set_affinity(bc, cpumask_of(cpu));
 +      return ret;
  }
  
  int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
   */
  void tick_check_oneshot_broadcast(int cpu)
  {
 -      if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
 +      if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
                struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
  
                clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@@ -481,39 -443,27 +482,39 @@@ static void tick_handle_oneshot_broadca
  {
        struct tick_device *td;
        ktime_t now, next_event;
 -      int cpu;
 +      int cpu, next_cpu = 0;
  
        raw_spin_lock(&tick_broadcast_lock);
  again:
        dev->next_event.tv64 = KTIME_MAX;
        next_event.tv64 = KTIME_MAX;
 -      cpumask_clear(to_cpumask(tmpmask));
 +      cpumask_clear(tmpmask);
        now = ktime_get();
        /* Find all expired events */
 -      for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
 +      for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
                td = &per_cpu(tick_cpu_device, cpu);
 -              if (td->evtdev->next_event.tv64 <= now.tv64)
 -                      cpumask_set_cpu(cpu, to_cpumask(tmpmask));
 -              else if (td->evtdev->next_event.tv64 < next_event.tv64)
 +              if (td->evtdev->next_event.tv64 <= now.tv64) {
 +                      cpumask_set_cpu(cpu, tmpmask);
 +                      /*
 +                       * Mark the remote cpu in the pending mask, so
 +                       * it can avoid reprogramming the cpu local
 +                       * timer in tick_broadcast_oneshot_control().
 +                       */
 +                      cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
 +              } else if (td->evtdev->next_event.tv64 < next_event.tv64) {
                        next_event.tv64 = td->evtdev->next_event.tv64;
 +                      next_cpu = cpu;
 +              }
        }
  
 +      /* Take care of enforced broadcast requests */
 +      cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
 +      cpumask_clear(tick_broadcast_force_mask);
 +
        /*
         * Wakeup the cpus which have an expired event.
         */
 -      tick_do_broadcast(to_cpumask(tmpmask));
 +      tick_do_broadcast(tmpmask);
  
        /*
         * Two reasons for reprogram:
                 * Rearm the broadcast device. If event expired,
                 * repeat the above
                 */
 -              if (tick_broadcast_set_event(next_event, 0))
 +              if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
                        goto again;
        }
        raw_spin_unlock(&tick_broadcast_lock);
@@@ -545,7 -495,6 +546,7 @@@ void tick_broadcast_oneshot_control(uns
        struct clock_event_device *bc, *dev;
        struct tick_device *td;
        unsigned long flags;
 +      ktime_t now;
        int cpu;
  
        /*
  
        raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
        if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
 -              if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
 -                      cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
 +              WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
 +              if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
                        clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
 -                      if (dev->next_event.tv64 < bc->next_event.tv64)
 -                              tick_broadcast_set_event(dev->next_event, 1);
 +                      /*
 +                       * We only reprogram the broadcast timer if we
 +                       * did not mark ourself in the force mask and
 +                       * if the cpu local event is earlier than the
 +                       * broadcast event. If the current CPU is in
 +                       * the force mask, then we are going to be
 +                       * woken by the IPI right away.
 +                       */
 +                      if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
 +                          dev->next_event.tv64 < bc->next_event.tv64)
 +                              tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
                }
        } else {
 -              if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
 -                      cpumask_clear_cpu(cpu,
 -                                        tick_get_broadcast_oneshot_mask());
 +              if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
                        clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
 -                      if (dev->next_event.tv64 != KTIME_MAX)
 -                              tick_program_event(dev->next_event, 1);
 +                      if (dev->next_event.tv64 == KTIME_MAX)
 +                              goto out;
 +                      /*
 +                       * The cpu which was handling the broadcast
 +                       * timer marked this cpu in the broadcast
 +                       * pending mask and fired the broadcast
 +                       * IPI. So we are going to handle the expired
 +                       * event anyway via the broadcast IPI
 +                       * handler. No need to reprogram the timer
 +                       * with an already expired event.
 +                       */
 +                      if (cpumask_test_and_clear_cpu(cpu,
 +                                     tick_broadcast_pending_mask))
 +                              goto out;
 +
 +                      /*
 +                       * If the pending bit is not set, then we are
 +                       * either the CPU handling the broadcast
 +                       * interrupt or we got woken by something else.
 +                       *
 +                       * We are no longer in the broadcast mask, so
 +                       * if the cpu local expiry time is already
 +                       * reached, we would reprogram the cpu local
 +                       * timer with an already expired event.
 +                       *
 +                       * This can lead to a ping-pong when we return
 +                       * to idle and therefore rearm the broadcast
 +                       * timer before the cpu local timer was able
 +                       * to fire. This happens because the forced
 +                       * reprogramming makes sure that the event
 +                       * will happen in the future and depending on
 +                       * the min_delta setting this might be far
 +                       * enough out that the ping-pong starts.
 +                       *
 +                       * If the cpu local next_event has expired
 +                       * then we know that the broadcast timer
 +                       * next_event has expired as well and
 +                       * broadcast is about to be handled. So we
 +                       * avoid reprogramming and enforce that the
 +                       * broadcast handler, which did not run yet,
 +                       * will invoke the cpu local handler.
 +                       *
 +                       * We cannot call the handler directly from
 +                       * here, because we might be in a NOHZ phase
 +                       * and we did not go through the irq_enter()
 +                       * nohz fixups.
 +                       */
 +                      now = ktime_get();
 +                      if (dev->next_event.tv64 <= now.tv64) {
 +                              cpumask_set_cpu(cpu, tick_broadcast_force_mask);
 +                              goto out;
 +                      }
 +                      /*
 +                       * We got woken by something else. Reprogram
 +                       * the cpu local timer device.
 +                       */
 +                      tick_program_event(dev->next_event, 1);
                }
        }
 +out:
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
  }
  
   */
  static void tick_broadcast_clear_oneshot(int cpu)
  {
 -      cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
 +      cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
  }
  
  static void tick_broadcast_init_next_event(struct cpumask *mask,
@@@ -696,16 -582,17 +697,16 @@@ void tick_broadcast_setup_oneshot(struc
                 * oneshot_mask bits for those and program the
                 * broadcast device to fire.
                 */
 -              cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
 -              cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
 -              cpumask_or(tick_get_broadcast_oneshot_mask(),
 -                         tick_get_broadcast_oneshot_mask(),
 -                         to_cpumask(tmpmask));
 +              cpumask_copy(tmpmask, tick_broadcast_mask);
 +              cpumask_clear_cpu(cpu, tmpmask);
 +              cpumask_or(tick_broadcast_oneshot_mask,
 +                         tick_broadcast_oneshot_mask, tmpmask);
  
 -              if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
 +              if (was_periodic && !cpumask_empty(tmpmask)) {
                        clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
 -                      tick_broadcast_init_next_event(to_cpumask(tmpmask),
 +                      tick_broadcast_init_next_event(tmpmask,
                                                       tick_next_period);
 -                      tick_broadcast_set_event(tick_next_period, 1);
 +                      tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
                } else
                        bc->next_event.tv64 = KTIME_MAX;
        } else {
@@@ -753,7 -640,7 +754,7 @@@ void tick_shutdown_broadcast_oneshot(un
         * Clear the broadcast mask flag for the dead cpu, but do not
         * stop the broadcast device!
         */
 -      cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
 +      cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
  
        raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
  }
@@@ -777,14 -664,3 +778,14 @@@ bool tick_broadcast_oneshot_available(v
  }
  
  #endif
 +
 +void __init tick_broadcast_init(void)
 +{
 +      alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
 +      alloc_cpumask_var(&tmpmask, GFP_NOWAIT);
 +#ifdef CONFIG_TICK_ONESHOT
 +      alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
 +      alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
 +      alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
 +#endif
 +}