Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 26 Jul 2012 20:17:17 +0000 (13:17 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 26 Jul 2012 20:17:17 +0000 (13:17 -0700)
Pull x86/mm changes from Peter Anvin:
 "The big change here is the patchset by Alex Shi to use INVLPG to flush
  only the affected pages when we only need to flush a small page range.

  It also removes the special INVALIDATE_TLB_VECTOR interrupts (32
  vectors!) and replaces them with an ordinary IPI function call."

Fix up trivial conflicts in arch/x86/include/asm/apic.h (added code next
to changed line)

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/tlb: Fix build warning and crash when building for !SMP
  x86/tlb: do flush_tlb_kernel_range by 'invlpg'
  x86/tlb: replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR
  x86/tlb: enable tlb flush range support for x86
  mm/mmu_gather: enable tlb flush range in generic mmu_gather
  x86/tlb: add tlb_flushall_shift knob into debugfs
  x86/tlb: add tlb_flushall_shift for specific CPU
  x86/tlb: fall back to flush all when meet a THP large page
  x86/flush_tlb: try flush_tlb_single one by one in flush_tlb_range
  x86/tlb_info: get last level TLB entry number of CPU
  x86: Add read_mostly declaration/definition to variables from smp.h
  x86: Define early read-mostly per-cpu macros
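
For context, here is a rough userspace sketch of the flush-range heuristic
the message above describes (illustrative only: the tlb_flushall_shift value,
the TLB size, and the flush_range()/printf stand-ins are made up, not the
kernel's code). The idea is that a range is flushed page-by-page with INVLPG
only when it is small relative to the last-level TLB size scaled by
tlb_flushall_shift; otherwise the whole TLB is flushed, and tlb_flushall_shift
of -1 disables the per-page path entirely.

#include <stdio.h>

#define PAGE_SHIFT 12

static int tlb_flushall_shift = 5;      /* hypothetical tuning value */
static unsigned int tlb_entries = 512;  /* hypothetical last-level dTLB size */

static void flush_range(unsigned long start, unsigned long end)
{
        unsigned long nr_pages = (end - start) >> PAGE_SHIFT;

        if (tlb_flushall_shift == -1 ||
            nr_pages > (tlb_entries >> tlb_flushall_shift)) {
                /* too many pages: cheaper to flush everything (CR3 reload) */
                printf("flush all: %lu pages\n", nr_pages);
        } else {
                /* small range: flush each page individually */
                for (unsigned long addr = start; addr < end;
                     addr += 1UL << PAGE_SHIFT)
                        printf("invlpg %#lx\n", addr);  /* stands in for INVLPG */
        }
}

int main(void)
{
        flush_range(0x400000, 0x408000);   /* 8 pages    -> per-page INVLPG */
        flush_range(0x400000, 0x800000);   /* 1024 pages -> full flush */
        return 0;
}

Per the pull request text, remote CPUs are reached with an ordinary
smp_call_function()-style IPI in this series rather than the 32 dedicated
INVALIDATE_TLB_VECTOR vectors.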

arch/x86/include/asm/apic.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/paravirt_types.h
arch/x86/include/asm/smp.h
arch/x86/kernel/apic/apic.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/entry_64.S
arch/x86/kernel/smpboot.c
arch/x86/platform/uv/tlb_uv.c
arch/x86/xen/mmu.c
mm/memory.c

index 3ea51a84a0e447a1644851500c3470cfd99a46c2,a907d4d251a83baceed25ac6a29689e9c49484ed..f34261296ffb71bc9f62f57ecf069508fe40d9d6
@@@ -306,8 -306,7 +306,8 @@@ struct apic 
        unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
        unsigned long (*check_apicid_present)(int apicid);
  
 -      void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
 +      void (*vector_allocation_domain)(int cpu, struct cpumask *retmask,
 +                                       const struct cpumask *mask);
        void (*init_apic_ldr)(void);
  
        void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
        unsigned long (*set_apic_id)(unsigned int id);
        unsigned long apic_id_mask;
  
 -      unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
 -      unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
 -                                             const struct cpumask *andmask);
 +      int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
 +                                    const struct cpumask *andmask,
 +                                    unsigned int *apicid);
  
        /* ipi */
        void (*send_IPI_mask)(const struct cpumask *mask, int vector);
@@@ -465,8 -464,6 +465,8 @@@ static inline u32 safe_apic_wait_icr_id
        return apic->safe_wait_icr_idle();
  }
  
 +extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v));
 +
  #else /* CONFIG_X86_LOCAL_APIC */
  
  static inline u32 apic_read(u32 reg) { return 0; }
@@@ -476,7 -473,6 +476,7 @@@ static inline u64 apic_icr_read(void) 
  static inline void apic_icr_write(u32 low, u32 high) { }
  static inline void apic_wait_icr_idle(void) { }
  static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
 +static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}
  
  #endif /* CONFIG_X86_LOCAL_APIC */
  
@@@ -541,12 -537,7 +541,12 @@@ static inline const struct cpumask *def
  #endif
  }
  
- DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
 +static inline const struct cpumask *online_target_cpus(void)
 +{
 +      return cpu_online_mask;
 +}
 +
+ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
  
  
  static inline unsigned int read_apic_id(void)
@@@ -595,50 -586,21 +595,50 @@@ static inline int default_phys_pkg_id(i
  
  #endif
  
 -static inline unsigned int
 -default_cpu_mask_to_apicid(const struct cpumask *cpumask)
 +static inline int
 +flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 +                          const struct cpumask *andmask,
 +                          unsigned int *apicid)
  {
 -      return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
 +      unsigned long cpu_mask = cpumask_bits(cpumask)[0] &
 +                               cpumask_bits(andmask)[0] &
 +                               cpumask_bits(cpu_online_mask)[0] &
 +                               APIC_ALL_CPUS;
 +
 +      if (likely(cpu_mask)) {
 +              *apicid = (unsigned int)cpu_mask;
 +              return 0;
 +      } else {
 +              return -EINVAL;
 +      }
  }
  
 -static inline unsigned int
 +extern int
  default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 -                             const struct cpumask *andmask)
 +                             const struct cpumask *andmask,
 +                             unsigned int *apicid);
 +
 +static inline void
 +flat_vector_allocation_domain(int cpu, struct cpumask *retmask,
 +                            const struct cpumask *mask)
  {
 -      unsigned long mask1 = cpumask_bits(cpumask)[0];
 -      unsigned long mask2 = cpumask_bits(andmask)[0];
 -      unsigned long mask3 = cpumask_bits(cpu_online_mask)[0];
 +      /* Careful. Some cpus do not strictly honor the set of cpus
 +       * specified in the interrupt destination when using lowest
 +       * priority interrupt delivery mode.
 +       *
 +       * In particular there was a hyperthreading cpu observed to
 +       * deliver interrupts to the wrong hyperthread when only one
 +       * hyperthread was specified in the interrupt desitination.
 +       */
 +      cpumask_clear(retmask);
 +      cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
 +}
  
 -      return (unsigned int)(mask1 & mask2 & mask3);
 +static inline void
 +default_vector_allocation_domain(int cpu, struct cpumask *retmask,
 +                               const struct cpumask *mask)
 +{
 +      cpumask_copy(retmask, cpumask_of(cpu));
  }
  
  static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
index 0b47ddb6f00b300366253a343c4a78424a8fc280,7e2c2a6357374ed50bcaafcc43606430411cfc1d..a0facf3908d7fc0e0762d968d5ec463f7a68b057
@@@ -128,11 -128,21 +128,11 @@@ static inline u64 paravirt_read_msr(uns
        return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
  }
  
 -static inline int paravirt_rdmsr_regs(u32 *regs)
 -{
 -      return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs);
 -}
 -
  static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
  {
        return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
  }
  
 -static inline int paravirt_wrmsr_regs(u32 *regs)
 -{
 -      return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs);
 -}
 -
  /* These should all do BUG_ON(_err), but our headers are too tangled. */
  #define rdmsr(msr, val1, val2)                        \
  do {                                          \
@@@ -166,6 -176,9 +166,6 @@@ do {                                               
        _err;                                   \
  })
  
 -#define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs)
 -#define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs)
 -
  static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
  {
        int err;
        *p = paravirt_read_msr(msr, &err);
        return err;
  }
 -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
 -{
 -      u32 gprs[8] = { 0 };
 -      int err;
 -
 -      gprs[1] = msr;
 -      gprs[7] = 0x9c5a203a;
 -
 -      err = paravirt_rdmsr_regs(gprs);
 -
 -      *p = gprs[0] | ((u64)gprs[2] << 32);
 -
 -      return err;
 -}
 -
 -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
 -{
 -      u32 gprs[8] = { 0 };
 -
 -      gprs[0] = (u32)val;
 -      gprs[1] = msr;
 -      gprs[2] = val >> 32;
 -      gprs[7] = 0x9c5a203a;
 -
 -      return paravirt_wrmsr_regs(gprs);
 -}
  
  static inline u64 paravirt_read_tsc(void)
  {
@@@ -213,8 -252,6 +213,8 @@@ do {                                               
        high = _l >> 32;                        \
  } while (0)
  
 +#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
 +
  static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
  {
        return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
@@@ -360,9 -397,10 +360,10 @@@ static inline void __flush_tlb_single(u
  
  static inline void flush_tlb_others(const struct cpumask *cpumask,
                                    struct mm_struct *mm,
-                                   unsigned long va)
+                                   unsigned long start,
+                                   unsigned long end)
  {
-       PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va);
+       PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
  }
  
  static inline int paravirt_pgd_alloc(struct mm_struct *mm)
index 8613cbb7ba41e63d8626e2bae1356793ca279a1d,600a5fcac9cd1e3c4551313163ae7ebc897f7c13..142236ed83af580c06e2e00893501e3fc38b0e65
@@@ -153,7 -153,9 +153,7 @@@ struct pv_cpu_ops 
        /* MSR, PMC and TSR operations.
           err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
        u64 (*read_msr)(unsigned int msr, int *err);
 -      int (*rdmsr_regs)(u32 *regs);
        int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
 -      int (*wrmsr_regs)(u32 *regs);
  
        u64 (*read_tsc)(void);
        u64 (*read_pmc)(int counter);
@@@ -248,7 -250,8 +248,8 @@@ struct pv_mmu_ops 
        void (*flush_tlb_single)(unsigned long addr);
        void (*flush_tlb_others)(const struct cpumask *cpus,
                                 struct mm_struct *mm,
-                                unsigned long va);
+                                unsigned long start,
+                                unsigned long end);
  
        /* Hooks for allocating and freeing a pagetable top-level */
        int  (*pgd_alloc)(struct mm_struct *mm);
index 2ffa95dc2333bcc5a36efb898fd7ad12e95f5842,cc1df2b5cc65afcbd6f89dc252a3fad48ce6a5b1..4f19a1526037364dd9a0b5f41f0125ad52d15988
@@@ -31,12 -31,12 +31,12 @@@ static inline bool cpu_has_ht_siblings(
        return has_siblings;
  }
  
- DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
- DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
+ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
+ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
  /* cpus sharing the last level cache: */
- DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
- DECLARE_PER_CPU(u16, cpu_llc_id);
- DECLARE_PER_CPU(int, cpu_number);
+ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+ DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
+ DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
  
  static inline struct cpumask *cpu_sibling_mask(int cpu)
  {
@@@ -53,10 -53,10 +53,10 @@@ static inline struct cpumask *cpu_llc_s
        return per_cpu(cpu_llc_shared_map, cpu);
  }
  
- DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
- DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
+ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
+ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
- DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
+ DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
  #endif
  
  /* Static state in head.S used to set up a CPU */
@@@ -169,6 -169,11 +169,6 @@@ void x86_idle_thread_init(unsigned int 
  void smp_store_cpu_info(int id);
  #define cpu_physical_id(cpu)  per_cpu(x86_cpu_to_apicid, cpu)
  
 -/* We don't mark CPUs online until __cpu_up(), so we need another measure */
 -static inline int num_booting_cpus(void)
 -{
 -      return cpumask_weight(cpu_callout_mask);
 -}
  #else /* !CONFIG_SMP */
  #define wbinvd_on_cpu(cpu)     wbinvd()
  static inline int wbinvd_on_all_cpus(void)
index 98e24131ff3a831fce89d957fe8b3cb9740264dd,0443b6482214dd1c081f69d34bdb297b63512e6e..24deb308232824225e1e7f8d3933ea694778da4d
@@@ -75,8 -75,8 +75,8 @@@ physid_mask_t phys_cpu_present_map
  /*
   * Map cpu index to physical APIC ID
   */
- DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
- DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
+ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);
  EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
  EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
  
@@@ -88,7 -88,7 +88,7 @@@
   * used for the mapping.  This is where the behaviors of x86_64 and 32
   * actually diverge.  Let's keep it ugly for now.
   */
- DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
+ DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
  
  /*
   * Knob to control our willingness to enable the local APIC.
@@@ -2123,42 -2123,6 +2123,42 @@@ void default_init_apic_ldr(void
        apic_write(APIC_LDR, val);
  }
  
 +int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 +                                 const struct cpumask *andmask,
 +                                 unsigned int *apicid)
 +{
 +      unsigned int cpu;
 +
 +      for_each_cpu_and(cpu, cpumask, andmask) {
 +              if (cpumask_test_cpu(cpu, cpu_online_mask))
 +                      break;
 +      }
 +
 +      if (likely(cpu < nr_cpu_ids)) {
 +              *apicid = per_cpu(x86_cpu_to_apicid, cpu);
 +              return 0;
 +      }
 +
 +      return -EINVAL;
 +}
 +
 +/*
 + * Override the generic EOI implementation with an optimized version.
 + * Only called during early boot when only one CPU is active and with
 + * interrupts disabled, so we know this does not race with actual APIC driver
 + * use.
 + */
 +void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
 +{
 +      struct apic **drv;
 +
 +      for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
 +              /* Should happen once for each apic */
 +              WARN_ON((*drv)->eoi_write == eoi_write);
 +              (*drv)->eoi_write = eoi_write;
 +      }
 +}
 +
  /*
   * Power management
   */
index 5bbc082c47ad8d1fef9950c5dda873f1fdd5975c,7595552600b85c8918e94a77e3043cc1e17b76e4..46d8786d655e402b702cc2f19ba4eab9cb5a62cd
@@@ -452,6 -452,35 +452,35 @@@ void __cpuinit cpu_detect_cache_sizes(s
        c->x86_cache_size = l2size;
  }
  
+ u16 __read_mostly tlb_lli_4k[NR_INFO];
+ u16 __read_mostly tlb_lli_2m[NR_INFO];
+ u16 __read_mostly tlb_lli_4m[NR_INFO];
+ u16 __read_mostly tlb_lld_4k[NR_INFO];
+ u16 __read_mostly tlb_lld_2m[NR_INFO];
+ u16 __read_mostly tlb_lld_4m[NR_INFO];
+ /*
+  * tlb_flushall_shift shows the balance point in replacing cr3 write
+  * with multiple 'invlpg'. It will do this replacement when
+  *   flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
+  * If tlb_flushall_shift is -1, means the replacement will be disabled.
+  */
+ s8  __read_mostly tlb_flushall_shift = -1;
+ void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)
+ {
+       if (this_cpu->c_detect_tlb)
+               this_cpu->c_detect_tlb(c);
+       printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
+               "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n"          \
+               "tlb_flushall_shift is 0x%x\n",
+               tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
+               tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
+               tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
+               tlb_flushall_shift);
+ }
  void __cpuinit detect_ht(struct cpuinfo_x86 *c)
  {
  #ifdef CONFIG_X86_HT
@@@ -911,6 -940,8 +940,8 @@@ void __init identify_boot_cpu(void
  #else
        vgetcpu_set_mode();
  #endif
+       if (boot_cpu_data.cpuid_level >= 2)
+               cpu_detect_tlb(&boot_cpu_data);
  }
  
  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@@ -947,7 -978,7 +978,7 @@@ static void __cpuinit __print_cpu_msr(v
                index_max = msr_range_array[i].max;
  
                for (index = index_min; index < index_max; index++) {
 -                      if (rdmsrl_amd_safe(index, &val))
 +                      if (rdmsrl_safe(index, &val))
                                continue;
                        printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
                }
index 111f6bbd8b38afb1a175d65dbbde7a981a0de6f7,bcf28e1ce1a700a0300d352a5aaf22bb5b70edad..69babd8c834f920b4d54c48e1f41a08d4f7fef6f
@@@ -1048,24 -1048,6 +1048,6 @@@ apicinterrupt LOCAL_TIMER_VECTOR 
  apicinterrupt X86_PLATFORM_IPI_VECTOR \
        x86_platform_ipi smp_x86_platform_ipi
  
- #ifdef CONFIG_SMP
-       ALIGN
-       INTR_FRAME
- .irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
-       16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
- .if NUM_INVALIDATE_TLB_VECTORS > \idx
- ENTRY(invalidate_interrupt\idx)
-       pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
-       jmp .Lcommon_invalidate_interrupt0
-       CFI_ADJUST_CFA_OFFSET -8
- END(invalidate_interrupt\idx)
- .endif
- .endr
-       CFI_ENDPROC
- apicinterrupt INVALIDATE_TLB_VECTOR_START, \
-       invalidate_interrupt0, smp_invalidate_interrupt
- #endif
  apicinterrupt THRESHOLD_APIC_VECTOR \
        threshold_interrupt smp_threshold_interrupt
  apicinterrupt THERMAL_APIC_VECTOR \
@@@ -1758,30 -1740,10 +1740,30 @@@ end_repeat_nmi
         */
        call save_paranoid
        DEFAULT_FRAME 0
 +
 +      /*
 +       * Save off the CR2 register. If we take a page fault in the NMI then
 +       * it could corrupt the CR2 value. If the NMI preempts a page fault
 +       * handler before it was able to read the CR2 register, and then the
 +       * NMI itself takes a page fault, the page fault that was preempted
 +       * will read the information from the NMI page fault and not the
 +       * origin fault. Save it off and restore it if it changes.
 +       * Use the r12 callee-saved register.
 +       */
 +      movq %cr2, %r12
 +
        /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
        movq %rsp,%rdi
        movq $-1,%rsi
        call do_nmi
 +
 +      /* Did the NMI take a page fault? Restore cr2 if it did */
 +      movq %cr2, %rcx
 +      cmpq %rcx, %r12
 +      je 1f
 +      movq %r12, %cr2
 +1:
 +      
        testl %ebx,%ebx                         /* swapgs needed? */
        jnz nmi_restore
  nmi_swapgs:
index c1a310fb8309b471301330abeda28b040fca5824,e61110e29a8ca0e768c6b98889400047f99bf337..7c5a8c314c0268a2ba0b101802c057ad428d8b42
@@@ -1,4 -1,4 +1,4 @@@
 -/*
 + /*
   *    x86 SMP booting functions
   *
   *    (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
@@@ -39,8 -39,6 +39,8 @@@
   *    Glauber Costa           :       i386 and x86_64 integration
   */
  
 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 +
  #include <linux/init.h>
  #include <linux/smp.h>
  #include <linux/module.h>
@@@ -106,17 -104,17 +106,17 @@@ int smp_num_siblings = 1
  EXPORT_SYMBOL(smp_num_siblings);
  
  /* Last level cache ID of each logical CPU */
- DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
+ DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
  
  /* representing HT siblings of each logical CPU */
- DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
  EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
  
  /* representing HT and core siblings of each logical CPU */
- DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
  EXPORT_PER_CPU_SYMBOL(cpu_core_map);
  
- DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
+ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
  
  /* Per CPU bogomips and other parameters */
  DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@@ -186,7 -184,7 +186,7 @@@ static void __cpuinit smp_callin(void
         * boards)
         */
  
 -      pr_debug("CALLIN, before setup_local_APIC().\n");
 +      pr_debug("CALLIN, before setup_local_APIC()\n");
        if (apic->smp_callin_clear_local_apic)
                apic->smp_callin_clear_local_apic();
        setup_local_APIC();
@@@ -257,13 -255,22 +257,13 @@@ notrace static void __cpuinit start_sec
        check_tsc_sync_target();
  
        /*
 -       * We need to hold call_lock, so there is no inconsistency
 -       * between the time smp_call_function() determines number of
 -       * IPI recipients, and the time when the determination is made
 -       * for which cpus receive the IPI. Holding this
 -       * lock helps us to not include this cpu in a currently in progress
 -       * smp_call_function().
 -       *
         * We need to hold vector_lock so there the set of online cpus
         * does not change while we are assigning vectors to cpus.  Holding
         * this lock ensures we don't half assign or remove an irq from a cpu.
         */
 -      ipi_call_lock();
        lock_vector_lock();
        set_cpu_online(smp_processor_id(), true);
        unlock_vector_lock();
 -      ipi_call_unlock();
        per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
        x86_platform.nmi_init();
  
@@@ -342,12 -349,9 +342,12 @@@ static bool __cpuinit match_llc(struct 
  
  static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
  {
 -      if (c->phys_proc_id == o->phys_proc_id)
 -              return topology_sane(c, o, "mc");
 +      if (c->phys_proc_id == o->phys_proc_id) {
 +              if (cpu_has(c, X86_FEATURE_AMD_DCM))
 +                      return true;
  
 +              return topology_sane(c, o, "mc");
 +      }
        return false;
  }
  
@@@ -425,16 -429,17 +425,16 @@@ static void impress_friends(void
        /*
         * Allow the user to impress friends.
         */
 -      pr_debug("Before bogomips.\n");
 +      pr_debug("Before bogomips\n");
        for_each_possible_cpu(cpu)
                if (cpumask_test_cpu(cpu, cpu_callout_mask))
                        bogosum += cpu_data(cpu).loops_per_jiffy;
 -      printk(KERN_INFO
 -              "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
 +      pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
                num_online_cpus(),
                bogosum/(500000/HZ),
                (bogosum/(5000/HZ))%100);
  
 -      pr_debug("Before bogocount - setting activated=1.\n");
 +      pr_debug("Before bogocount - setting activated=1\n");
  }
  
  void __inquire_remote_apic(int apicid)
        int timeout;
        u32 status;
  
 -      printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
 +      pr_info("Inquiring remote APIC 0x%x...\n", apicid);
  
        for (i = 0; i < ARRAY_SIZE(regs); i++) {
 -              printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
 +              pr_info("... APIC 0x%x %s: ", apicid, names[i]);
  
                /*
                 * Wait for idle.
                 */
                status = safe_apic_wait_icr_idle();
                if (status)
 -                      printk(KERN_CONT
 -                             "a previous APIC delivery may have failed\n");
 +                      pr_cont("a previous APIC delivery may have failed\n");
  
                apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
  
                switch (status) {
                case APIC_ICR_RR_VALID:
                        status = apic_read(APIC_RRR);
 -                      printk(KERN_CONT "%08x\n", status);
 +                      pr_cont("%08x\n", status);
                        break;
                default:
 -                      printk(KERN_CONT "failed\n");
 +                      pr_cont("failed\n");
                }
        }
  }
@@@ -504,12 -510,12 +504,12 @@@ wakeup_secondary_cpu_via_nmi(int logica
                        apic_write(APIC_ESR, 0);
                accept_status = (apic_read(APIC_ESR) & 0xEF);
        }
 -      pr_debug("NMI sent.\n");
 +      pr_debug("NMI sent\n");
  
        if (send_status)
 -              printk(KERN_ERR "APIC never delivered???\n");
 +              pr_err("APIC never delivered???\n");
        if (accept_status)
 -              printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
 +              pr_err("APIC delivery error (%lx)\n", accept_status);
  
        return (send_status | accept_status);
  }
@@@ -531,7 -537,7 +531,7 @@@ wakeup_secondary_cpu_via_init(int phys_
                apic_read(APIC_ESR);
        }
  
 -      pr_debug("Asserting INIT.\n");
 +      pr_debug("Asserting INIT\n");
  
        /*
         * Turn INIT on target chip
  
        mdelay(10);
  
 -      pr_debug("Deasserting INIT.\n");
 +      pr_debug("Deasserting INIT\n");
  
        /* Target chip */
        /* Send IPI */
        /*
         * Run STARTUP IPI loop.
         */
 -      pr_debug("#startup loops: %d.\n", num_starts);
 +      pr_debug("#startup loops: %d\n", num_starts);
  
        for (j = 1; j <= num_starts; j++) {
 -              pr_debug("Sending STARTUP #%d.\n", j);
 +              pr_debug("Sending STARTUP #%d\n", j);
                if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
                        apic_write(APIC_ESR, 0);
                apic_read(APIC_ESR);
 -              pr_debug("After apic_write.\n");
 +              pr_debug("After apic_write\n");
  
                /*
                 * STARTUP IPI
                 */
                udelay(300);
  
 -              pr_debug("Startup point 1.\n");
 +              pr_debug("Startup point 1\n");
  
                pr_debug("Waiting for send to finish...\n");
                send_status = safe_apic_wait_icr_idle();
                if (send_status || accept_status)
                        break;
        }
 -      pr_debug("After Startup.\n");
 +      pr_debug("After Startup\n");
  
        if (send_status)
 -              printk(KERN_ERR "APIC never delivered???\n");
 +              pr_err("APIC never delivered???\n");
        if (accept_status)
 -              printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
 +              pr_err("APIC delivery error (%lx)\n", accept_status);
  
        return (send_status | accept_status);
  }
@@@ -638,11 -644,11 +638,11 @@@ static void __cpuinit announce_cpu(int 
        if (system_state == SYSTEM_BOOTING) {
                if (node != current_node) {
                        if (current_node > (-1))
 -                              pr_cont(" Ok.\n");
 +                              pr_cont(" OK\n");
                        current_node = node;
                        pr_info("Booting Node %3d, Processors ", node);
                }
 -              pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
 +              pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : "");
                return;
        } else
                pr_info("Booting Node %d Processor %d APIC 0x%x\n",
@@@ -722,9 -728,9 +722,9 @@@ static int __cpuinit do_boot_cpu(int ap
                /*
                 * allow APs to start initializing.
                 */
 -              pr_debug("Before Callout %d.\n", cpu);
 +              pr_debug("Before Callout %d\n", cpu);
                cpumask_set_cpu(cpu, cpu_callout_mask);
 -              pr_debug("After Callout %d.\n", cpu);
 +              pr_debug("After Callout %d\n", cpu);
  
                /*
                 * Wait 5s total for a response
                                pr_err("CPU%d: Stuck ??\n", cpu);
                        else
                                /* trampoline code not run */
 -                              pr_err("CPU%d: Not responding.\n", cpu);
 +                              pr_err("CPU%d: Not responding\n", cpu);
                        if (apic->inquire_remote_apic)
                                apic->inquire_remote_apic(apicid);
                }
@@@ -797,7 -803,7 +797,7 @@@ int __cpuinit native_cpu_up(unsigned in
        if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
            !physid_isset(apicid, phys_cpu_present_map) ||
            !apic->apic_id_valid(apicid)) {
 -              printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
 +              pr_err("%s: bad cpu %d\n", __func__, cpu);
                return -EINVAL;
        }
  
@@@ -878,8 -884,9 +878,8 @@@ static int __init smp_sanity_check(unsi
                unsigned int cpu;
                unsigned nr;
  
 -              printk(KERN_WARNING
 -                     "More than 8 CPUs detected - skipping them.\n"
 -                     "Use CONFIG_X86_BIGSMP.\n");
 +              pr_warn("More than 8 CPUs detected - skipping them\n"
 +                      "Use CONFIG_X86_BIGSMP\n");
  
                nr = 0;
                for_each_present_cpu(cpu) {
  #endif
  
        if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
 -              printk(KERN_WARNING
 -                      "weird, boot CPU (#%d) not listed by the BIOS.\n",
 +              pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
                        hard_smp_processor_id());
  
                physid_set(hard_smp_processor_id(), phys_cpu_present_map);
         */
        if (!smp_found_config && !acpi_lapic) {
                preempt_enable();
 -              printk(KERN_NOTICE "SMP motherboard not detected.\n");
 +              pr_notice("SMP motherboard not detected\n");
                disable_smp();
                if (APIC_init_uniprocessor())
 -                      printk(KERN_NOTICE "Local APIC not detected."
 -                                         " Using dummy APIC emulation.\n");
 +                      pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
                return -1;
        }
  
         * CPU too, but we do it for the sake of robustness anyway.
         */
        if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
 -              printk(KERN_NOTICE
 -                      "weird, boot CPU (#%d) not listed by the BIOS.\n",
 -                      boot_cpu_physical_apicid);
 +              pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
 +                        boot_cpu_physical_apicid);
                physid_set(hard_smp_processor_id(), phys_cpu_present_map);
        }
        preempt_enable();
                if (!disable_apic) {
                        pr_err("BIOS bug, local APIC #%d not detected!...\n",
                                boot_cpu_physical_apicid);
 -                      pr_err("... forcing use of dummy APIC emulation."
 -                              "(tell your hw vendor)\n");
 +                      pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
                }
                smpboot_clear_io_apic();
                disable_ioapic_support();
         * If SMP should be disabled, then really disable it!
         */
        if (!max_cpus) {
 -              printk(KERN_INFO "SMP mode deactivated.\n");
 +              pr_info("SMP mode deactivated\n");
                smpboot_clear_io_apic();
  
                connect_bsp_APIC();
@@@ -1003,7 -1014,7 +1003,7 @@@ void __init native_smp_prepare_cpus(uns
  
  
        if (smp_sanity_check(max_cpus) < 0) {
 -              printk(KERN_INFO "SMP disabled\n");
 +              pr_info("SMP disabled\n");
                disable_smp();
                goto out;
        }
         * Set up local APIC timer on boot CPU.
         */
  
 -      printk(KERN_INFO "CPU%d: ", 0);
 +      pr_info("CPU%d: ", 0);
        print_cpu_info(&cpu_data(0));
        x86_init.timers.setup_percpu_clockev();
  
@@@ -1091,7 -1102,7 +1091,7 @@@ void __init native_smp_prepare_boot_cpu
  
  void __init native_smp_cpus_done(unsigned int max_cpus)
  {
 -      pr_debug("Boot done.\n");
 +      pr_debug("Boot done\n");
  
        nmi_selftest();
        impress_friends();
@@@ -1152,7 -1163,8 +1152,7 @@@ __init void prefill_possible_map(void
  
        /* nr_cpu_ids could be reduced via nr_cpus= */
        if (possible > nr_cpu_ids) {
 -              printk(KERN_WARNING
 -                      "%d Processors exceeds NR_CPUS limit of %d\n",
 +              pr_warn("%d Processors exceeds NR_CPUS limit of %d\n",
                        possible, nr_cpu_ids);
                possible = nr_cpu_ids;
        }
        if (!setup_max_cpus)
  #endif
        if (possible > i) {
 -              printk(KERN_WARNING
 -                      "%d Processors exceeds max_cpus limit of %u\n",
 +              pr_warn("%d Processors exceeds max_cpus limit of %u\n",
                        possible, setup_max_cpus);
                possible = i;
        }
  
 -      printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
 +      pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
                possible, max_t(int, possible - num_processors, 0));
  
        for (i = 0; i < possible; i++)
index 71b5d5a07d7bbd7c26a5f9ae99cf5f0e8cacacbf,f1bef8e1d633ba81a02baf053b83b6e7bcd9126c..b8b3a37c80cd75e96559e67876206ad603b53741
@@@ -1,7 -1,7 +1,7 @@@
  /*
   *    SGI UltraViolet TLB flush routines.
   *
 - *    (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI.
 + *    (c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.
   *
   *    This code is released under the GNU General Public License version 2 or
   *    later.
@@@ -38,7 -38,8 +38,7 @@@ static int timeout_base_ns[] = 
  
  static int timeout_us;
  static int nobau;
 -static int baudisabled;
 -static spinlock_t disable_lock;
 +static int nobau_perm;
  static cycles_t congested_cycles;
  
  /* tunables: */
@@@ -46,13 -47,12 +46,13 @@@ static int max_concurr             = MAX_BAU_CONCU
  static int max_concurr_const  = MAX_BAU_CONCURRENT;
  static int plugged_delay      = PLUGGED_DELAY;
  static int plugsb4reset               = PLUGSB4RESET;
 +static int giveup_limit               = GIVEUP_LIMIT;
  static int timeoutsb4reset    = TIMEOUTSB4RESET;
  static int ipi_reset_limit    = IPI_RESET_LIMIT;
  static int complete_threshold = COMPLETE_THRESHOLD;
  static int congested_respns_us        = CONGESTED_RESPONSE_US;
  static int congested_reps     = CONGESTED_REPS;
 -static int congested_period   = CONGESTED_PERIOD;
 +static int disabled_period    = DISABLED_PERIOD;
  
  static struct tunables tunables[] = {
        {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
@@@ -63,8 -63,7 +63,8 @@@
        {&complete_threshold, COMPLETE_THRESHOLD},
        {&congested_respns_us, CONGESTED_RESPONSE_US},
        {&congested_reps, CONGESTED_REPS},
 -      {&congested_period, CONGESTED_PERIOD}
 +      {&disabled_period, DISABLED_PERIOD},
 +      {&giveup_limit, GIVEUP_LIMIT}
  };
  
  static struct dentry *tunables_dir;
@@@ -121,40 -120,6 +121,40 @@@ static DEFINE_PER_CPU(struct ptc_stats
  static DEFINE_PER_CPU(struct bau_control, bau_control);
  static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
  
 +static void
 +set_bau_on(void)
 +{
 +      int cpu;
 +      struct bau_control *bcp;
 +
 +      if (nobau_perm) {
 +              pr_info("BAU not initialized; cannot be turned on\n");
 +              return;
 +      }
 +      nobau = 0;
 +      for_each_present_cpu(cpu) {
 +              bcp = &per_cpu(bau_control, cpu);
 +              bcp->nobau = 0;
 +      }
 +      pr_info("BAU turned on\n");
 +      return;
 +}
 +
 +static void
 +set_bau_off(void)
 +{
 +      int cpu;
 +      struct bau_control *bcp;
 +
 +      nobau = 1;
 +      for_each_present_cpu(cpu) {
 +              bcp = &per_cpu(bau_control, cpu);
 +              bcp->nobau = 1;
 +      }
 +      pr_info("BAU turned off\n");
 +      return;
 +}
 +
  /*
   * Determine the first node on a uvhub. 'Nodes' are used for kernel
   * memory allocation.
@@@ -313,7 -278,7 +313,7 @@@ static void bau_process_message(struct 
                 * Both sockets dump their completed count total into
                 * the message's count.
                 */
 -              smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
 +              *sp = 0;
                asp = (struct atomic_short *)&msg->acknowledge_count;
                msg_ack_count = atom_asr(socket_ack_count, asp);
  
@@@ -526,15 -491,16 +526,15 @@@ static int uv1_wait_completion(struct b
  }
  
  /*
 - * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
 + * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
 + * But not currently used.
   */
  static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
  {
        unsigned long descriptor_status;
 -      unsigned long descriptor_status2;
  
 -      descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
 -      descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL;
 -      descriptor_status = (descriptor_status << 1) | descriptor_status2;
 +      descriptor_status =
 +              ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
        return descriptor_status;
  }
  
@@@ -565,11 -531,87 +565,11 @@@ int normal_busy(struct bau_control *bcp
   */
  int handle_uv2_busy(struct bau_control *bcp)
  {
 -      int busy_one = bcp->using_desc;
 -      int normal = bcp->uvhub_cpu;
 -      int selected = -1;
 -      int i;
 -      unsigned long descriptor_status;
 -      unsigned long status;
 -      int mmr_offset;
 -      struct bau_desc *bau_desc_old;
 -      struct bau_desc *bau_desc_new;
 -      struct bau_control *hmaster = bcp->uvhub_master;
        struct ptc_stats *stat = bcp->statp;
 -      cycles_t ttm;
  
        stat->s_uv2_wars++;
 -      spin_lock(&hmaster->uvhub_lock);
 -      /* try for the original first */
 -      if (busy_one != normal) {
 -              if (!normal_busy(bcp))
 -                      selected = normal;
 -      }
 -      if (selected < 0) {
 -              /* can't use the normal, select an alternate */
 -              mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
 -              descriptor_status = read_lmmr(mmr_offset);
 -
 -              /* scan available descriptors 32-63 */
 -              for (i = 0; i < UV_CPUS_PER_AS; i++) {
 -                      if ((hmaster->inuse_map & (1 << i)) == 0) {
 -                              status = ((descriptor_status >>
 -                                              (i * UV_ACT_STATUS_SIZE)) &
 -                                              UV_ACT_STATUS_MASK) << 1;
 -                              if (status != UV2H_DESC_BUSY) {
 -                                      selected = i + UV_CPUS_PER_AS;
 -                                      break;
 -                              }
 -                      }
 -              }
 -      }
 -
 -      if (busy_one != normal)
 -              /* mark the busy alternate as not in-use */
 -              hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS));
 -
 -      if (selected >= 0) {
 -              /* switch to the selected descriptor */
 -              if (selected != normal) {
 -                      /* set the selected alternate as in-use */
 -                      hmaster->inuse_map |=
 -                                      (1 << (selected - UV_CPUS_PER_AS));
 -                      if (selected > stat->s_uv2_wars_hw)
 -                              stat->s_uv2_wars_hw = selected;
 -              }
 -              bau_desc_old = bcp->descriptor_base;
 -              bau_desc_old += (ITEMS_PER_DESC * busy_one);
 -              bcp->using_desc = selected;
 -              bau_desc_new = bcp->descriptor_base;
 -              bau_desc_new += (ITEMS_PER_DESC * selected);
 -              *bau_desc_new = *bau_desc_old;
 -      } else {
 -              /*
 -               * All are busy. Wait for the normal one for this cpu to
 -               * free up.
 -               */
 -              stat->s_uv2_war_waits++;
 -              spin_unlock(&hmaster->uvhub_lock);
 -              ttm = get_cycles();
 -              do {
 -                      cpu_relax();
 -              } while (normal_busy(bcp));
 -              spin_lock(&hmaster->uvhub_lock);
 -              /* switch to the original descriptor */
 -              bcp->using_desc = normal;
 -              bau_desc_old = bcp->descriptor_base;
 -              bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc);
 -              bcp->using_desc = (ITEMS_PER_DESC * normal);
 -              bau_desc_new = bcp->descriptor_base;
 -              bau_desc_new += (ITEMS_PER_DESC * normal);
 -              *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */
 -      }
 -      spin_unlock(&hmaster->uvhub_lock);
 -      return FLUSH_RETRY_BUSYBUG;
 +      bcp->busy = 1;
 +      return FLUSH_GIVEUP;
  }
  
  static int uv2_wait_completion(struct bau_desc *bau_desc,
  {
        unsigned long descriptor_stat;
        cycles_t ttm;
 -      int desc = bcp->using_desc;
 +      int desc = bcp->uvhub_cpu;
        long busy_reps = 0;
        struct ptc_stats *stat = bcp->statp;
  
  
        /* spin on the status MMR, waiting for it to go idle */
        while (descriptor_stat != UV2H_DESC_IDLE) {
 -              /*
 -               * Our software ack messages may be blocked because
 -               * there are no swack resources available.  As long
 -               * as none of them has timed out hardware will NACK
 -               * our message and its state will stay IDLE.
 -               */
 -              if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
 -                  (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
 +              if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) {
 +                      /*
 +                       * A h/w bug on the destination side may
 +                       * have prevented the message being marked
 +                       * pending, thus it doesn't get replied to
 +                       * and gets continually nacked until it times
 +                       * out with a SOURCE_TIMEOUT.
 +                       */
                        stat->s_stimeout++;
                        return FLUSH_GIVEUP;
 -              } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) {
 -                      stat->s_strongnacks++;
 -                      bcp->conseccompletes = 0;
 -                      return FLUSH_GIVEUP;
                } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
 +                      ttm = get_cycles();
 +
 +                      /*
 +                       * Our retries may be blocked by all destination
 +                       * swack resources being consumed, and a timeout
 +                       * pending.  In that case hardware returns the
 +                       * ERROR that looks like a destination timeout.
 +                       * Without using the extended status we have to
 +                       * deduce from the short time that this was a
 +                       * strong nack.
 +                       */
 +                      if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
 +                              bcp->conseccompletes = 0;
 +                              stat->s_plugged++;
 +                              /* FLUSH_RETRY_PLUGGED causes hang on boot */
 +                              return FLUSH_GIVEUP;
 +                      }
                        stat->s_dtimeout++;
                        bcp->conseccompletes = 0;
 -                      return FLUSH_RETRY_TIMEOUT;
 +                      /* FLUSH_RETRY_TIMEOUT causes hang on boot */
 +                      return FLUSH_GIVEUP;
                } else {
                        busy_reps++;
                        if (busy_reps > 1000000) {
                                busy_reps = 0;
                                ttm = get_cycles();
                                if ((ttm - bcp->send_message) >
 -                                      (bcp->clocks_per_100_usec)) {
 +                                              bcp->timeout_interval)
                                        return handle_uv2_busy(bcp);
 -                              }
                        }
                        /*
                         * descriptor_stat is still BUSY
@@@ -650,7 -679,7 +650,7 @@@ static int wait_completion(struct bau_d
  {
        int right_shift;
        unsigned long mmr_offset;
 -      int desc = bcp->using_desc;
 +      int desc = bcp->uvhub_cpu;
  
        if (desc < UV_CPUS_PER_AS) {
                mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@@ -729,31 -758,33 +729,31 @@@ static void destination_timeout(struct 
  }
  
  /*
 - * Completions are taking a very long time due to a congested numalink
 - * network.
 + * Stop all cpus on a uvhub from using the BAU for a period of time.
 + * This is reversed by check_enable.
   */
 -static void disable_for_congestion(struct bau_control *bcp,
 -                                      struct ptc_stats *stat)
 +static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
  {
 -      /* let only one cpu do this disabling */
 -      spin_lock(&disable_lock);
 -
 -      if (!baudisabled && bcp->period_requests &&
 -          ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
 -              int tcpu;
 -              struct bau_control *tbcp;
 -              /* it becomes this cpu's job to turn on the use of the
 -                 BAU again */
 -              baudisabled = 1;
 -              bcp->set_bau_off = 1;
 -              bcp->set_bau_on_time = get_cycles();
 -              bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
 +      int tcpu;
 +      struct bau_control *tbcp;
 +      struct bau_control *hmaster;
 +      cycles_t tm1;
 +
 +      hmaster = bcp->uvhub_master;
 +      spin_lock(&hmaster->disable_lock);
 +      if (!bcp->baudisabled) {
                stat->s_bau_disabled++;
 +              tm1 = get_cycles();
                for_each_present_cpu(tcpu) {
                        tbcp = &per_cpu(bau_control, tcpu);
 -                      tbcp->baudisabled = 1;
 +                      if (tbcp->uvhub_master == hmaster) {
 +                              tbcp->baudisabled = 1;
 +                              tbcp->set_bau_on_time =
 +                                      tm1 + bcp->disabled_period;
 +                      }
                }
        }
 -
 -      spin_unlock(&disable_lock);
 +      spin_unlock(&hmaster->disable_lock);
  }
  
  static void count_max_concurr(int stat, struct bau_control *bcp,
@@@ -784,30 -815,16 +784,30 @@@ static void record_send_stats(cycles_t 
                        bcp->period_requests++;
                        bcp->period_time += elapsed;
                        if ((elapsed > congested_cycles) &&
 -                          (bcp->period_requests > bcp->cong_reps))
 -                              disable_for_congestion(bcp, stat);
 +                          (bcp->period_requests > bcp->cong_reps) &&
 +                          ((bcp->period_time / bcp->period_requests) >
 +                                                      congested_cycles)) {
 +                              stat->s_congested++;
 +                              disable_for_period(bcp, stat);
 +                      }
                }
        } else
                stat->s_requestor--;
  
        if (completion_status == FLUSH_COMPLETE && try > 1)
                stat->s_retriesok++;
 -      else if (completion_status == FLUSH_GIVEUP)
 +      else if (completion_status == FLUSH_GIVEUP) {
                stat->s_giveup++;
 +              if (get_cycles() > bcp->period_end)
 +                      bcp->period_giveups = 0;
 +              bcp->period_giveups++;
 +              if (bcp->period_giveups == 1)
 +                      bcp->period_end = get_cycles() + bcp->disabled_period;
 +              if (bcp->period_giveups > bcp->giveup_limit) {
 +                      disable_for_period(bcp, stat);
 +                      stat->s_giveuplimit++;
 +              }
 +      }
  }
  
  /*
@@@ -851,8 -868,7 +851,8 @@@ static void handle_cmplt(int completion
   * Returns 1 if it gives up entirely and the original cpu mask is to be
   * returned to the kernel.
   */
 -int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
 +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
 +      struct bau_desc *bau_desc)
  {
        int seq_number = 0;
        int completion_stat = 0;
        struct bau_control *hmaster = bcp->uvhub_master;
        struct uv1_bau_msg_header *uv1_hdr = NULL;
        struct uv2_bau_msg_header *uv2_hdr = NULL;
 -      struct bau_desc *bau_desc;
  
 -      if (bcp->uvhub_version == 1)
 +      if (bcp->uvhub_version == 1) {
 +              uv1 = 1;
                uv1_throttle(hmaster, stat);
 +      }
  
        while (hmaster->uvhub_quiesce)
                cpu_relax();
  
        time1 = get_cycles();
 +      if (uv1)
 +              uv1_hdr = &bau_desc->header.uv1_hdr;
 +      else
 +              uv2_hdr = &bau_desc->header.uv2_hdr;
 +
        do {
 -              bau_desc = bcp->descriptor_base;
 -              bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
 -              if (bcp->uvhub_version == 1) {
 -                      uv1 = 1;
 -                      uv1_hdr = &bau_desc->header.uv1_hdr;
 -              } else
 -                      uv2_hdr = &bau_desc->header.uv2_hdr;
 -              if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) {
 +              if (try == 0) {
                        if (uv1)
                                uv1_hdr->msg_type = MSG_REGULAR;
                        else
                        uv1_hdr->sequence = seq_number;
                else
                        uv2_hdr->sequence = seq_number;
 -              index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc;
 +              index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
                bcp->send_message = get_cycles();
  
                write_mmr_activation(index);
  
                try++;
                completion_stat = wait_completion(bau_desc, bcp, try);
 -              /* UV2: wait_completion() may change the bcp->using_desc */
  
                handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
  
                if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
                        bcp->ipi_attempts = 0;
 +                      stat->s_overipilimit++;
                        completion_stat = FLUSH_GIVEUP;
                        break;
                }
                cpu_relax();
        } while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
 -               (completion_stat == FLUSH_RETRY_BUSYBUG) ||
                 (completion_stat == FLUSH_RETRY_TIMEOUT));
  
        time2 = get_cycles();
  }
  
  /*
 - * The BAU is disabled. When the disabled time period has expired, the cpu
 - * that disabled it must re-enable it.
 - * Return 0 if it is re-enabled for all cpus.
 + * The BAU is disabled for this uvhub. When the disabled time period has
 + * expired re-enable it.
 + * Return 0 if it is re-enabled for all cpus on this uvhub.
   */
  static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
  {
        int tcpu;
        struct bau_control *tbcp;
 +      struct bau_control *hmaster;
  
 -      if (bcp->set_bau_off) {
 -              if (get_cycles() >= bcp->set_bau_on_time) {
 -                      stat->s_bau_reenabled++;
 -                      baudisabled = 0;
 -                      for_each_present_cpu(tcpu) {
 -                              tbcp = &per_cpu(bau_control, tcpu);
 +      hmaster = bcp->uvhub_master;
 +      spin_lock(&hmaster->disable_lock);
 +      if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
 +              stat->s_bau_reenabled++;
 +              for_each_present_cpu(tcpu) {
 +                      tbcp = &per_cpu(bau_control, tcpu);
 +                      if (tbcp->uvhub_master == hmaster) {
                                tbcp->baudisabled = 0;
                                tbcp->period_requests = 0;
                                tbcp->period_time = 0;
 +                              tbcp->period_giveups = 0;
                        }
 -                      return 0;
                }
 +              spin_unlock(&hmaster->disable_lock);
 +              return 0;
        }
 +      spin_unlock(&hmaster->disable_lock);
        return -1;
  }
  
@@@ -1055,8 -1068,8 +1055,8 @@@ static int set_distrib_bits(struct cpum
   * done.  The returned pointer is valid till preemption is re-enabled.
   */
  const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
-                               struct mm_struct *mm, unsigned long va,
-                               unsigned int cpu)
+                               struct mm_struct *mm, unsigned long start,
+                               unsigned end, unsigned int cpu)
  {
        int locals = 0;
        int remotes = 0;
        struct cpumask *flush_mask;
        struct ptc_stats *stat;
        struct bau_control *bcp;
 -
 -      /* kernel was booted 'nobau' */
 -      if (nobau)
 -              return cpumask;
 +      unsigned long descriptor_status;
 +      unsigned long status;
  
        bcp = &per_cpu(bau_control, cpu);
        stat = bcp->statp;
 +      stat->s_enters++;
 +
 +      if (bcp->nobau)
 +              return cpumask;
 +
 +      if (bcp->busy) {
 +              descriptor_status =
 +                      read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0);
 +              status = ((descriptor_status >> (bcp->uvhub_cpu *
 +                      UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1;
 +              if (status == UV2H_DESC_BUSY)
 +                      return cpumask;
 +              bcp->busy = 0;
 +      }
  
        /* bau was disabled due to slow response */
        if (bcp->baudisabled) {
 -              if (check_enable(bcp, stat))
 +              if (check_enable(bcp, stat)) {
 +                      stat->s_ipifordisabled++;
                        return cpumask;
 +              }
        }
  
        /*
                stat->s_ntargself++;
  
        bau_desc = bcp->descriptor_base;
 -      bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
 +      bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
        bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
        if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
                return NULL;
  
        record_send_statistics(stat, locals, hubs, remotes, bau_desc);
  
-       bau_desc->payload.address = va;
+       bau_desc->payload.address = start;
        bau_desc->payload.sending_cpu = cpu;
        /*
         * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
         * or 1 if it gave up and the original cpumask should be returned.
         */
 -      if (!uv_flush_send_and_wait(flush_mask, bcp))
 +      if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc))
                return NULL;
        else
                return cpumask;
  }
  
  /*
 - * Search the message queue for any 'other' message with the same software
 - * acknowledge resource bit vector.
 + * Search the message queue for any 'other' unprocessed message with the
 + * same software acknowledge resource bit vector as the 'msg' message.
   */
  struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
 -                      struct bau_control *bcp, unsigned char swack_vec)
 +                                         struct bau_control *bcp)
  {
        struct bau_pq_entry *msg_next = msg + 1;
 +      unsigned char swack_vec = msg->swack_vec;
  
        if (msg_next > bcp->queue_last)
                msg_next = bcp->queue_first;
 -      while ((msg_next->swack_vec != 0) && (msg_next != msg)) {
 -              if (msg_next->swack_vec == swack_vec)
 +      while (msg_next != msg) {
 +              if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) &&
 +                              (msg_next->swack_vec == swack_vec))
                        return msg_next;
                msg_next++;
                if (msg_next > bcp->queue_last)
@@@ -1168,30 -1165,32 +1168,30 @@@ void process_uv2_message(struct msg_des
                 * This message was assigned a swack resource, but no
                 * reserved acknowlegment is pending.
                 * The bug has prevented this message from setting the MMR.
 -               * And no other message has used the same sw_ack resource.
 -               * Do the requested shootdown but do not reply to the msg.
 -               * (the 0 means make no acknowledge)
                 */
 -              bau_process_message(mdp, bcp, 0);
 -              return;
 -      }
 -
 -      /*
 -       * Some message has set the MMR 'pending' bit; it might have been
 -       * another message.  Look for that message.
 -       */
 -      other_msg = find_another_by_swack(msg, bcp, msg->swack_vec);
 -      if (other_msg) {
 -              /* There is another.  Do not ack the current one. */
 -              bau_process_message(mdp, bcp, 0);
                /*
 -               * Let the natural processing of that message acknowledge
 -               * it. Don't get the processing of sw_ack's out of order.
 +               * Some message has set the MMR 'pending' bit; it might have
 +               * been another message.  Look for that message.
                 */
 -              return;
 +              other_msg = find_another_by_swack(msg, bcp);
 +              if (other_msg) {
 +                      /*
 +                       * There is another. Process this one but do not
 +                       * ack it.
 +                       */
 +                      bau_process_message(mdp, bcp, 0);
 +                      /*
 +                       * Let the natural processing of that other message
 +                       * acknowledge it. Don't get the processing of sw_ack's
 +                       * out of order.
 +                       */
 +                      return;
 +              }
        }
  
        /*
 -       * There is no other message using this sw_ack, so it is safe to
 -       * acknowledge it.
 +       * Either the MMR shows this one pending a reply or there is no
 +       * other message using this sw_ack, so it is safe to acknowledge it.
         */
        bau_process_message(mdp, bcp, 1);
  
@@@ -1296,8 -1295,7 +1296,8 @@@ static void __init enable_timeouts(void
                 */
                mmr_image |= (1L << SOFTACK_MSHIFT);
                if (is_uv2_hub()) {
 -                      mmr_image |= (1L << UV2_EXT_SHFT);
 +                      /* hw bug workaround; do not use extended status */
 +                      mmr_image &= ~(1L << UV2_EXT_SHFT);
                }
                write_mmr_misc_control(pnode, mmr_image);
        }
@@@ -1340,34 -1338,29 +1340,34 @@@ static inline unsigned long long usec_2
  static int ptc_seq_show(struct seq_file *file, void *data)
  {
        struct ptc_stats *stat;
 +      struct bau_control *bcp;
        int cpu;
  
        cpu = *(loff_t *)data;
        if (!cpu) {
                seq_printf(file,
 -                      "# cpu sent stime self locals remotes ncpus localhub ");
 +               "# cpu bauoff sent stime self locals remotes ncpus localhub ");
                seq_printf(file,
                        "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
                seq_printf(file,
 -                  "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok ");
 +                      "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries ");
 +              seq_printf(file,
 +                      "rok resetp resett giveup sto bz throt disable ");
                seq_printf(file,
 -                      "resetp resett giveup sto bz throt swack recv rtime ");
 +                      "enable wars warshw warwaits enters ipidis plugged ");
                seq_printf(file,
 -                      "all one mult none retry canc nocan reset rcan ");
 +                      "ipiover glim cong swack recv rtime all one mult ");
                seq_printf(file,
 -                      "disable enable wars warshw warwaits\n");
 +                      "none retry canc nocan reset rcan\n");
        }
        if (cpu < num_possible_cpus() && cpu_online(cpu)) {
 -              stat = &per_cpu(ptcstats, cpu);
 +              bcp = &per_cpu(bau_control, cpu);
 +              stat = bcp->statp;
                /* source side statistics */
                seq_printf(file,
 -                      "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
 -                         cpu, stat->s_requestor, cycles_2_us(stat->s_time),
 +                      "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
 +                         cpu, bcp->nobau, stat->s_requestor,
 +                         cycles_2_us(stat->s_time),
                           stat->s_ntargself, stat->s_ntarglocals,
                           stat->s_ntargremotes, stat->s_ntargcpu,
                           stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
                           stat->s_resets_plug, stat->s_resets_timeout,
                           stat->s_giveup, stat->s_stimeout,
                           stat->s_busy, stat->s_throttles);
 +              seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
 +                         stat->s_bau_disabled, stat->s_bau_reenabled,
 +                         stat->s_uv2_wars, stat->s_uv2_wars_hw,
 +                         stat->s_uv2_war_waits, stat->s_enters,
 +                         stat->s_ipifordisabled, stat->s_plugged,
 +                         stat->s_overipilimit, stat->s_giveuplimit,
 +                         stat->s_congested);
  
                /* destination side statistics */
                seq_printf(file,
 -                         "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
 +                      "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
                           read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
                           stat->d_requestee, cycles_2_us(stat->d_time),
                           stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
                           stat->d_nomsg, stat->d_retries, stat->d_canceled,
                           stat->d_nocanceled, stat->d_resets,
                           stat->d_rcanceled);
 -              seq_printf(file, "%ld %ld %ld %ld %ld\n",
 -                      stat->s_bau_disabled, stat->s_bau_reenabled,
 -                      stat->s_uv2_wars, stat->s_uv2_wars_hw,
 -                      stat->s_uv2_war_waits);
        }
        return 0;
  }
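
With the reworked header strings, each data line of the statistics file now begins with the cpu number and the per-cpu bauoff flag, followed by the source-side counters and, on the same line, the destination-side counters. Assuming the layout printed above (and that cycles_2_us() yields microseconds), a hypothetical userspace snippet that picks out the first few columns:

#include <stdio.h>

int main(void)
{
	/* one data line in the new layout: "cpu N bauoff sent stime ..." */
	const char *line = "cpu 3 0 12345 678 9 10 2 16 1 1 ...";
	int cpu, bauoff;
	long sent, stime_us;

	if (sscanf(line, "cpu %d %d %ld %ld", &cpu, &bauoff, &sent, &stime_us) == 4)
		printf("cpu %d: bau %s, %ld requests, %ld us\n",
		       cpu, bauoff ? "off" : "on", sent, stime_us);
	return 0;
}
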
@@@ -1411,14 -1401,13 +1411,14 @@@ static ssize_t tunables_read(struct fil
        char *buf;
        int ret;
  
 -      buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
 -              "max_concur plugged_delay plugsb4reset",
 -              "timeoutsb4reset ipi_reset_limit complete_threshold",
 -              "congested_response_us congested_reps congested_period",
 +      buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n",
 +              "max_concur plugged_delay plugsb4reset timeoutsb4reset",
 +              "ipi_reset_limit complete_threshold congested_response_us",
 +              "congested_reps disabled_period giveup_limit",
                max_concurr, plugged_delay, plugsb4reset,
                timeoutsb4reset, ipi_reset_limit, complete_threshold,
 -              congested_respns_us, congested_reps, congested_period);
 +              congested_respns_us, congested_reps, disabled_period,
 +              giveup_limit);
  
        if (!buf)
                return -ENOMEM;
@@@ -1449,14 -1438,6 +1449,14 @@@ static ssize_t ptc_proc_write(struct fi
                return -EFAULT;
        optstr[count - 1] = '\0';
  
 +      if (!strcmp(optstr, "on")) {
 +              set_bau_on();
 +              return count;
 +      } else if (!strcmp(optstr, "off")) {
 +              set_bau_off();
 +              return count;
 +      }
 +
        if (strict_strtol(optstr, 10, &input_arg) < 0) {
                printk(KERN_DEBUG "%s is invalid\n", optstr);
                return -EINVAL;
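
ptc_proc_write() now special-cases the strings "on" and "off" before falling back to parsing a numeric debug level. A small userspace sketch of that parsing order (strtol stands in for the kernel's strict_strtol; parse_ctl is an invented helper):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Accept "on", "off", or a signed decimal value, in that order. */
static int parse_ctl(const char *opt, long *level)
{
	char *end;

	if (!strcmp(opt, "on"))
		return 1;                       /* enable */
	if (!strcmp(opt, "off"))
		return 0;                       /* disable */
	errno = 0;
	*level = strtol(opt, &end, 10);
	if (errno || *end != '\0')
		return -EINVAL;                 /* reject trailing garbage */
	return 2;                               /* numeric debug value */
}

int main(void)
{
	long level = 0;
	const char *inputs[] = { "on", "off", "5", "bogus" };

	for (unsigned i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++)
		printf("%-6s -> %d (level %ld)\n",
		       inputs[i], parse_ctl(inputs[i], &level), level);
	return 0;
}
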
@@@ -1589,8 -1570,7 +1589,8 @@@ static ssize_t tunables_write(struct fi
                bcp->complete_threshold =       complete_threshold;
                bcp->cong_response_us =         congested_respns_us;
                bcp->cong_reps =                congested_reps;
 -              bcp->cong_period =              congested_period;
 +              bcp->disabled_period =          sec_2_cycles(disabled_period);
 +              bcp->giveup_limit =             giveup_limit;
        }
        return count;
  }
@@@ -1719,10 -1699,6 +1719,10 @@@ static void activation_descriptor_init(
                         *   fairness chaining multilevel count replied_to
                         */
                } else {
 +                      /*
 +                       * BIOS uses legacy mode, but UV2 hardware always
 +                       * uses native mode for selective broadcasts.
 +                       */
                        uv2_hdr = &bd2->header.uv2_hdr;
                        uv2_hdr->swack_flag =   1;
                        uv2_hdr->base_dest_nasid =
@@@ -1835,8 -1811,8 +1835,8 @@@ static int calculate_destination_timeou
                index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
                mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
                mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
 -              base = timeout_base_ns[index];
 -              ts_ns = base * mult1 * mult2;
 +              ts_ns = timeout_base_ns[index];
 +              ts_ns *= (mult1 * mult2);
                ret = ts_ns / 1000;
        } else {
                /* 4 bits  0/1 for 10/80us base, 3 bits of multiplier */
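
The rewritten computation above accumulates the product in the wide ts_ns variable before converting nanoseconds to microseconds. A worked example with invented table values and multipliers (the real ones come from UV MMR fields):

#include <stdio.h>

int main(void)
{
	/* invented values; the real table and multipliers come from UV MMRs */
	unsigned long long timeout_base_ns[] = { 0, 160, 1280, 10240, 81920 };
	int index = 3, mult1 = 10, mult2 = 16;

	unsigned long long ts_ns = timeout_base_ns[index];
	ts_ns *= (unsigned long long)mult1 * mult2;   /* stay in the wide type */
	printf("destination timeout: %llu us\n", ts_ns / 1000);   /* 1638 us */
	return 0;
}
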
@@@ -1860,8 -1836,6 +1860,8 @@@ static void __init init_per_cpu_tunable
        for_each_present_cpu(cpu) {
                bcp = &per_cpu(bau_control, cpu);
                bcp->baudisabled                = 0;
 +              if (nobau)
 +                      bcp->nobau              = 1;
                bcp->statp                      = &per_cpu(ptcstats, cpu);
                /* time interval to catch a hardware stay-busy bug */
                bcp->timeout_interval           = usec_2_cycles(2*timeout_us);
                bcp->complete_threshold         = complete_threshold;
                bcp->cong_response_us           = congested_respns_us;
                bcp->cong_reps                  = congested_reps;
 -              bcp->cong_period                = congested_period;
 -              bcp->clocks_per_100_usec =      usec_2_cycles(100);
 +              bcp->disabled_period =          sec_2_cycles(disabled_period);
 +              bcp->giveup_limit =             giveup_limit;
                spin_lock_init(&bcp->queue_lock);
                spin_lock_init(&bcp->uvhub_lock);
 +              spin_lock_init(&bcp->disable_lock);
        }
  }
  
@@@ -1999,6 -1972,7 +1999,6 @@@ static int scan_sock(struct socket_des
                }
                bcp->uvhub_master = *hmasterp;
                bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
 -              bcp->using_desc = bcp->uvhub_cpu;
                if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
                        printk(KERN_EMERG "%d cpus per uvhub invalid\n",
                                bcp->uvhub_cpu);
@@@ -2095,12 -2069,16 +2095,12 @@@ static int __init uv_bau_init(void
        if (!is_uv_system())
                return 0;
  
 -      if (nobau)
 -              return 0;
 -
        for_each_possible_cpu(cur_cpu) {
                mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
                zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
        }
  
        nuvhubs = uv_num_possible_blades();
 -      spin_lock_init(&disable_lock);
        congested_cycles = usec_2_cycles(congested_respns_us);
  
        uv_base_pnode = 0x7fffffff;
        enable_timeouts();
  
        if (init_per_cpu(nuvhubs, uv_base_pnode)) {
 -              nobau = 1;
 +              set_bau_off();
 +              nobau_perm = 1;
                return 0;
        }
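
Since uv_bau_init() no longer bails out when nobau is set, the BAU can be toggled at run time through the "on"/"off" writes added earlier, and a failed init_per_cpu() now latches a permanent-off state via nobau_perm. set_bau_on()/set_bau_off() are not shown in this diff, so the following is only a guessed shape: a toggle that a permanent-off latch can veto (all names here are stand-ins, not the kernel's implementation):

#include <stdbool.h>
#include <stdio.h>

static bool nobau;        /* current state: true = broadcasts disabled */
static bool nobau_perm;   /* latched when init fails; never re-enabled */

static void toy_set_bau_off(void)
{
	nobau = true;
}

static int toy_set_bau_on(void)
{
	if (nobau_perm) {
		fprintf(stderr, "BAU permanently disabled, ignoring 'on'\n");
		return -1;
	}
	nobau = false;
	return 0;
}

int main(void)
{
	toy_set_bau_off();
	nobau_perm = true;               /* pretend init_per_cpu() failed */
	printf("re-enable: %d, nobau=%d\n", toy_set_bau_on(), nobau);
	return 0;
}
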
  
diff --combined arch/x86/xen/mmu.c
index 27336dfcda8ef41e2d8874f20634d105a4084c97,39ed56789f680698fadd07a4416dc4497533ccd8..b65a76133f4f9b4f51dc426021975d7a5427191e
@@@ -308,20 -308,8 +308,20 @@@ static bool xen_batched_set_pte(pte_t *
  
  static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
  {
 -      if (!xen_batched_set_pte(ptep, pteval))
 -              native_set_pte(ptep, pteval);
 +      if (!xen_batched_set_pte(ptep, pteval)) {
 +              /*
 +               * Could call native_set_pte() here and trap and
 +               * emulate the PTE write but with 32-bit guests this
 +               * needs two traps (one for each of the two 32-bit
 +               * words in the PTE) so do one hypercall directly
 +               * instead.
 +               */
 +              struct mmu_update u;
 +
 +              u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 +              u.val = pte_val_ma(pteval);
 +              HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 +      }
  }
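
The comment explains the trade-off: letting native_set_pte() fault would cost a 32-bit guest two trapped-and-emulated stores per 64-bit PTE, while one explicit MMU_NORMAL_PT_UPDATE hypercall does the whole update. A userspace model that merely counts the hypervisor exits each strategy would take (no real hypercalls involved):

#include <stdint.h>
#include <stdio.h>

static int write_pte_by_trapping(uint64_t val)
{
	/* a 32-bit PAE guest stores the PTE as two 32-bit halves,
	 * and each protected store is trapped and emulated */
	int exits = 0;
	uint32_t lo = (uint32_t)val, hi = (uint32_t)(val >> 32);

	exits++;	/* emulate store of lo */
	exits++;	/* emulate store of hi */
	(void)lo; (void)hi;
	return exits;
}

static int write_pte_by_hypercall(uint64_t val)
{
	(void)val;
	return 1;	/* one explicit mmu_update call replaces both traps */
}

int main(void)
{
	uint64_t pte = 0x8000000000000067ULL;

	printf("trap-and-emulate: %d exits, hypercall: %d exit\n",
	       write_pte_by_trapping(pte), write_pte_by_hypercall(pte));
	return 0;
}
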
  
  static void xen_set_pte(pte_t *ptep, pte_t pteval)
@@@ -1256,7 -1244,8 +1256,8 @@@ static void xen_flush_tlb_single(unsign
  }
  
  static void xen_flush_tlb_others(const struct cpumask *cpus,
-                                struct mm_struct *mm, unsigned long va)
+                                struct mm_struct *mm, unsigned long start,
+                                unsigned long end)
  {
        struct {
                struct mmuext_op op;
        } *args;
        struct multicall_space mcs;
  
-       trace_xen_mmu_flush_tlb_others(cpus, mm, va);
+       trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
  
        if (cpumask_empty(cpus))
                return;         /* nothing to do */
        cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
        cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
  
-       if (va == TLB_FLUSH_ALL) {
-               args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-       } else {
+       args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+       if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
                args->op.cmd = MMUEXT_INVLPG_MULTI;
-               args->op.arg1.linear_addr = va;
+               args->op.arg1.linear_addr = start;
        }
  
        MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
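
The rewritten selection defaults to MMUEXT_TLB_FLUSH_MULTI and only downgrades to MMUEXT_INVLPG_MULTI when the caller passed a concrete range no larger than one page. A small sketch of that decision, using invented enum names in place of the mmuext op codes:

#include <stdio.h>

#define PAGE_SIZE      4096UL
#define TLB_FLUSH_ALL  (~0UL)

enum toy_flush_op { FLUSH_TLB_MULTI, INVLPG_MULTI };

/* Pick the cheaper remote-flush operation for a [start, end) range:
 * single-page ranges use INVLPG, anything larger (or a flush-all
 * request) flushes the whole TLB. */
static enum toy_flush_op pick_op(unsigned long start, unsigned long end)
{
	if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE)
		return INVLPG_MULTI;
	return FLUSH_TLB_MULTI;
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_op(0x1000, 0x2000),            /* one page  -> INVLPG_MULTI */
	       pick_op(0x1000, 0x9000),            /* 8 pages   -> FLUSH_TLB_MULTI */
	       pick_op(TLB_FLUSH_ALL, 0));         /* flush all -> FLUSH_TLB_MULTI */
	return 0;
}
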
@@@ -1428,28 -1416,13 +1428,28 @@@ static pte_t __init mask_rw_pte(pte_t *
  }
  #endif /* CONFIG_X86_64 */
  
 -/* Init-time set_pte while constructing initial pagetables, which
 -   doesn't allow RO pagetable pages to be remapped RW */
 +/*
 + * Init-time set_pte while constructing initial pagetables, which
 + * doesn't allow RO page table pages to be remapped RW.
 + *
 + * If there is no MFN for this PFN then this page is initially
 + * ballooned out so clear the PTE (as in decrease_reservation() in
 + * drivers/xen/balloon.c).
 + *
 + * Many of these PTE updates are done on unpinned and writable pages
 + * and doing a hypercall for these is unnecessary and expensive.  At
 + * this point it is not possible to tell if a page is pinned or not,
 + * so always write the PTE directly and rely on Xen trapping and
 + * emulating any updates as necessary.
 + */
  static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
  {
 -      pte = mask_rw_pte(ptep, pte);
 +      if (pte_mfn(pte) != INVALID_P2M_ENTRY)
 +              pte = mask_rw_pte(ptep, pte);
 +      else
 +              pte = __pte_ma(0);
  
 -      xen_set_pte(ptep, pte);
 +      native_set_pte(ptep, pte);
  }
  
  static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
diff --combined mm/memory.c
index 2466d1250231f3e2405429ea4de4a97c597d0074,32c99433cfdf7e9c2c5eb9d6ec9ac27350470e16..91f69459d3e8b3bf8075574e145788b0b40c4a6f
@@@ -206,6 -206,8 +206,8 @@@ void tlb_gather_mmu(struct mmu_gather *
        tlb->mm = mm;
  
        tlb->fullmm     = fullmm;
+       tlb->start      = -1UL;
+       tlb->end        = 0;
        tlb->need_flush = 0;
        tlb->fast_mode  = (num_possible_cpus() == 1);
        tlb->local.next = NULL;
@@@ -248,6 -250,8 +250,8 @@@ void tlb_finish_mmu(struct mmu_gather *
  {
        struct mmu_gather_batch *batch, *next;
  
+       tlb->start = start;
+       tlb->end   = end;
        tlb_flush_mmu(tlb);
  
        /* keep the page table cache within bounds */
@@@ -1204,6 -1208,11 +1208,11 @@@ again
         */
        if (force_flush) {
                force_flush = 0;
+ #ifdef HAVE_GENERIC_MMU_GATHER
+               tlb->start = addr;
+               tlb->end = end;
+ #endif
                tlb_flush_mmu(tlb);
                if (addr != end)
                        goto again;
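
Together, these mm/memory.c hunks thread a (start, end) range through the generic mmu_gather: tlb_gather_mmu() starts with an empty range (start = -1UL, end = 0), and tlb_finish_mmu() plus the force_flush path fill it in before calling tlb_flush_mmu(), so the arch flush can target just that range. A simplified userspace model of the bookkeeping (the kernel sets the range directly at the call sites shown above; the min/max accumulation here is only illustrative):

#include <stdio.h>

#define NO_RANGE_START (~0UL)

struct toy_gather {
	unsigned long start;
	unsigned long end;
};

static void toy_gather_init(struct toy_gather *tlb)
{
	tlb->start = NO_RANGE_START;    /* empty range: start above end */
	tlb->end = 0;
}

static void toy_gather_note(struct toy_gather *tlb, unsigned long addr,
			    unsigned long size)
{
	if (addr < tlb->start)
		tlb->start = addr;
	if (addr + size > tlb->end)
		tlb->end = addr + size;
}

static void toy_gather_flush(const struct toy_gather *tlb)
{
	if (tlb->start < tlb->end)
		printf("flush range 0x%lx - 0x%lx\n", tlb->start, tlb->end);
	else
		printf("nothing to flush\n");
}

int main(void)
{
	struct toy_gather tlb;

	toy_gather_init(&tlb);
	toy_gather_note(&tlb, 0x7f0000001000UL, 0x2000);
	toy_gather_note(&tlb, 0x7f0000004000UL, 0x1000);
	toy_gather_flush(&tlb);      /* 0x7f0000001000 - 0x7f0000005000 */
	return 0;
}
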
@@@ -1225,15 -1234,7 +1234,15 @@@ static inline unsigned long zap_pmd_ran
                next = pmd_addr_end(addr, end);
                if (pmd_trans_huge(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE) {
 -                              VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
 +#ifdef CONFIG_DEBUG_VM
 +                              if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
 +                                      pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
 +                                              __func__, addr, end,
 +                                              vma->vm_start,
 +                                              vma->vm_end);
 +                                      BUG();
 +                              }
 +#endif
                                split_huge_page_pmd(vma->vm_mm, pmd);
                        } else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
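
The CONFIG_DEBUG_VM block above replaces a silent VM_BUG_ON() with a message that records the faulting range and vma bounds before calling BUG(), so the eventual crash dump carries enough context to debug. A userspace analogue of that print-then-die pattern (GNU-style ##__VA_ARGS__, as used throughout the kernel; the macro name is invented):

#include <stdio.h>
#include <stdlib.h>

#define verbose_bug_on(cond, fmt, ...)					\
	do {								\
		if (cond) {						\
			fprintf(stderr, "%s: " fmt "\n",		\
				__func__, ##__VA_ARGS__);		\
			abort();					\
		}							\
	} while (0)

int main(void)
{
	unsigned long addr = 0x7f0000001000UL, end = 0x7f0000003000UL;
	int locked = 1;	/* flip to 0 to see the diagnostic fire */

	verbose_bug_on(!locked, "mmap_sem is unlocked! addr=0x%lx end=0x%lx",
		       addr, end);
	printf("check passed\n");
	return 0;
}
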
@@@ -1374,7 -1375,7 +1383,7 @@@ void unmap_vmas(struct mmu_gather *tlb
  /**
   * zap_page_range - remove user pages in a given range
   * @vma: vm_area_struct holding the applicable pages
 - * @address: starting address of pages to zap
 + * @start: starting address of pages to zap
   * @size: number of bytes to zap
   * @details: details of nonlinear truncation or shared cache invalidation
   *