Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 26 Jul 2012 20:17:17 +0000 (13:17 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 26 Jul 2012 20:17:17 +0000 (13:17 -0700)
Pull x86/mm changes from Peter Anvin:
 "The big change here is the patchset by Alex Shi to use INVLPG to flush
  only the affected pages when we only need to flush a small page range.

  It also removes the special INVALIDATE_TLB_VECTOR interrupts (32
  vectors!) and replaces them with an ordinary IPI function call."

Fix up trivial conflicts in arch/x86/include/asm/apic.h (added code next
to changed line)

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/tlb: Fix build warning and crash when building for !SMP
  x86/tlb: do flush_tlb_kernel_range by 'invlpg'
  x86/tlb: replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR
  x86/tlb: enable tlb flush range support for x86
  mm/mmu_gather: enable tlb flush range in generic mmu_gather
  x86/tlb: add tlb_flushall_shift knob into debugfs
  x86/tlb: add tlb_flushall_shift for specific CPU
  x86/tlb: fall back to flush all when meet a THP large page
  x86/flush_tlb: try flush_tlb_single one by one in flush_tlb_range
  x86/tlb_info: get last level TLB entry number of CPU
  x86: Add read_mostly declaration/definition to variables from smp.h
  x86: Define early read-mostly per-cpu macros
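
For context, here is a rough userspace sketch of the flush-range heuristic
the message above describes (illustrative only: the tlb_flushall_shift value,
the TLB size, and the flush_range()/printf stand-ins are made up, not the
kernel's code). The idea is that a range is flushed page-by-page with INVLPG
only when it is small relative to the last-level TLB size scaled by
tlb_flushall_shift; otherwise the whole TLB is flushed, and tlb_flushall_shift
of -1 disables the per-page path entirely.

#include <stdio.h>

#define PAGE_SHIFT 12

static int tlb_flushall_shift = 5;      /* hypothetical tuning value */
static unsigned int tlb_entries = 512;  /* hypothetical last-level dTLB size */

static void flush_range(unsigned long start, unsigned long end)
{
        unsigned long nr_pages = (end - start) >> PAGE_SHIFT;

        if (tlb_flushall_shift == -1 ||
            nr_pages > (tlb_entries >> tlb_flushall_shift)) {
                /* too many pages: cheaper to flush everything (CR3 reload) */
                printf("flush all: %lu pages\n", nr_pages);
        } else {
                /* small range: flush each page individually */
                for (unsigned long addr = start; addr < end;
                     addr += 1UL << PAGE_SHIFT)
                        printf("invlpg %#lx\n", addr);  /* stands in for INVLPG */
        }
}

int main(void)
{
        flush_range(0x400000, 0x408000);   /* 8 pages    -> per-page INVLPG */
        flush_range(0x400000, 0x800000);   /* 1024 pages -> full flush */
        return 0;
}

Per the pull request text, remote CPUs are reached with an ordinary
smp_call_function()-style IPI in this series rather than the 32 dedicated
INVALIDATE_TLB_VECTOR vectors.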

arch/x86/include/asm/apic.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/paravirt_types.h
arch/x86/include/asm/smp.h
arch/x86/kernel/apic/apic.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/entry_64.S
arch/x86/kernel/smpboot.c
arch/x86/platform/uv/tlb_uv.c
arch/x86/xen/mmu.c
mm/memory.c

index 3ea51a84a0e447a1644851500c3470cfd99a46c2,a907d4d251a83baceed25ac6a29689e9c49484ed..f34261296ffb71bc9f62f57ecf069508fe40d9d6
@@@ -306,8 -306,7 +306,8 @@@ struct apic 
        unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
        unsigned long (*check_apicid_present)(int apicid);
  
 -      void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
 +      void (*vector_allocation_domain)(int cpu, struct cpumask *retmask,
 +                                       const struct cpumask *mask);
        void (*init_apic_ldr)(void);
  
        void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
        unsigned long (*set_apic_id)(unsigned int id);
        unsigned long apic_id_mask;
  
 -      unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask);
 -      unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
 -                                             const struct cpumask *andmask);
 +      int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask,
 +                                    const struct cpumask *andmask,
 +                                    unsigned int *apicid);
  
        /* ipi */
        void (*send_IPI_mask)(const struct cpumask *mask, int vector);
@@@ -465,8 -464,6 +465,8 @@@ static inline u32 safe_apic_wait_icr_id
        return apic->safe_wait_icr_idle();
  }
  
 +extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v));
 +
  #else /* CONFIG_X86_LOCAL_APIC */
  
  static inline u32 apic_read(u32 reg) { return 0; }
@@@ -476,7 -473,6 +476,7 @@@ static inline u64 apic_icr_read(void) 
  static inline void apic_icr_write(u32 low, u32 high) { }
  static inline void apic_wait_icr_idle(void) { }
  static inline u32 safe_apic_wait_icr_idle(void) { return 0; }
 +static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {}
  
  #endif /* CONFIG_X86_LOCAL_APIC */
  
@@@ -541,12 -537,7 +541,12 @@@ static inline const struct cpumask *def
  #endif
  }
  
- DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
 +static inline const struct cpumask *online_target_cpus(void)
 +{
 +      return cpu_online_mask;
 +}
 +
+ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
  
  
  static inline unsigned int read_apic_id(void)
@@@ -595,50 -586,21 +595,50 @@@ static inline int default_phys_pkg_id(i
  
  #endif
  
 -static inline unsigned int
 -default_cpu_mask_to_apicid(const struct cpumask *cpumask)
 +static inline int
 +flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 +                          const struct cpumask *andmask,
 +                          unsigned int *apicid)
  {
 -      return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS;
 +      unsigned long cpu_mask = cpumask_bits(cpumask)[0] &
 +                               cpumask_bits(andmask)[0] &
 +                               cpumask_bits(cpu_online_mask)[0] &
 +                               APIC_ALL_CPUS;
 +
 +      if (likely(cpu_mask)) {
 +              *apicid = (unsigned int)cpu_mask;
 +              return 0;
 +      } else {
 +              return -EINVAL;
 +      }
  }
  
 -static inline unsigned int
 +extern int
  default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 -                             const struct cpumask *andmask)
 +                             const struct cpumask *andmask,
 +                             unsigned int *apicid);
 +
 +static inline void
 +flat_vector_allocation_domain(int cpu, struct cpumask *retmask,
 +                            const struct cpumask *mask)
  {
 -      unsigned long mask1 = cpumask_bits(cpumask)[0];
 -      unsigned long mask2 = cpumask_bits(andmask)[0];
 -      unsigned long mask3 = cpumask_bits(cpu_online_mask)[0];
 +      /* Careful. Some cpus do not strictly honor the set of cpus
 +       * specified in the interrupt destination when using lowest
 +       * priority interrupt delivery mode.
 +       *
 +       * In particular there was a hyperthreading cpu observed to
 +       * deliver interrupts to the wrong hyperthread when only one
 +       * hyperthread was specified in the interrupt desitination.
 +       */
 +      cpumask_clear(retmask);
 +      cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
 +}
  
 -      return (unsigned int)(mask1 & mask2 & mask3);
 +static inline void
 +default_vector_allocation_domain(int cpu, struct cpumask *retmask,
 +                               const struct cpumask *mask)
 +{
 +      cpumask_copy(retmask, cpumask_of(cpu));
  }
  
  static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
index 0b47ddb6f00b300366253a343c4a78424a8fc280,7e2c2a6357374ed50bcaafcc43606430411cfc1d..a0facf3908d7fc0e0762d968d5ec463f7a68b057
@@@ -128,11 -128,21 +128,11 @@@ static inline u64 paravirt_read_msr(uns
        return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
  }
  
 -static inline int paravirt_rdmsr_regs(u32 *regs)
 -{
 -      return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs);
 -}
 -
  static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
  {
        return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
  }
  
 -static inline int paravirt_wrmsr_regs(u32 *regs)
 -{
 -      return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs);
 -}
 -
  /* These should all do BUG_ON(_err), but our headers are too tangled. */
  #define rdmsr(msr, val1, val2)                        \
  do {                                          \
@@@ -166,6 -176,9 +166,6 @@@ do {                                               
        _err;                                   \
  })
  
 -#define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs)
 -#define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs)
 -
  static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
  {
        int err;
        *p = paravirt_read_msr(msr, &err);
        return err;
  }
 -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
 -{
 -      u32 gprs[8] = { 0 };
 -      int err;
 -
 -      gprs[1] = msr;
 -      gprs[7] = 0x9c5a203a;
 -
 -      err = paravirt_rdmsr_regs(gprs);
 -
 -      *p = gprs[0] | ((u64)gprs[2] << 32);
 -
 -      return err;
 -}
 -
 -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
 -{
 -      u32 gprs[8] = { 0 };
 -
 -      gprs[0] = (u32)val;
 -      gprs[1] = msr;
 -      gprs[2] = val >> 32;
 -      gprs[7] = 0x9c5a203a;
 -
 -      return paravirt_wrmsr_regs(gprs);
 -}
  
  static inline u64 paravirt_read_tsc(void)
  {
@@@ -213,8 -252,6 +213,8 @@@ do {                                               
        high = _l >> 32;                        \
  } while (0)
  
 +#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
 +
  static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
  {
        return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
@@@ -360,9 -397,10 +360,10 @@@ static inline void __flush_tlb_single(u
  
  static inline void flush_tlb_others(const struct cpumask *cpumask,
                                    struct mm_struct *mm,
-                                   unsigned long va)
+                                   unsigned long start,
+                                   unsigned long end)
  {
-       PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, cpumask, mm, va);
+       PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
  }
  
  static inline int paravirt_pgd_alloc(struct mm_struct *mm)
index 8613cbb7ba41e63d8626e2bae1356793ca279a1d,600a5fcac9cd1e3c4551313163ae7ebc897f7c13..142236ed83af580c06e2e00893501e3fc38b0e65
@@@ -153,7 -153,9 +153,7 @@@ struct pv_cpu_ops 
        /* MSR, PMC and TSR operations.
           err = 0/-EFAULT.  wrmsr returns 0/-EFAULT. */
        u64 (*read_msr)(unsigned int msr, int *err);
 -      int (*rdmsr_regs)(u32 *regs);
        int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
 -      int (*wrmsr_regs)(u32 *regs);
  
        u64 (*read_tsc)(void);
        u64 (*read_pmc)(int counter);
@@@ -248,7 -250,8 +248,8 @@@ struct pv_mmu_ops 
        void (*flush_tlb_single)(unsigned long addr);
        void (*flush_tlb_others)(const struct cpumask *cpus,
                                 struct mm_struct *mm,
-                                unsigned long va);
+                                unsigned long start,
+                                unsigned long end);
  
        /* Hooks for allocating and freeing a pagetable top-level */
        int  (*pgd_alloc)(struct mm_struct *mm);
index 2ffa95dc2333bcc5a36efb898fd7ad12e95f5842,cc1df2b5cc65afcbd6f89dc252a3fad48ce6a5b1..4f19a1526037364dd9a0b5f41f0125ad52d15988
@@@ -31,12 -31,12 +31,12 @@@ static inline bool cpu_has_ht_siblings(
        return has_siblings;
  }
  
- DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
- DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
+ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
+ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
  /* cpus sharing the last level cache: */
- DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
- DECLARE_PER_CPU(u16, cpu_llc_id);
- DECLARE_PER_CPU(int, cpu_number);
+ DECLARE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
+ DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
+ DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
  
  static inline struct cpumask *cpu_sibling_mask(int cpu)
  {
@@@ -53,10 -53,10 +53,10 @@@ static inline struct cpumask *cpu_llc_s
        return per_cpu(cpu_llc_shared_map, cpu);
  }
  
- DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
- DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
+ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
+ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
  #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
- DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
+ DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid);
  #endif
  
  /* Static state in head.S used to set up a CPU */
@@@ -169,6 -169,11 +169,6 @@@ void x86_idle_thread_init(unsigned int 
  void smp_store_cpu_info(int id);
  #define cpu_physical_id(cpu)  per_cpu(x86_cpu_to_apicid, cpu)
  
 -/* We don't mark CPUs online until __cpu_up(), so we need another measure */
 -static inline int num_booting_cpus(void)
 -{
 -      return cpumask_weight(cpu_callout_mask);
 -}
  #else /* !CONFIG_SMP */
  #define wbinvd_on_cpu(cpu)     wbinvd()
  static inline int wbinvd_on_all_cpus(void)
index 98e24131ff3a831fce89d957fe8b3cb9740264dd,0443b6482214dd1c081f69d34bdb297b63512e6e..24deb308232824225e1e7f8d3933ea694778da4d
@@@ -75,8 -75,8 +75,8 @@@ physid_mask_t phys_cpu_present_map
  /*
   * Map cpu index to physical APIC ID
   */
- DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
- DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
+ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID);
  EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
  EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
  
@@@ -88,7 -88,7 +88,7 @@@
   * used for the mapping.  This is where the behaviors of x86_64 and 32
   * actually diverge.  Let's keep it ugly for now.
   */
- DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
+ DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID);
  
  /*
   * Knob to control our willingness to enable the local APIC.
@@@ -2123,42 -2123,6 +2123,42 @@@ void default_init_apic_ldr(void
        apic_write(APIC_LDR, val);
  }
  
 +int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 +                                 const struct cpumask *andmask,
 +                                 unsigned int *apicid)
 +{
 +      unsigned int cpu;
 +
 +      for_each_cpu_and(cpu, cpumask, andmask) {
 +              if (cpumask_test_cpu(cpu, cpu_online_mask))
 +                      break;
 +      }
 +
 +      if (likely(cpu < nr_cpu_ids)) {
 +              *apicid = per_cpu(x86_cpu_to_apicid, cpu);
 +              return 0;
 +      }
 +
 +      return -EINVAL;
 +}
 +
 +/*
 + * Override the generic EOI implementation with an optimized version.
 + * Only called during early boot when only one CPU is active and with
 + * interrupts disabled, so we know this does not race with actual APIC driver
 + * use.
 + */
 +void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
 +{
 +      struct apic **drv;
 +
 +      for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
 +              /* Should happen once for each apic */
 +              WARN_ON((*drv)->eoi_write == eoi_write);
 +              (*drv)->eoi_write = eoi_write;
 +      }
 +}
 +
  /*
   * Power management
   */
index 5bbc082c47ad8d1fef9950c5dda873f1fdd5975c,7595552600b85c8918e94a77e3043cc1e17b76e4..46d8786d655e402b702cc2f19ba4eab9cb5a62cd
@@@ -452,6 -452,35 +452,35 @@@ void __cpuinit cpu_detect_cache_sizes(s
        c->x86_cache_size = l2size;
  }
  
+ u16 __read_mostly tlb_lli_4k[NR_INFO];
+ u16 __read_mostly tlb_lli_2m[NR_INFO];
+ u16 __read_mostly tlb_lli_4m[NR_INFO];
+ u16 __read_mostly tlb_lld_4k[NR_INFO];
+ u16 __read_mostly tlb_lld_2m[NR_INFO];
+ u16 __read_mostly tlb_lld_4m[NR_INFO];
+ /*
+  * tlb_flushall_shift shows the balance point in replacing cr3 write
+  * with multiple 'invlpg'. It will do this replacement when
+  *   flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
+  * If tlb_flushall_shift is -1, means the replacement will be disabled.
+  */
+ s8  __read_mostly tlb_flushall_shift = -1;
+ void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)
+ {
+       if (this_cpu->c_detect_tlb)
+               this_cpu->c_detect_tlb(c);
+       printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
+               "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n"          \
+               "tlb_flushall_shift is 0x%x\n",
+               tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
+               tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
+               tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
+               tlb_flushall_shift);
+ }
  void __cpuinit detect_ht(struct cpuinfo_x86 *c)
  {
  #ifdef CONFIG_X86_HT
@@@ -911,6 -940,8 +940,8 @@@ void __init identify_boot_cpu(void
  #else
        vgetcpu_set_mode();
  #endif
+       if (boot_cpu_data.cpuid_level >= 2)
+               cpu_detect_tlb(&boot_cpu_data);
  }
  
  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@@ -947,7 -978,7 +978,7 @@@ static void __cpuinit __print_cpu_msr(v
                index_max = msr_range_array[i].max;
  
                for (index = index_min; index < index_max; index++) {
 -                      if (rdmsrl_amd_safe(index, &val))
 +                      if (rdmsrl_safe(index, &val))
                                continue;
                        printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
                }
index 111f6bbd8b38afb1a175d65dbbde7a981a0de6f7,bcf28e1ce1a700a0300d352a5aaf22bb5b70edad..69babd8c834f920b4d54c48e1f41a08d4f7fef6f
@@@ -1048,24 -1048,6 +1048,6 @@@ apicinterrupt LOCAL_TIMER_VECTOR 
  apicinterrupt X86_PLATFORM_IPI_VECTOR \
        x86_platform_ipi smp_x86_platform_ipi
  
- #ifdef CONFIG_SMP
-       ALIGN
-       INTR_FRAME
- .irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
-       16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
- .if NUM_INVALIDATE_TLB_VECTORS > \idx
- ENTRY(invalidate_interrupt\idx)
-       pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx)
-       jmp .Lcommon_invalidate_interrupt0
-       CFI_ADJUST_CFA_OFFSET -8
- END(invalidate_interrupt\idx)
- .endif
- .endr
-       CFI_ENDPROC
- apicinterrupt INVALIDATE_TLB_VECTOR_START, \
-       invalidate_interrupt0, smp_invalidate_interrupt
- #endif
  apicinterrupt THRESHOLD_APIC_VECTOR \
        threshold_interrupt smp_threshold_interrupt
  apicinterrupt THERMAL_APIC_VECTOR \
@@@ -1758,30 -1740,10 +1740,30 @@@ end_repeat_nmi
         */
        call save_paranoid
        DEFAULT_FRAME 0
 +
 +      /*
 +       * Save off the CR2 register. If we take a page fault in the NMI then
 +       * it could corrupt the CR2 value. If the NMI preempts a page fault
 +       * handler before it was able to read the CR2 register, and then the
 +       * NMI itself takes a page fault, the page fault that was preempted
 +       * will read the information from the NMI page fault and not the
 +       * origin fault. Save it off and restore it if it changes.
 +       * Use the r12 callee-saved register.
 +       */
 +      movq %cr2, %r12
 +
        /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
        movq %rsp,%rdi
        movq $-1,%rsi
        call do_nmi
 +
 +      /* Did the NMI take a page fault? Restore cr2 if it did */
 +      movq %cr2, %rcx
 +      cmpq %rcx, %r12
 +      je 1f
 +      movq %r12, %cr2
 +1:
 +      
        testl %ebx,%ebx                         /* swapgs needed? */
        jnz nmi_restore
  nmi_swapgs:
index c1a310fb8309b471301330abeda28b040fca5824,e61110e29a8ca0e768c6b98889400047f99bf337..7c5a8c314c0268a2ba0b101802c057ad428d8b42
@@@ -1,4 -1,4 +1,4 @@@
 -/*
 + /*
   *    x86 SMP booting functions
   *
   *    (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
@@@ -39,8 -39,6 +39,8 @@@
   *    Glauber Costa           :       i386 and x86_64 integration
   */
  
 +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 +
  #include <linux/init.h>
  #include <linux/smp.h>
  #include <linux/module.h>
@@@ -106,17 -104,17 +106,17 @@@ int smp_num_siblings = 1
  EXPORT_SYMBOL(smp_num_siblings);
  
  /* Last level cache ID of each logical CPU */
- DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
+ DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID;
  
  /* representing HT siblings of each logical CPU */
- DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map);
  EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
  
  /* representing HT and core siblings of each logical CPU */
- DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map);
  EXPORT_PER_CPU_SYMBOL(cpu_core_map);
  
- DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
+ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
  
  /* Per CPU bogomips and other parameters */
  DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@@ -186,7 -184,7 +186,7 @@@ static void __cpuinit smp_callin(void
         * boards)
         */
  
 -      pr_debug("CALLIN, before setup_local_APIC().\n");
 +      pr_debug("CALLIN, before setup_local_APIC()\n");
        if (apic->smp_callin_clear_local_apic)
                apic->smp_callin_clear_local_apic();
        setup_local_APIC();
@@@ -257,13 -255,22 +257,13 @@@ notrace static void __cpuinit start_sec
        check_tsc_sync_target();
  
        /*
 -       * We need to hold call_lock, so there is no inconsistency
 -       * between the time smp_call_function() determines number of
 -       * IPI recipients, and the time when the determination is made
 -       * for which cpus receive the IPI. Holding this
 -       * lock helps us to not include this cpu in a currently in progress
 -       * smp_call_function().
 -       *
         * We need to hold vector_lock so there the set of online cpus
         * does not change while we are assigning vectors to cpus.  Holding
         * this lock ensures we don't half assign or remove an irq from a cpu.
         */
 -      ipi_call_lock();
        lock_vector_lock();
        set_cpu_online(smp_processor_id(), true);
        unlock_vector_lock();
 -      ipi_call_unlock();
        per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
        x86_platform.nmi_init();
  
@@@ -342,12 -349,9 +342,12 @@@ static bool __cpuinit match_llc(struct 
  
  static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
  {
 -      if (c->phys_proc_id == o->phys_proc_id)
 -              return topology_sane(c, o, "mc");
 +      if (c->phys_proc_id == o->phys_proc_id) {
 +              if (cpu_has(c, X86_FEATURE_AMD_DCM))
 +                      return true;
  
 +              return topology_sane(c, o, "mc");
 +      }
        return false;
  }
  
@@@ -425,16 -429,17 +425,16 @@@ static void impress_friends(void
        /*
         * Allow the user to impress friends.
         */
 -      pr_debug("Before bogomips.\n");
 +      pr_debug("Before bogomips\n");
        for_each_possible_cpu(cpu)
                if (cpumask_test_cpu(cpu, cpu_callout_mask))
                        bogosum += cpu_data(cpu).loops_per_jiffy;
 -      printk(KERN_INFO
 -              "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
 +      pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
                num_online_cpus(),
                bogosum/(500000/HZ),
                (bogosum/(5000/HZ))%100);
  
 -      pr_debug("Before bogocount - setting activated=1.\n");
 +      pr_debug("Before bogocount - setting activated=1\n");
  }
  
  void __inquire_remote_apic(int apicid)
        int timeout;
        u32 status;
  
 -      printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
 +      pr_info("Inquiring remote APIC 0x%x...\n", apicid);
  
        for (i = 0; i < ARRAY_SIZE(regs); i++) {
 -              printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
 +              pr_info("... APIC 0x%x %s: ", apicid, names[i]);
  
                /*
                 * Wait for idle.
                 */
                status = safe_apic_wait_icr_idle();
                if (status)
 -                      printk(KERN_CONT
 -                             "a previous APIC delivery may have failed\n");
 +                      pr_cont("a previous APIC delivery may have failed\n");
  
                apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
  
                switch (status) {
                case APIC_ICR_RR_VALID:
                        status = apic_read(APIC_RRR);
 -                      printk(KERN_CONT "%08x\n", status);
 +                      pr_cont("%08x\n", status);
                        break;
                default:
 -                      printk(KERN_CONT "failed\n");
 +                      pr_cont("failed\n");
                }
        }
  }
@@@ -504,12 -510,12 +504,12 @@@ wakeup_secondary_cpu_via_nmi(int logica
                        apic_write(APIC_ESR, 0);
                accept_status = (apic_read(APIC_ESR) & 0xEF);
        }
 -      pr_debug("NMI sent.\n");
 +      pr_debug("NMI sent\n");
  
        if (send_status)
 -              printk(KERN_ERR "APIC never delivered???\n");
 +              pr_err("APIC never delivered???\n");
        if (accept_status)
 -              printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
 +              pr_err("APIC delivery error (%lx)\n", accept_status);
  
        return (send_status | accept_status);
  }
@@@ -531,7 -537,7 +531,7 @@@ wakeup_secondary_cpu_via_init(int phys_
                apic_read(APIC_ESR);
        }
  
 -      pr_debug("Asserting INIT.\n");
 +      pr_debug("Asserting INIT\n");
  
        /*
         * Turn INIT on target chip
  
        mdelay(10);
  
 -      pr_debug("Deasserting INIT.\n");
 +      pr_debug("Deasserting INIT\n");
  
        /* Target chip */
        /* Send IPI */
        /*
         * Run STARTUP IPI loop.
         */
 -      pr_debug("#startup loops: %d.\n", num_starts);
 +      pr_debug("#startup loops: %d\n", num_starts);
  
        for (j = 1; j <= num_starts; j++) {
 -              pr_debug("Sending STARTUP #%d.\n", j);
 +              pr_debug("Sending STARTUP #%d\n", j);
                if (maxlvt > 3)         /* Due to the Pentium erratum 3AP.  */
                        apic_write(APIC_ESR, 0);
                apic_read(APIC_ESR);
 -              pr_debug("After apic_write.\n");
 +              pr_debug("After apic_write\n");
  
                /*
                 * STARTUP IPI
                 */
                udelay(300);
  
 -              pr_debug("Startup point 1.\n");
 +              pr_debug("Startup point 1\n");
  
                pr_debug("Waiting for send to finish...\n");
                send_status = safe_apic_wait_icr_idle();
                if (send_status || accept_status)
                        break;
        }
 -      pr_debug("After Startup.\n");
 +      pr_debug("After Startup\n");
  
        if (send_status)
 -              printk(KERN_ERR "APIC never delivered???\n");
 +              pr_err("APIC never delivered???\n");
        if (accept_status)
 -              printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
 +              pr_err("APIC delivery error (%lx)\n", accept_status);
  
        return (send_status | accept_status);
  }
@@@ -638,11 -644,11 +638,11 @@@ static void __cpuinit announce_cpu(int 
        if (system_state == SYSTEM_BOOTING) {
                if (node != current_node) {
                        if (current_node > (-1))
 -                              pr_cont(" Ok.\n");
 +                              pr_cont(" OK\n");
                        current_node = node;
                        pr_info("Booting Node %3d, Processors ", node);
                }
 -              pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
 +              pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : "");
                return;
        } else
                pr_info("Booting Node %d Processor %d APIC 0x%x\n",
@@@ -722,9 -728,9 +722,9 @@@ static int __cpuinit do_boot_cpu(int ap
                /*
                 * allow APs to start initializing.
                 */
 -              pr_debug("Before Callout %d.\n", cpu);
 +              pr_debug("Before Callout %d\n", cpu);
                cpumask_set_cpu(cpu, cpu_callout_mask);
 -              pr_debug("After Callout %d.\n", cpu);
 +              pr_debug("After Callout %d\n", cpu);
  
                /*
                 * Wait 5s total for a response
                                pr_err("CPU%d: Stuck ??\n", cpu);
                        else
                                /* trampoline code not run */
 -                              pr_err("CPU%d: Not responding.\n", cpu);
 +                              pr_err("CPU%d: Not responding\n", cpu);
                        if (apic->inquire_remote_apic)
                                apic->inquire_remote_apic(apicid);
                }
@@@ -797,7 -803,7 +797,7 @@@ int __cpuinit native_cpu_up(unsigned in
        if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
            !physid_isset(apicid, phys_cpu_present_map) ||
            !apic->apic_id_valid(apicid)) {
 -              printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
 +              pr_err("%s: bad cpu %d\n", __func__, cpu);
                return -EINVAL;
        }
  
@@@ -878,8 -884,9 +878,8 @@@ static int __init smp_sanity_check(unsi
                unsigned int cpu;
                unsigned nr;
  
 -              printk(KERN_WARNING
 -                     "More than 8 CPUs detected - skipping them.\n"
 -                     "Use CONFIG_X86_BIGSMP.\n");
 +              pr_warn("More than 8 CPUs detected - skipping them\n"
 +                      "Use CONFIG_X86_BIGSMP\n");
  
                nr = 0;
                for_each_present_cpu(cpu) {
  #endif
  
        if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
 -              printk(KERN_WARNING
 -                      "weird, boot CPU (#%d) not listed by the BIOS.\n",
 +              pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n",
                        hard_smp_processor_id());
  
                physid_set(hard_smp_processor_id(), phys_cpu_present_map);
         */
        if (!smp_found_config && !acpi_lapic) {
                preempt_enable();
 -              printk(KERN_NOTICE "SMP motherboard not detected.\n");
 +              pr_notice("SMP motherboard not detected\n");
                disable_smp();
                if (APIC_init_uniprocessor())
 -                      printk(KERN_NOTICE "Local APIC not detected."
 -                                         " Using dummy APIC emulation.\n");
 +                      pr_notice("Local APIC not detected. Using dummy APIC emulation.\n");
                return -1;
        }
  
         * CPU too, but we do it for the sake of robustness anyway.
         */
        if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
 -              printk(KERN_NOTICE
 -                      "weird, boot CPU (#%d) not listed by the BIOS.\n",
 -                      boot_cpu_physical_apicid);
 +              pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n",
 +                        boot_cpu_physical_apicid);
                physid_set(hard_smp_processor_id(), phys_cpu_present_map);
        }
        preempt_enable();
                if (!disable_apic) {
                        pr_err("BIOS bug, local APIC #%d not detected!...\n",
                                boot_cpu_physical_apicid);
 -                      pr_err("... forcing use of dummy APIC emulation."
 -                              "(tell your hw vendor)\n");
 +                      pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n");
                }
                smpboot_clear_io_apic();
                disable_ioapic_support();
         * If SMP should be disabled, then really disable it!
         */
        if (!max_cpus) {
 -              printk(KERN_INFO "SMP mode deactivated.\n");
 +              pr_info("SMP mode deactivated\n");
                smpboot_clear_io_apic();
  
                connect_bsp_APIC();
@@@ -1003,7 -1014,7 +1003,7 @@@ void __init native_smp_prepare_cpus(uns
  
  
        if (smp_sanity_check(max_cpus) < 0) {
 -              printk(KERN_INFO "SMP disabled\n");
 +              pr_info("SMP disabled\n");
                disable_smp();
                goto out;
        }
         * Set up local APIC timer on boot CPU.
         */
  
 -      printk(KERN_INFO "CPU%d: ", 0);
 +      pr_info("CPU%d: ", 0);
        print_cpu_info(&cpu_data(0));
        x86_init.timers.setup_percpu_clockev();
  
@@@ -1091,7 -1102,7 +1091,7 @@@ void __init native_smp_prepare_boot_cpu
  
  void __init native_smp_cpus_done(unsigned int max_cpus)
  {
 -      pr_debug("Boot done.\n");
 +      pr_debug("Boot done\n");
  
        nmi_selftest();
        impress_friends();
@@@ -1152,7 -1163,8 +1152,7 @@@ __init void prefill_possible_map(void
  
        /* nr_cpu_ids could be reduced via nr_cpus= */
        if (possible > nr_cpu_ids) {
 -              printk(KERN_WARNING
 -                      "%d Processors exceeds NR_CPUS limit of %d\n",
 +              pr_warn("%d Processors exceeds NR_CPUS limit of %d\n",
                        possible, nr_cpu_ids);
                possible = nr_cpu_ids;
        }
        if (!setup_max_cpus)
  #endif
        if (possible > i) {
 -              printk(KERN_WARNING
 -                      "%d Processors exceeds max_cpus limit of %u\n",
 +              pr_warn("%d Processors exceeds max_cpus limit of %u\n",
                        possible, setup_max_cpus);
                possible = i;
        }
  
 -      printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
 +      pr_info("Allowing %d CPUs, %d hotplug CPUs\n",
                possible, max_t(int, possible - num_processors, 0));
  
        for (i = 0; i < possible; i++)
index 71b5d5a07d7bbd7c26a5f9ae99cf5f0e8cacacbf,f1bef8e1d633ba81a02baf053b83b6e7bcd9126c..b8b3a37c80cd75e96559e67876206ad603b53741
@@@ -1,7 -1,7 +1,7 @@@
  /*
   *    SGI UltraViolet TLB flush routines.
   *
 - *    (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI.
 + *    (c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI.
   *
   *    This code is released under the GNU General Public License version 2 or
   *    later.
@@@ -38,7 -38,8 +38,7 @@@ static int timeout_base_ns[] = 
  
  static int timeout_us;
  static int nobau;
 -static int baudisabled;
 -static spinlock_t disable_lock;
 +static int nobau_perm;
  static cycles_t congested_cycles;
  
  /* tunables: */
@@@ -46,13 -47,12 +46,13 @@@ static int max_concurr             = MAX_BAU_CONCU
  static int max_concurr_const  = MAX_BAU_CONCURRENT;
  static int plugged_delay      = PLUGGED_DELAY;
  static int plugsb4reset               = PLUGSB4RESET;
 +static int giveup_limit               = GIVEUP_LIMIT;
  static int timeoutsb4reset    = TIMEOUTSB4RESET;
  static int ipi_reset_limit    = IPI_RESET_LIMIT;
  static int complete_threshold = COMPLETE_THRESHOLD;
  static int congested_respns_us        = CONGESTED_RESPONSE_US;
  static int congested_reps     = CONGESTED_REPS;
 -static int congested_period   = CONGESTED_PERIOD;
 +static int disabled_period    = DISABLED_PERIOD;
  
  static struct tunables tunables[] = {
        {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
@@@ -63,8 -63,7 +63,8 @@@
        {&complete_threshold, COMPLETE_THRESHOLD},
        {&congested_respns_us, CONGESTED_RESPONSE_US},
        {&congested_reps, CONGESTED_REPS},
 -      {&congested_period, CONGESTED_PERIOD}
 +      {&disabled_period, DISABLED_PERIOD},
 +      {&giveup_limit, GIVEUP_LIMIT}
  };
  
  static struct dentry *tunables_dir;
@@@ -121,40 -120,6 +121,40 @@@ static DEFINE_PER_CPU(struct ptc_stats
  static DEFINE_PER_CPU(struct bau_control, bau_control);
  static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
  
 +static void
 +set_bau_on(void)
 +{
 +      int cpu;
 +      struct bau_control *bcp;
 +
 +      if (nobau_perm) {
 +              pr_info("BAU not initialized; cannot be turned on\n");
 +              return;
 +      }
 +      nobau = 0;
 +      for_each_present_cpu(cpu) {
 +              bcp = &per_cpu(bau_control, cpu);
 +              bcp->nobau = 0;
 +      }
 +      pr_info("BAU turned on\n");
 +      return;
 +}
 +
 +static void
 +set_bau_off(void)
 +{
 +      int cpu;
 +      struct bau_control *bcp;
 +
 +      nobau = 1;
 +      for_each_present_cpu(cpu) {
 +              bcp = &per_cpu(bau_control, cpu);
 +              bcp->nobau = 1;
 +      }
 +      pr_info("BAU turned off\n");
 +      return;
 +}
 +
  /*
   * Determine the first node on a uvhub. 'Nodes' are used for kernel
   * memory allocation.
@@@ -313,7 -278,7 +313,7 @@@ static void bau_process_message(struct 
                 * Both sockets dump their completed count total into
                 * the message's count.
                 */
 -              smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
 +              *sp = 0;
                asp = (struct atomic_short *)&msg->acknowledge_count;
                msg_ack_count = atom_asr(socket_ack_count, asp);
  
@@@ -526,15 -491,16 +526,15 @@@ static int uv1_wait_completion(struct b
  }
  
  /*
 - * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
 + * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register.
 + * But not currently used.
   */
  static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
  {
        unsigned long descriptor_status;
 -      unsigned long descriptor_status2;
  
 -      descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
 -      descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL;
 -      descriptor_status = (descriptor_status << 1) | descriptor_status2;
 +      descriptor_status =
 +              ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1;
        return descriptor_status;
  }
  
@@@ -565,11 -531,87 +565,11 @@@ int normal_busy(struct bau_control *bcp
   */
  int handle_uv2_busy(struct bau_control *bcp)
  {
 -      int busy_one = bcp->using_desc;
 -      int normal = bcp->uvhub_cpu;
 -      int selected = -1;
 -      int i;
 -      unsigned long descriptor_status;
 -      unsigned long status;
 -      int mmr_offset;
 -      struct bau_desc *bau_desc_old;
 -      struct bau_desc *bau_desc_new;
 -      struct bau_control *hmaster = bcp->uvhub_master;
        struct ptc_stats *stat = bcp->statp;
 -      cycles_t ttm;
  
        stat->s_uv2_wars++;
 -      spin_lock(&hmaster->uvhub_lock);
 -      /* try for the original first */
 -      if (busy_one != normal) {
 -              if (!normal_busy(bcp))
 -                      selected = normal;
 -      }
 -      if (selected < 0) {
 -              /* can't use the normal, select an alternate */
 -              mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
 -              descriptor_status = read_lmmr(mmr_offset);
 -
 -              /* scan available descriptors 32-63 */
 -              for (i = 0; i < UV_CPUS_PER_AS; i++) {
 -                      if ((hmaster->inuse_map & (1 << i)) == 0) {
 -                              status = ((descriptor_status >>
 -                                              (i * UV_ACT_STATUS_SIZE)) &
 -                                              UV_ACT_STATUS_MASK) << 1;
 -                              if (status != UV2H_DESC_BUSY) {
 -                                      selected = i + UV_CPUS_PER_AS;
 -                                      break;
 -                              }
 -                      }
 -              }
 -      }
 -
 -      if (busy_one != normal)
 -              /* mark the busy alternate as not in-use */
 -              hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS));
 -
 -      if (selected >= 0) {
 -              /* switch to the selected descriptor */
 -              if (selected != normal) {
 -                      /* set the selected alternate as in-use */
 -                      hmaster->inuse_map |=
 -                                      (1 << (selected - UV_CPUS_PER_AS));
 -                      if (selected > stat->s_uv2_wars_hw)
 -                              stat->s_uv2_wars_hw = selected;
 -              }
 -              bau_desc_old = bcp->descriptor_base;
 -              bau_desc_old += (ITEMS_PER_DESC * busy_one);
 -              bcp->using_desc = selected;
 -              bau_desc_new = bcp->descriptor_base;
 -              bau_desc_new += (ITEMS_PER_DESC * selected);
 -              *bau_desc_new = *bau_desc_old;
 -      } else {
 -              /*
 -               * All are busy. Wait for the normal one for this cpu to
 -               * free up.
 -               */
 -              stat->s_uv2_war_waits++;
 -              spin_unlock(&hmaster->uvhub_lock);
 -              ttm = get_cycles();
 -              do {
 -                      cpu_relax();
 -              } while (normal_busy(bcp));
 -              spin_lock(&hmaster->uvhub_lock);
 -              /* switch to the original descriptor */
 -              bcp->using_desc = normal;
 -              bau_desc_old = bcp->descriptor_base;
 -              bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc);
 -              bcp->using_desc = (ITEMS_PER_DESC * normal);
 -              bau_desc_new = bcp->descriptor_base;
 -              bau_desc_new += (ITEMS_PER_DESC * normal);
 -              *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */
 -      }
 -      spin_unlock(&hmaster->uvhub_lock);
 -      return FLUSH_RETRY_BUSYBUG;
 +      bcp->busy = 1;
 +      return FLUSH_GIVEUP;
  }
  
  static int uv2_wait_completion(struct bau_desc *bau_desc,
  {
        unsigned long descriptor_stat;
        cycles_t ttm;
 -      int desc = bcp->using_desc;
 +      int desc = bcp->uvhub_cpu;
        long busy_reps = 0;
        struct ptc_stats *stat = bcp->statp;
  
  
        /* spin on the status MMR, waiting for it to go idle */
        while (descriptor_stat != UV2H_DESC_IDLE) {
 -              /*
 -               * Our software ack messages may be blocked because
 -               * there are no swack resources available.  As long
 -               * as none of them has timed out hardware will NACK
 -               * our message and its state will stay IDLE.
 -               */
 -              if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
 -                  (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
 +              if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) {
 +                      /*
 +                       * A h/w bug on the destination side may
 +                       * have prevented the message being marked
 +                       * pending, thus it doesn't get replied to
 +                       * and gets continually nacked until it times
 +                       * out with a SOURCE_TIMEOUT.
 +                       */
                        stat->s_stimeout++;
                        return FLUSH_GIVEUP;
 -              } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) {
 -                      stat->s_strongnacks++;
 -                      bcp->conseccompletes = 0;
 -                      return FLUSH_GIVEUP;
                } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
 +                      ttm = get_cycles();
 +
 +                      /*
 +                       * Our retries may be blocked by all destination
 +                       * swack resources being consumed, and a timeout
 +                       * pending.  In that case hardware returns the
 +                       * ERROR that looks like a destination timeout.
 +                       * Without using the extended status we have to
 +                       * deduce from the short time that this was a
 +                       * strong nack.
 +                       */
 +                      if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
 +                              bcp->conseccompletes = 0;
 +                              stat->s_plugged++;
 +                              /* FLUSH_RETRY_PLUGGED causes hang on boot */
 +                              return FLUSH_GIVEUP;
 +                      }
                        stat->s_dtimeout++;
                        bcp->conseccompletes = 0;
 -                      return FLUSH_RETRY_TIMEOUT;
 +                      /* FLUSH_RETRY_TIMEOUT causes hang on boot */
 +                      return FLUSH_GIVEUP;
                } else {
                        busy_reps++;
                        if (busy_reps > 1000000) {
                                busy_reps = 0;
                                ttm = get_cycles();
                                if ((ttm - bcp->send_message) >
 -                                      (bcp->clocks_per_100_usec)) {
 +                                              bcp->timeout_interval)
                                        return handle_uv2_busy(bcp);
 -                              }
                        }
                        /*
                         * descriptor_stat is still BUSY
@@@ -650,7 -679,7 +650,7 @@@ static int wait_completion(struct bau_d
  {
        int right_shift;
        unsigned long mmr_offset;
 -      int desc = bcp->using_desc;
 +      int desc = bcp->uvhub_cpu;
  
        if (desc < UV_CPUS_PER_AS) {
                mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@@ -729,31 -758,33 +729,31 @@@ static void destination_timeout(struct 
  }
  
  /*
 - * Completions are taking a very long time due to a congested numalink
 - * network.
 + * Stop all cpus on a uvhub from using the BAU for a period of time.
 + * This is reversed by check_enable.
   */
 -static void disable_for_congestion(struct bau_control *bcp,
 -                                      struct ptc_stats *stat)
 +static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat)
  {
 -      /* let only one cpu do this disabling */
 -      spin_lock(&disable_lock);
 -
 -      if (!baudisabled && bcp->period_requests &&
 -          ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
 -              int tcpu;
 -              struct bau_control *tbcp;
 -              /* it becomes this cpu's job to turn on the use of the
 -                 BAU again */
 -              baudisabled = 1;
 -              bcp->set_bau_off = 1;
 -              bcp->set_bau_on_time = get_cycles();
 -              bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
 +      int tcpu;
 +      struct bau_control *tbcp;
 +      struct bau_control *hmaster;
 +      cycles_t tm1;
 +
 +      hmaster = bcp->uvhub_master;
 +      spin_lock(&hmaster->disable_lock);
 +      if (!bcp->baudisabled) {
                stat->s_bau_disabled++;
 +              tm1 = get_cycles();
                for_each_present_cpu(tcpu) {
                        tbcp = &per_cpu(bau_control, tcpu);
 -                      tbcp->baudisabled = 1;
 +                      if (tbcp->uvhub_master == hmaster) {
 +                              tbcp->baudisabled = 1;
 +                              tbcp->set_bau_on_time =
 +                                      tm1 + bcp->disabled_period;
 +                      }
                }
        }
 -
 -      spin_unlock(&disable_lock);
 +      spin_unlock(&hmaster->disable_lock);
  }
  
  static void count_max_concurr(int stat, struct bau_control *bcp,
@@@ -784,30 -815,16 +784,30 @@@ static void record_send_stats(cycles_t 
                        bcp->period_requests++;
                        bcp->period_time += elapsed;
                        if ((elapsed > congested_cycles) &&
 -                          (bcp->period_requests > bcp->cong_reps))
 -                              disable_for_congestion(bcp, stat);
 +                          (bcp->period_requests > bcp->cong_reps) &&
 +                          ((bcp->period_time / bcp->period_requests) >
 +                                                      congested_cycles)) {
 +                              stat->s_congested++;
 +                              disable_for_period(bcp, stat);
 +                      }
                }
        } else
                stat->s_requestor--;
  
        if (completion_status == FLUSH_COMPLETE && try > 1)
                stat->s_retriesok++;
 -      else if (completion_status == FLUSH_GIVEUP)
 +      else if (completion_status == FLUSH_GIVEUP) {
                stat->s_giveup++;
 +              if (get_cycles() > bcp->period_end)
 +                      bcp->period_giveups = 0;
 +              bcp->period_giveups++;
 +              if (bcp->period_giveups == 1)
 +                      bcp->period_end = get_cycles() + bcp->disabled_period;
 +              if (bcp->period_giveups > bcp->giveup_limit) {
 +                      disable_for_period(bcp, stat);
 +                      stat->s_giveuplimit++;
 +              }
 +      }
  }
  
  /*
@@@ -851,8 -868,7 +851,8 @@@ static void handle_cmplt(int completion
   * Returns 1 if it gives up entirely and the original cpu mask is to be
   * returned to the kernel.
   */
 -int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
 +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp,
 +      struct bau_desc *bau_desc)
  {
        int seq_number = 0;
        int completion_stat = 0;
        struct bau_control *hmaster = bcp->uvhub_master;
        struct uv1_bau_msg_header *uv1_hdr = NULL;
        struct uv2_bau_msg_header *uv2_hdr = NULL;
 -      struct bau_desc *bau_desc;
  
 -      if (bcp->uvhub_version == 1)
 +      if (bcp->uvhub_version == 1) {
 +              uv1 = 1;
                uv1_throttle(hmaster, stat);
 +      }
  
        while (hmaster->uvhub_quiesce)
                cpu_relax();
  
        time1 = get_cycles();
 +      if (uv1)
 +              uv1_hdr = &bau_desc->header.uv1_hdr;
 +      else
 +              uv2_hdr = &bau_desc->header.uv2_hdr;
 +
        do {
 -              bau_desc = bcp->descriptor_base;
 -              bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
 -              if (bcp->uvhub_version == 1) {
 -                      uv1 = 1;
 -                      uv1_hdr = &bau_desc->header.uv1_hdr;
 -              } else
 -                      uv2_hdr = &bau_desc->header.uv2_hdr;
 -              if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) {
 +              if (try == 0) {
                        if (uv1)
                                uv1_hdr->msg_type = MSG_REGULAR;
                        else
                        uv1_hdr->sequence = seq_number;
                else
                        uv2_hdr->sequence = seq_number;
 -              index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc;
 +              index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
                bcp->send_message = get_cycles();
  
                write_mmr_activation(index);
  
                try++;
                completion_stat = wait_completion(bau_desc, bcp, try);
 -              /* UV2: wait_completion() may change the bcp->using_desc */
  
                handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
  
                if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
                        bcp->ipi_attempts = 0;
 +                      stat->s_overipilimit++;
                        completion_stat = FLUSH_GIVEUP;
                        break;
                }
                cpu_relax();
        } while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
 -               (completion_stat == FLUSH_RETRY_BUSYBUG) ||
                 (completion_stat == FLUSH_RETRY_TIMEOUT));
  
        time2 = get_cycles();
  }
  
  /*
 - * The BAU is disabled. When the disabled time period has expired, the cpu
 - * that disabled it must re-enable it.
 - * Return 0 if it is re-enabled for all cpus.
 + * The BAU is disabled for this uvhub. When the disabled time period has
 + * expired re-enable it.
 + * Return 0 if it is re-enabled for all cpus on this uvhub.
   */
  static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
  {
        int tcpu;
        struct bau_control *tbcp;
 +      struct bau_control *hmaster;
  
 -      if (bcp->set_bau_off) {
 -              if (get_cycles() >= bcp->set_bau_on_time) {
 -                      stat->s_bau_reenabled++;
 -                      baudisabled = 0;
 -                      for_each_present_cpu(tcpu) {
 -                              tbcp = &per_cpu(bau_control, tcpu);
 +      hmaster = bcp->uvhub_master;
 +      spin_lock(&hmaster->disable_lock);
 +      if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) {
 +              stat->s_bau_reenabled++;
 +              for_each_present_cpu(tcpu) {
 +                      tbcp = &per_cpu(bau_control, tcpu);
 +                      if (tbcp->uvhub_master == hmaster) {
                                tbcp->baudisabled = 0;
                                tbcp->period_requests = 0;
                                tbcp->period_time = 0;
 +                              tbcp->period_giveups = 0;
                        }
 -                      return 0;
                }
 +              spin_unlock(&hmaster->disable_lock);
 +              return 0;
        }
 +      spin_unlock(&hmaster->disable_lock);
        return -1;
  }
  
@@@ -1055,8 -1068,8 +1055,8 @@@ static int set_distrib_bits(struct cpum
   * done.  The returned pointer is valid till preemption is re-enabled.
   */
  const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
-                               struct mm_struct *mm, unsigned long va,
-                               unsigned int cpu)
+                               struct mm_struct *mm, unsigned long start,
+                               unsigned end, unsigned int cpu)
  {
        int locals = 0;
        int remotes = 0;
        struct cpumask *flush_mask;
        struct ptc_stats *stat;
        struct bau_control *bcp;
 -
 -      /* kernel was booted 'nobau' */
 -      if (nobau)
 -              return cpumask;
 +      unsigned long descriptor_status;
 +      unsigned long status;
  
        bcp = &per_cpu(bau_control, cpu);
        stat = bcp->statp;
 +      stat->s_enters++;
 +
 +      if (bcp->nobau)
 +              return cpumask;
 +
 +      if (bcp->busy) {
 +              descriptor_status =
 +                      read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0);
 +              status = ((descriptor_status >> (bcp->uvhub_cpu *
 +                      UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1;
 +              if (status == UV2H_DESC_BUSY)
 +                      return cpumask;
 +              bcp->busy = 0;
 +      }
  
        /* bau was disabled due to slow response */
        if (bcp->baudisabled) {
 -              if (check_enable(bcp, stat))
 +              if (check_enable(bcp, stat)) {
 +                      stat->s_ipifordisabled++;
                        return cpumask;
 +              }
        }
  
        /*
                stat->s_ntargself++;
  
        bau_desc = bcp->descriptor_base;
 -      bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
 +      bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu);
        bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
        if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
                return NULL;
  
        record_send_statistics(stat, locals, hubs, remotes, bau_desc);
  
-       bau_desc->payload.address = va;
+       bau_desc->payload.address = start;
        bau_desc->payload.sending_cpu = cpu;
        /*
         * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
         * or 1 if it gave up and the original cpumask should be returned.
         */
 -      if (!uv_flush_send_and_wait(flush_mask, bcp))
 +      if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc))
                return NULL;
        else
                return cpumask;
  }
  
  /*
 - * Search the message queue for any 'other' message with the same software
 - * acknowledge resource bit vector.
 + * Search the message queue for any 'other' unprocessed message with the
 + * same software acknowledge resource bit vector as the 'msg' message.
   */
  struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
 -                      struct bau_control *bcp, unsigned char swack_vec)
 +                                         struct bau_control *bcp)
  {
        struct bau_pq_entry *msg_next = msg + 1;
 +      unsigned char swack_vec = msg->swack_vec;
  
        if (msg_next > bcp->queue_last)
                msg_next = bcp->queue_first;
 -      while ((msg_next->swack_vec != 0) && (msg_next != msg)) {
 -              if (msg_next->swack_vec == swack_vec)
 +      while (msg_next != msg) {
 +              if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) &&
 +                              (msg_next->swack_vec == swack_vec))
                        return msg_next;
                msg_next++;
                if (msg_next > bcp->queue_last)
@@@ -1168,30 -1165,32 +1168,30 @@@ void process_uv2_message(struct msg_des
                 * This message was assigned a swack resource, but no
                 * reserved acknowlegment is pending.
                 * The bug has prevented this message from setting the MMR.
 -               * And no other message has used the same sw_ack resource.
 -               * Do the requested shootdown but do not reply to the msg.
 -               * (the 0 means make no acknowledge)
                 */
 -              bau_process_message(mdp, bcp, 0);
 -              return;
 -      }
 -
 -      /*
 -       * Some message has set the MMR 'pending' bit; it might have been
 -       * another message.  Look for that message.
 -       */
 -      other_msg = find_another_by_swack(msg, bcp, msg->swack_vec);
 -      if (other_msg) {
 -              /* There is another.  Do not ack the current one. */
 -              bau_process_message(mdp, bcp, 0);
                /*
 -               * Let the natural processing of that message acknowledge
 -               * it. Don't get the processing of sw_ack's out of order.
 +               * Some message has set the MMR 'pending' bit; it might have
 +               * been another message.  Look for that message.
                 */
 -              return;
 +              other_msg = find_another_by_swack(msg, bcp);
 +              if (other_msg) {
 +                      /*
 +                       * There is another. Process this one but do not
 +                       * ack it.
 +                       */
 +                      bau_process_message(mdp, bcp, 0);
 +                      /*
 +                       * Let the natural processing of that other message
 +                       * acknowledge it. Don't get the processing of sw_ack's
 +                       * out of order.
 +                       */
 +                      return;
 +              }
        }
  
        /*
 -       * There is no other message using this sw_ack, so it is safe to
 -       * acknowledge it.
 +       * Either the MMR shows this one pending a reply or there is no
 +       * other message using this sw_ack, so it is safe to acknowledge it.
         */
        bau_process_message(mdp, bcp, 1);
  
@@@ -1296,8 -1295,7 +1296,8 @@@ static void __init enable_timeouts(void
                 */
                mmr_image |= (1L << SOFTACK_MSHIFT);
                if (is_uv2_hub()) {
 -                      mmr_image |= (1L << UV2_EXT_SHFT);
 +                      /* hw bug workaround; do not use extended status */
 +                      mmr_image &= ~(1L << UV2_EXT_SHFT);
                }
                write_mmr_misc_control(pnode, mmr_image);
        }
@@@ -1340,34 -1338,29 +1340,34 @@@ static inline unsigned long long usec_2
  static int ptc_seq_show(struct seq_file *file, void *data)
  {
        struct ptc_stats *stat;
 +      struct bau_control *bcp;
        int cpu;
  
        cpu = *(loff_t *)data;
        if (!cpu) {
                seq_printf(file,
 -                      "# cpu sent stime self locals remotes ncpus localhub ");
 +               "# cpu bauoff sent stime self locals remotes ncpus localhub ");
                seq_printf(file,
                        "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
                seq_printf(file,
 -                  "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok ");
 +                      "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries ");
 +              seq_printf(file,
 +                      "rok resetp resett giveup sto bz throt disable ");
                seq_printf(file,
 -                      "resetp resett giveup sto bz throt swack recv rtime ");
 +                      "enable wars warshw warwaits enters ipidis plugged ");
                seq_printf(file,
 -                      "all one mult none retry canc nocan reset rcan ");
 +                      "ipiover glim cong swack recv rtime all one mult ");
                seq_printf(file,
 -                      "disable enable wars warshw warwaits\n");
 +                      "none retry canc nocan reset rcan\n");
        }
        if (cpu < num_possible_cpus() && cpu_online(cpu)) {
 -              stat = &per_cpu(ptcstats, cpu);
 +              bcp = &per_cpu(bau_control, cpu);
 +              stat = bcp->statp;
                /* source side statistics */
                seq_printf(file,
 -                      "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
 -                         cpu, stat->s_requestor, cycles_2_us(stat->s_time),
 +                      "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
 +                         cpu, bcp->nobau, stat->s_requestor,
 +                         cycles_2_us(stat->s_time),
                           stat->s_ntargself, stat->s_ntarglocals,
                           stat->s_ntargremotes, stat->s_ntargcpu,
                           stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
                           stat->s_resets_plug, stat->s_resets_timeout,
                           stat->s_giveup, stat->s_stimeout,
                           stat->s_busy, stat->s_throttles);
 +              seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
 +                         stat->s_bau_disabled, stat->s_bau_reenabled,
 +                         stat->s_uv2_wars, stat->s_uv2_wars_hw,
 +                         stat->s_uv2_war_waits, stat->s_enters,
 +                         stat->s_ipifordisabled, stat->s_plugged,
 +                         stat->s_overipilimit, stat->s_giveuplimit,
 +                         stat->s_congested);
  
                /* destination side statistics */
                seq_printf(file,
 -                         "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
 +                      "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
                           read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
                           stat->d_requestee, cycles_2_us(stat->d_time),
                           stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
                           stat->d_nomsg, stat->d_retries, stat->d_canceled,
                           stat->d_nocanceled, stat->d_resets,
                           stat->d_rcanceled);
 -              seq_printf(file, "%ld %ld %ld %ld %ld\n",
 -                      stat->s_bau_disabled, stat->s_bau_reenabled,
 -                      stat->s_uv2_wars, stat->s_uv2_wars_hw,
 -                      stat->s_uv2_war_waits);
        }
        return 0;
  }
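
With the reworked header strings, each data line of the statistics file now begins with the cpu number and the per-cpu bauoff flag, followed by the source-side counters and, on the same line, the destination-side counters. Assuming the layout printed above (and that cycles_2_us() yields microseconds), a hypothetical userspace snippet that picks out the first few columns:

#include <stdio.h>

int main(void)
{
	/* one data line in the new layout: "cpu N bauoff sent stime ..." */
	const char *line = "cpu 3 0 12345 678 9 10 2 16 1 1 ...";
	int cpu, bauoff;
	long sent, stime_us;

	if (sscanf(line, "cpu %d %d %ld %ld", &cpu, &bauoff, &sent, &stime_us) == 4)
		printf("cpu %d: bau %s, %ld requests, %ld us\n",
		       cpu, bauoff ? "off" : "on", sent, stime_us);
	return 0;
}
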
@@@ -1411,14 -1401,13 +1411,14 @@@ static ssize_t tunables_read(struct fil
        char *buf;
        int ret;
  
 -      buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
 -              "max_concur plugged_delay plugsb4reset",
 -              "timeoutsb4reset ipi_reset_limit complete_threshold",
 -              "congested_response_us congested_reps congested_period",
 +      buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n",
 +              "max_concur plugged_delay plugsb4reset timeoutsb4reset",
 +              "ipi_reset_limit complete_threshold congested_response_us",
 +              "congested_reps disabled_period giveup_limit",
                max_concurr, plugged_delay, plugsb4reset,
                timeoutsb4reset, ipi_reset_limit, complete_threshold,
 -              congested_respns_us, congested_reps, congested_period);
 +              congested_respns_us, congested_reps, disabled_period,
 +              giveup_limit);
  
        if (!buf)
                return -ENOMEM;
@@@ -1449,14 -1438,6 +1449,14 @@@ static ssize_t ptc_proc_write(struct fi
                return -EFAULT;
        optstr[count - 1] = '\0';
  
 +      if (!strcmp(optstr, "on")) {
 +              set_bau_on();
 +              return count;
 +      } else if (!strcmp(optstr, "off")) {
 +              set_bau_off();
 +              return count;
 +      }
 +
        if (strict_strtol(optstr, 10, &input_arg) < 0) {
                printk(KERN_DEBUG "%s is invalid\n", optstr);
                return -EINVAL;
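
ptc_proc_write() now special-cases the strings "on" and "off" before falling back to parsing a numeric debug level. A small userspace sketch of that parsing order (strtol stands in for the kernel's strict_strtol; parse_ctl is an invented helper):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Accept "on", "off", or a signed decimal value, in that order. */
static int parse_ctl(const char *opt, long *level)
{
	char *end;

	if (!strcmp(opt, "on"))
		return 1;                       /* enable */
	if (!strcmp(opt, "off"))
		return 0;                       /* disable */
	errno = 0;
	*level = strtol(opt, &end, 10);
	if (errno || *end != '\0')
		return -EINVAL;                 /* reject trailing garbage */
	return 2;                               /* numeric debug value */
}

int main(void)
{
	long level = 0;
	const char *inputs[] = { "on", "off", "5", "bogus" };

	for (unsigned i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++)
		printf("%-6s -> %d (level %ld)\n",
		       inputs[i], parse_ctl(inputs[i], &level), level);
	return 0;
}
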
@@@ -1589,8 -1570,7 +1589,8 @@@ static ssize_t tunables_write(struct fi
                bcp->complete_threshold =       complete_threshold;
                bcp->cong_response_us =         congested_respns_us;
                bcp->cong_reps =                congested_reps;
 -              bcp->cong_period =              congested_period;
 +              bcp->disabled_period =          sec_2_cycles(disabled_period);
 +              bcp->giveup_limit =             giveup_limit;
        }
        return count;
  }
@@@ -1719,10 -1699,6 +1719,10 @@@ static void activation_descriptor_init(
                         *   fairness chaining multilevel count replied_to
                         */
                } else {
 +                      /*
 +                       * BIOS uses legacy mode, but UV2 hardware always
 +                       * uses native mode for selective broadcasts.
 +                       */
                        uv2_hdr = &bd2->header.uv2_hdr;
                        uv2_hdr->swack_flag =   1;
                        uv2_hdr->base_dest_nasid =
@@@ -1835,8 -1811,8 +1835,8 @@@ static int calculate_destination_timeou
                index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
                mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
                mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
 -              base = timeout_base_ns[index];
 -              ts_ns = base * mult1 * mult2;
 +              ts_ns = timeout_base_ns[index];
 +              ts_ns *= (mult1 * mult2);
                ret = ts_ns / 1000;
        } else {
                /* 4 bits  0/1 for 10/80us base, 3 bits of multiplier */
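
The rewritten computation above accumulates the product in the wide ts_ns variable before converting nanoseconds to microseconds. A worked example with invented table values and multipliers (the real ones come from UV MMR fields):

#include <stdio.h>

int main(void)
{
	/* invented values; the real table and multipliers come from UV MMRs */
	unsigned long long timeout_base_ns[] = { 0, 160, 1280, 10240, 81920 };
	int index = 3, mult1 = 10, mult2 = 16;

	unsigned long long ts_ns = timeout_base_ns[index];
	ts_ns *= (unsigned long long)mult1 * mult2;   /* stay in the wide type */
	printf("destination timeout: %llu us\n", ts_ns / 1000);   /* 1638 us */
	return 0;
}
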
@@@ -1860,8 -1836,6 +1860,8 @@@ static void __init init_per_cpu_tunable
        for_each_present_cpu(cpu) {
                bcp = &per_cpu(bau_control, cpu);
                bcp->baudisabled                = 0;
 +              if (nobau)
 +                      bcp->nobau              = 1;
                bcp->statp                      = &per_cpu(ptcstats, cpu);
                /* time interval to catch a hardware stay-busy bug */
                bcp->timeout_interval           = usec_2_cycles(2*timeout_us);
                bcp->complete_threshold         = complete_threshold;
                bcp->cong_response_us           = congested_respns_us;
                bcp->cong_reps                  = congested_reps;
 -              bcp->cong_period                = congested_period;
 -              bcp->clocks_per_100_usec =      usec_2_cycles(100);
 +              bcp->disabled_period =          sec_2_cycles(disabled_period);
 +              bcp->giveup_limit =             giveup_limit;
                spin_lock_init(&bcp->queue_lock);
                spin_lock_init(&bcp->uvhub_lock);
 +              spin_lock_init(&bcp->disable_lock);
        }
  }
  
@@@ -1999,6 -1972,7 +1999,6 @@@ static int scan_sock(struct socket_des
                }
                bcp->uvhub_master = *hmasterp;
                bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
 -              bcp->using_desc = bcp->uvhub_cpu;
                if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
                        printk(KERN_EMERG "%d cpus per uvhub invalid\n",
                                bcp->uvhub_cpu);
@@@ -2095,12 -2069,16 +2095,12 @@@ static int __init uv_bau_init(void
        if (!is_uv_system())
                return 0;
  
 -      if (nobau)
 -              return 0;
 -
        for_each_possible_cpu(cur_cpu) {
                mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
                zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
        }
  
        nuvhubs = uv_num_possible_blades();
 -      spin_lock_init(&disable_lock);
        congested_cycles = usec_2_cycles(congested_respns_us);
  
        uv_base_pnode = 0x7fffffff;
        enable_timeouts();
  
        if (init_per_cpu(nuvhubs, uv_base_pnode)) {
 -              nobau = 1;
 +              set_bau_off();
 +              nobau_perm = 1;
                return 0;
        }
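
Since uv_bau_init() no longer bails out when nobau is set, the BAU can be toggled at run time through the "on"/"off" writes added earlier, and a failed init_per_cpu() now latches a permanent-off state via nobau_perm. set_bau_on()/set_bau_off() are not shown in this diff, so the following is only a guessed shape: a toggle that a permanent-off latch can veto (all names here are stand-ins, not the kernel's implementation):

#include <stdbool.h>
#include <stdio.h>

static bool nobau;        /* current state: true = broadcasts disabled */
static bool nobau_perm;   /* latched when init fails; never re-enabled */

static void toy_set_bau_off(void)
{
	nobau = true;
}

static int toy_set_bau_on(void)
{
	if (nobau_perm) {
		fprintf(stderr, "BAU permanently disabled, ignoring 'on'\n");
		return -1;
	}
	nobau = false;
	return 0;
}

int main(void)
{
	toy_set_bau_off();
	nobau_perm = true;               /* pretend init_per_cpu() failed */
	printf("re-enable: %d, nobau=%d\n", toy_set_bau_on(), nobau);
	return 0;
}
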
  
diff --combined arch/x86/xen/mmu.c
index 27336dfcda8ef41e2d8874f20634d105a4084c97,39ed56789f680698fadd07a4416dc4497533ccd8..b65a76133f4f9b4f51dc426021975d7a5427191e
@@@ -308,20 -308,8 +308,20 @@@ static bool xen_batched_set_pte(pte_t *
  
  static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
  {
 -      if (!xen_batched_set_pte(ptep, pteval))
 -              native_set_pte(ptep, pteval);
 +      if (!xen_batched_set_pte(ptep, pteval)) {
 +              /*
 +               * Could call native_set_pte() here and trap and
 +               * emulate the PTE write but with 32-bit guests this
 +               * needs two traps (one for each of the two 32-bit
 +               * words in the PTE) so do one hypercall directly
 +               * instead.
 +               */
 +              struct mmu_update u;
 +
 +              u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
 +              u.val = pte_val_ma(pteval);
 +              HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
 +      }
  }
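
The comment explains the trade-off: letting native_set_pte() fault would cost a 32-bit guest two trapped-and-emulated stores per 64-bit PTE, while one explicit MMU_NORMAL_PT_UPDATE hypercall does the whole update. A userspace model that merely counts the hypervisor exits each strategy would take (no real hypercalls involved):

#include <stdint.h>
#include <stdio.h>

static int write_pte_by_trapping(uint64_t val)
{
	/* a 32-bit PAE guest stores the PTE as two 32-bit halves,
	 * and each protected store is trapped and emulated */
	int exits = 0;
	uint32_t lo = (uint32_t)val, hi = (uint32_t)(val >> 32);

	exits++;	/* emulate store of lo */
	exits++;	/* emulate store of hi */
	(void)lo; (void)hi;
	return exits;
}

static int write_pte_by_hypercall(uint64_t val)
{
	(void)val;
	return 1;	/* one explicit mmu_update call replaces both traps */
}

int main(void)
{
	uint64_t pte = 0x8000000000000067ULL;

	printf("trap-and-emulate: %d exits, hypercall: %d exit\n",
	       write_pte_by_trapping(pte), write_pte_by_hypercall(pte));
	return 0;
}
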
  
  static void xen_set_pte(pte_t *ptep, pte_t pteval)
@@@ -1256,7 -1244,8 +1256,8 @@@ static void xen_flush_tlb_single(unsign
  }
  
  static void xen_flush_tlb_others(const struct cpumask *cpus,
-                                struct mm_struct *mm, unsigned long va)
+                                struct mm_struct *mm, unsigned long start,
+                                unsigned long end)
  {
        struct {
                struct mmuext_op op;
        } *args;
        struct multicall_space mcs;
  
-       trace_xen_mmu_flush_tlb_others(cpus, mm, va);
+       trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
  
        if (cpumask_empty(cpus))
                return;         /* nothing to do */
        cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
        cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
  
-       if (va == TLB_FLUSH_ALL) {
-               args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-       } else {
+       args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+       if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
                args->op.cmd = MMUEXT_INVLPG_MULTI;
-               args->op.arg1.linear_addr = va;
+               args->op.arg1.linear_addr = start;
        }
  
        MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
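
The rewritten selection defaults to MMUEXT_TLB_FLUSH_MULTI and only downgrades to MMUEXT_INVLPG_MULTI when the caller passed a concrete range no larger than one page. A small sketch of that decision, using invented enum names in place of the mmuext op codes:

#include <stdio.h>

#define PAGE_SIZE      4096UL
#define TLB_FLUSH_ALL  (~0UL)

enum toy_flush_op { FLUSH_TLB_MULTI, INVLPG_MULTI };

/* Pick the cheaper remote-flush operation for a [start, end) range:
 * single-page ranges use INVLPG, anything larger (or a flush-all
 * request) flushes the whole TLB. */
static enum toy_flush_op pick_op(unsigned long start, unsigned long end)
{
	if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE)
		return INVLPG_MULTI;
	return FLUSH_TLB_MULTI;
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_op(0x1000, 0x2000),            /* one page  -> INVLPG_MULTI */
	       pick_op(0x1000, 0x9000),            /* 8 pages   -> FLUSH_TLB_MULTI */
	       pick_op(TLB_FLUSH_ALL, 0));         /* flush all -> FLUSH_TLB_MULTI */
	return 0;
}
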
@@@ -1428,28 -1416,13 +1428,28 @@@ static pte_t __init mask_rw_pte(pte_t *
  }
  #endif /* CONFIG_X86_64 */
  
 -/* Init-time set_pte while constructing initial pagetables, which
 -   doesn't allow RO pagetable pages to be remapped RW */
 +/*
 + * Init-time set_pte while constructing initial pagetables, which
 + * doesn't allow RO page table pages to be remapped RW.
 + *
 + * If there is no MFN for this PFN then this page is initially
 + * ballooned out so clear the PTE (as in decrease_reservation() in
 + * drivers/xen/balloon.c).
 + *
 + * Many of these PTE updates are done on unpinned and writable pages
 + * and doing a hypercall for these is unnecessary and expensive.  At
 + * this point it is not possible to tell if a page is pinned or not,
 + * so always write the PTE directly and rely on Xen trapping and
 + * emulating any updates as necessary.
 + */
  static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
  {
 -      pte = mask_rw_pte(ptep, pte);
 +      if (pte_mfn(pte) != INVALID_P2M_ENTRY)
 +              pte = mask_rw_pte(ptep, pte);
 +      else
 +              pte = __pte_ma(0);
  
 -      xen_set_pte(ptep, pte);
 +      native_set_pte(ptep, pte);
  }
  
  static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
diff --combined mm/memory.c
index 2466d1250231f3e2405429ea4de4a97c597d0074,32c99433cfdf7e9c2c5eb9d6ec9ac27350470e16..91f69459d3e8b3bf8075574e145788b0b40c4a6f
@@@ -206,6 -206,8 +206,8 @@@ void tlb_gather_mmu(struct mmu_gather *
        tlb->mm = mm;
  
        tlb->fullmm     = fullmm;
+       tlb->start      = -1UL;
+       tlb->end        = 0;
        tlb->need_flush = 0;
        tlb->fast_mode  = (num_possible_cpus() == 1);
        tlb->local.next = NULL;
@@@ -248,6 -250,8 +250,8 @@@ void tlb_finish_mmu(struct mmu_gather *
  {
        struct mmu_gather_batch *batch, *next;
  
+       tlb->start = start;
+       tlb->end   = end;
        tlb_flush_mmu(tlb);
  
        /* keep the page table cache within bounds */
@@@ -1204,6 -1208,11 +1208,11 @@@ again
         */
        if (force_flush) {
                force_flush = 0;
+ #ifdef HAVE_GENERIC_MMU_GATHER
+               tlb->start = addr;
+               tlb->end = end;
+ #endif
                tlb_flush_mmu(tlb);
                if (addr != end)
                        goto again;
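
Together, these mm/memory.c hunks thread a (start, end) range through the generic mmu_gather: tlb_gather_mmu() starts with an empty range (start = -1UL, end = 0), and tlb_finish_mmu() plus the force_flush path fill it in before calling tlb_flush_mmu(), so the arch flush can target just that range. A simplified userspace model of the bookkeeping (the kernel sets the range directly at the call sites shown above; the min/max accumulation here is only illustrative):

#include <stdio.h>

#define NO_RANGE_START (~0UL)

struct toy_gather {
	unsigned long start;
	unsigned long end;
};

static void toy_gather_init(struct toy_gather *tlb)
{
	tlb->start = NO_RANGE_START;    /* empty range: start above end */
	tlb->end = 0;
}

static void toy_gather_note(struct toy_gather *tlb, unsigned long addr,
			    unsigned long size)
{
	if (addr < tlb->start)
		tlb->start = addr;
	if (addr + size > tlb->end)
		tlb->end = addr + size;
}

static void toy_gather_flush(const struct toy_gather *tlb)
{
	if (tlb->start < tlb->end)
		printf("flush range 0x%lx - 0x%lx\n", tlb->start, tlb->end);
	else
		printf("nothing to flush\n");
}

int main(void)
{
	struct toy_gather tlb;

	toy_gather_init(&tlb);
	toy_gather_note(&tlb, 0x7f0000001000UL, 0x2000);
	toy_gather_note(&tlb, 0x7f0000004000UL, 0x1000);
	toy_gather_flush(&tlb);      /* 0x7f0000001000 - 0x7f0000005000 */
	return 0;
}
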
@@@ -1225,15 -1234,7 +1234,15 @@@ static inline unsigned long zap_pmd_ran
                next = pmd_addr_end(addr, end);
                if (pmd_trans_huge(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE) {
 -                              VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
 +#ifdef CONFIG_DEBUG_VM
 +                              if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
 +                                      pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
 +                                              __func__, addr, end,
 +                                              vma->vm_start,
 +                                              vma->vm_end);
 +                                      BUG();
 +                              }
 +#endif
                                split_huge_page_pmd(vma->vm_mm, pmd);
                        } else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
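
The CONFIG_DEBUG_VM block above replaces a silent VM_BUG_ON() with a message that records the faulting range and vma bounds before calling BUG(), so the eventual crash dump carries enough context to debug. A userspace analogue of that print-then-die pattern (GNU-style ##__VA_ARGS__, as used throughout the kernel; the macro name is invented):

#include <stdio.h>
#include <stdlib.h>

#define verbose_bug_on(cond, fmt, ...)					\
	do {								\
		if (cond) {						\
			fprintf(stderr, "%s: " fmt "\n",		\
				__func__, ##__VA_ARGS__);		\
			abort();					\
		}							\
	} while (0)

int main(void)
{
	unsigned long addr = 0x7f0000001000UL, end = 0x7f0000003000UL;
	int locked = 1;	/* flip to 0 to see the diagnostic fire */

	verbose_bug_on(!locked, "mmap_sem is unlocked! addr=0x%lx end=0x%lx",
		       addr, end);
	printf("check passed\n");
	return 0;
}
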
@@@ -1374,7 -1375,7 +1383,7 @@@ void unmap_vmas(struct mmu_gather *tlb
  /**
   * zap_page_range - remove user pages in a given range
   * @vma: vm_area_struct holding the applicable pages
 - * @address: starting address of pages to zap
 + * @start: starting address of pages to zap
   * @size: number of bytes to zap
   * @details: details of nonlinear truncation or shared cache invalidation
   *