Merge tag 'for-v3.11' of git://git.infradead.org/battery-2.6

[~andy/linux] / arch / x86 / kvm / mmu.c
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c

index 417f36b7c0e4f2e89d42512f4489dcd5511c37cb..0d094da49541d171e7218c7340e63dce6d35674c 100644 (file)
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -235,19 +235,22 @@ static unsigned int get_mmio_spte_generation(u64 spte)
  
  static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
  {
-       return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
+       /*
+        * Init kvm generation close to MMIO_MAX_GEN to easily test the
+        * code of handling generation number wrap-around.
+        */
+       return (kvm_memslots(kvm)->generation +
+                     MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
  }
  
  static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
                            unsigned access)
  {
-       struct kvm_mmu_page *sp =  page_header(__pa(sptep));
         unsigned int gen = kvm_current_mmio_generation(kvm);
         u64 mask = generation_mmio_spte_mask(gen);
  
         access &= ACC_WRITE_MASK | ACC_USER_MASK;
         mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
-       sp->mmio_cached = true;
  
         trace_mark_mmio_spte(sptep, gfn, access, gen);
         mmu_spte_set(sptep, mask);
@@ -463,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
  /*
   * The idea of using this lightweight way to get the spte on an x86_32 guest
   * comes from gup_get_pte (arch/x86/mm/gup.c).
- * The difference is we can not catch the spte tlb flush if we leave
- * guest mode, so we emulate it by increase clear_spte_count when spte
- * is cleared.
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running outside of the MMU lock.  Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte.  The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race.  This is done using clear_spte_count.
   */
  static u64 __get_spte_lockless(u64 *sptep)
  {
@@ -4359,24 +4373,6 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
         spin_unlock(&kvm->mmu_lock);
  }
  
-static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
-{
-       struct kvm_mmu_page *sp, *node;
-       LIST_HEAD(invalid_list);
-
-       spin_lock(&kvm->mmu_lock);
-restart:
-       list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-               if (!sp->mmio_cached)
-                       continue;
-               if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-                       goto restart;
-       }
-
-       kvm_mmu_commit_zap_page(kvm, &invalid_list);
-       spin_unlock(&kvm->mmu_lock);
-}
-
  static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
  {
         return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
@@ -4391,8 +4387,10 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
          * The max value is MMIO_MAX_GEN - 1 since it is not called
          * when a memslot is marked invalid.
          */
-       if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1)))
-               kvm_mmu_zap_mmio_sptes(kvm);
+       if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+               printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
+               kvm_mmu_invalidate_zap_all_pages(kvm);
+       }
  }
  
  static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)