Merge tag 'for-v3.11' of git://git.infradead.org/battery-2.6

[~andy/linux] / arch / x86 / kvm / mmu.c
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c

index 3fd060af539429a4e68474550cb2480f4d3f98e3..0d094da49541d171e7218c7340e63dce6d35674c 100644 (file)
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,15 +197,63 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
  }
  EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
  
-static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+/*
+ * Bits 3 ~ 11 of the spte hold the low 9 bits of the generation number;
+ * bits 52 ~ 61 of the spte hold the high 10 bits of the generation
+ * number.
+ */
+#define MMIO_SPTE_GEN_LOW_SHIFT                3
+#define MMIO_SPTE_GEN_HIGH_SHIFT       52
+
+#define MMIO_GEN_SHIFT                 19
+#define MMIO_GEN_LOW_SHIFT             9
+#define MMIO_GEN_LOW_MASK              ((1 << MMIO_GEN_LOW_SHIFT) - 1)
+#define MMIO_GEN_MASK                  ((1 << MMIO_GEN_SHIFT) - 1)
+#define MMIO_MAX_GEN                   ((1 << MMIO_GEN_SHIFT) - 1)
+
+static u64 generation_mmio_spte_mask(unsigned int gen)
  {
-       struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+       u64 mask;
+
+       WARN_ON(gen > MMIO_MAX_GEN);
+
+       mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
+       mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+       return mask;
+}
+
+static unsigned int get_mmio_spte_generation(u64 spte)
+{
+       unsigned int gen;
+
+       spte &= ~shadow_mmio_mask;
+
+       gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
+       gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
+       return gen;
+}
+
+static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+{
+       /*
+        * Init kvm generation close to MMIO_MAX_GEN to easily test the
+        * code of handling generation number wrap-around.
+        */
+       return (kvm_memslots(kvm)->generation +
+                     MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
+}
+
+static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+                          unsigned access)
+{
+       unsigned int gen = kvm_current_mmio_generation(kvm);
+       u64 mask = generation_mmio_spte_mask(gen);
  
         access &= ACC_WRITE_MASK | ACC_USER_MASK;
+       mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT;
  
-       sp->mmio_cached = true;
-       trace_mark_mmio_spte(sptep, gfn, access);
-       mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+       trace_mark_mmio_spte(sptep, gfn, access, gen);
+       mmu_spte_set(sptep, mask);
  }
  
  static bool is_mmio_spte(u64 spte)
@@ -215,24 +263,38 @@ static bool is_mmio_spte(u64 spte)
  
  static gfn_t get_mmio_spte_gfn(u64 spte)
  {
-       return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+       u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+       return (spte & ~mask) >> PAGE_SHIFT;
  }
  
  static unsigned get_mmio_spte_access(u64 spte)
  {
-       return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+       u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+       return (spte & ~mask) & ~PAGE_MASK;
  }
  
-static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+                         pfn_t pfn, unsigned access)
  {
         if (unlikely(is_noslot_pfn(pfn))) {
-               mark_mmio_spte(sptep, gfn, access);
+               mark_mmio_spte(kvm, sptep, gfn, access);
                 return true;
         }
  
         return false;
  }
  
+static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+{
+       unsigned int kvm_gen, spte_gen;
+
+       kvm_gen = kvm_current_mmio_generation(kvm);
+       spte_gen = get_mmio_spte_generation(spte);
+
+       trace_check_mmio_spte(spte, kvm_gen, spte_gen);
+       return likely(kvm_gen == spte_gen);
+}
+
  static inline u64 rsvd_bits(int s, int e)
  {
         return ((1ULL << (e - s + 1)) - 1) << s;
@@ -404,9 +466,20 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
  /*
   * The idea using the light way get the spte on x86_32 guest is from
   * gup_get_pte(arch/x86/mm/gup.c).
- * The difference is we can not catch the spte tlb flush if we leave
- * guest mode, so we emulate it by increase clear_spte_count when spte
- * is cleared.
+ *
+ * An spte tlb flush may be pending, because kvm_set_pte_rmapp
+ * coalesces them and we are running outside the MMU lock.  Therefore
+ * we need to protect against in-progress updates of the spte.
+ *
+ * Reading the spte while an update is in progress may get the old value
+ * for the high part of the spte.  The race is fine for a present->non-present
+ * change (because the high part of the spte is ignored for non-present spte),
+ * but for a present->present change we must reread the spte.
+ *
+ * All such changes are done in two steps (present->non-present and
+ * non-present->present), hence it is enough to count the number of
+ * present->non-present updates: if it changed while reading the spte,
+ * we might have hit the race.  This is done using clear_spte_count.
   */
  static u64 __get_spte_lockless(u64 *sptep)
  {
@@ -1654,6 +1727,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
  static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                     struct list_head *invalid_list);
  
+/*
+ * NOTE: pay extra attention to the zapped-obsolete page
+ * (is_obsolete_sp(sp) && sp->role.invalid) when walking the hash list,
+ * since it has been deleted from active_mmu_pages but can still be found
+ * on the hash list.
+ *
+ * for_each_gfn_indirect_valid_sp skips that kind of page, and
+ * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), skips
+ * all the obsolete pages.
+ */
  #define for_each_gfn_sp(_kvm, _sp, _gfn)                               \
         hlist_for_each_entry(_sp,                                       \
           &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -1875,6 +1958,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                 role.quadrant = quadrant;
         }
         for_each_gfn_sp(vcpu->kvm, sp, gfn) {
+               if (is_obsolete_sp(vcpu->kvm, sp))
+                       continue;
+
                 if (!need_sync && sp->unsync)
                         need_sync = true;
  
@@ -2095,7 +2181,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
                 kvm_mod_used_mmu_pages(kvm, -1);
         } else {
                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
-               kvm_reload_remote_mmus(kvm);
+
+               /*
+                * The obsolete pages can not be used on any vcpus.
+                * See the comments in kvm_mmu_invalidate_zap_all_pages().
+                */
+               if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+                       kvm_reload_remote_mmus(kvm);
         }
  
         sp->role.invalid = 1;
@@ -2345,7 +2437,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
         u64 spte;
         int ret = 0;
  
-       if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+       if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
                 return 0;
  
         spte = PT_PRESENT_MASK;
@@ -3165,17 +3257,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
         return spte;
  }
  
-/*
- * If it is a real mmio page fault, return 1 and emulat the instruction
- * directly, return 0 to let CPU fault again on the address, -1 is
- * returned if bug is detected.
- */
  int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
  {
         u64 spte;
  
         if (quickly_check_mmio_pf(vcpu, addr, direct))
-               return 1;
+               return RET_MMIO_PF_EMULATE;
  
         spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
  
@@ -3183,12 +3270,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
                 gfn_t gfn = get_mmio_spte_gfn(spte);
                 unsigned access = get_mmio_spte_access(spte);
  
+               if (!check_mmio_spte(vcpu->kvm, spte))
+                       return RET_MMIO_PF_INVALID;
+
                 if (direct)
                         addr = 0;
  
                 trace_handle_mmio_page_fault(addr, gfn, access);
                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
-               return 1;
+               return RET_MMIO_PF_EMULATE;
         }
  
         /*
@@ -3196,13 +3286,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
          * it's a BUG if the gfn is not a mmio page.
          */
         if (direct && !check_direct_spte_mmio_pf(spte))
-               return -1;
+               return RET_MMIO_PF_BUG;
  
         /*
          * If the page table is zapped by other cpus, let CPU fault again on
          * the address.
          */
-       return 0;
+       return RET_MMIO_PF_RETRY;
  }
  EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
  
@@ -3212,7 +3302,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
         int ret;
  
         ret = handle_mmio_page_fault_common(vcpu, addr, direct);
-       WARN_ON(ret < 0);
+       WARN_ON(ret == RET_MMIO_PF_BUG);
         return ret;
  }
  
@@ -3224,8 +3314,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
  
         pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
  
-       if (unlikely(error_code & PFERR_RSVD_MASK))
-               return handle_mmio_page_fault(vcpu, gva, error_code, true);
+       if (unlikely(error_code & PFERR_RSVD_MASK)) {
+               r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+
+               if (likely(r != RET_MMIO_PF_INVALID))
+                       return r;
+       }
  
         r = mmu_topup_memory_caches(vcpu);
         if (r)
@@ -3301,8 +3395,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
         ASSERT(vcpu);
         ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
  
-       if (unlikely(error_code & PFERR_RSVD_MASK))
-               return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+       if (unlikely(error_code & PFERR_RSVD_MASK)) {
+               r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
+               if (likely(r != RET_MMIO_PF_INVALID))
+                       return r;
+       }
  
         r = mmu_topup_memory_caches(vcpu);
         if (r)
@@ -3408,8 +3506,8 @@ static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
         *access &= mask;
  }
  
-static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
-                          int *nr_present)
+static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+                          unsigned access, int *nr_present)
  {
         if (unlikely(is_mmio_spte(*sptep))) {
                 if (gfn != get_mmio_spte_gfn(*sptep)) {
@@ -3418,7 +3516,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
                 }
  
                 (*nr_present)++;
-               mark_mmio_spte(sptep, gfn, access);
+               mark_mmio_spte(kvm, sptep, gfn, access);
                 return true;
         }
  
@@ -4194,14 +4292,17 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
         spin_unlock(&kvm->mmu_lock);
  }
  
+#define BATCH_ZAP_PAGES        10
  static void kvm_zap_obsolete_pages(struct kvm *kvm)
  {
         struct kvm_mmu_page *sp, *node;
-       LIST_HEAD(invalid_list);
+       int batch = 0;
  
  restart:
         list_for_each_entry_safe_reverse(sp, node,
               &kvm->arch.active_mmu_pages, link) {
+               int ret;
+
                 /*
                  * No obsolete page exists before new created page since
                  * active_mmu_pages is the FIFO list.
@@ -4210,28 +4311,6 @@ restart:
                         break;
  
                 /*
-                * Do not repeatedly zap a root page to avoid unnecessary
-                * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
-                * progress:
-                *    vcpu 0                        vcpu 1
-                *                         call vcpu_enter_guest():
-                *                            1): handle KVM_REQ_MMU_RELOAD
-                *                                and require mmu-lock to
-                *                                load mmu
-                * repeat:
-                *    1): zap root page and
-                *        send KVM_REQ_MMU_RELOAD
-                *
-                *    2): if (cond_resched_lock(mmu-lock))
-                *
-                *                            2): hold mmu-lock and load mmu
-                *
-                *                            3): see KVM_REQ_MMU_RELOAD bit
-                *                                on vcpu->requests is set
-                *                                then return 1 to call
-                *                                vcpu_enter_guest() again.
-                *            goto repeat;
-                *
                  * Since we are reversely walking the list and the invalid
                  * list will be moved to the head, skip the invalid page
                  * can help us to avoid the infinity list walking.
@@ -4239,17 +4318,29 @@ restart:
                 if (sp->role.invalid)
                         continue;
  
-               if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
-                       kvm_mmu_commit_zap_page(kvm, &invalid_list);
-                       cond_resched_lock(&kvm->mmu_lock);
+               /*
+                * Need not flush tlb since we only zap the sp with invalid
+                * generation number.
+                */
+               if (batch >= BATCH_ZAP_PAGES &&
+                     cond_resched_lock(&kvm->mmu_lock)) {
+                       batch = 0;
                         goto restart;
                 }
  
-               if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+               ret = kvm_mmu_prepare_zap_page(kvm, sp,
+                               &kvm->arch.zapped_obsolete_pages);
+               batch += ret;
+
+               if (ret)
                         goto restart;
         }
  
-       kvm_mmu_commit_zap_page(kvm, &invalid_list);
+       /*
+        * The tlb should be flushed before freeing page tables, since
+        * lockless-walking may use the pages.
+        */
+       kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
  }
  
  /*
@@ -4267,26 +4358,39 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
         trace_kvm_mmu_invalidate_zap_all_pages(kvm);
         kvm->arch.mmu_valid_gen++;
  
+       /*
+        * Notify all vcpus to reload their shadow page tables
+        * and flush the TLB. All vcpus will then switch to a new
+        * shadow page table with the new mmu_valid_gen.
+        *
+        * Note: this should be done under the protection of
+        * mmu-lock; otherwise, a vcpu would purge shadow pages
+        * but miss the tlb flush.
+        */
+       kvm_reload_remote_mmus(kvm);
+
         kvm_zap_obsolete_pages(kvm);
         spin_unlock(&kvm->mmu_lock);
  }
  
-void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
+static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
  {
-       struct kvm_mmu_page *sp, *node;
-       LIST_HEAD(invalid_list);
+       return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
+}
  
-       spin_lock(&kvm->mmu_lock);
-restart:
-       list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-               if (!sp->mmio_cached)
-                       continue;
-               if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-                       goto restart;
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
+{
+       /*
+        * The very rare case: if the generation number wraps around,
+        * zap all shadow pages.
+        *
+        * The max value is MMIO_MAX_GEN - 1 since it is not called
+        * when a memslot is marked invalid.
+        */
+       if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) {
+               printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
+               kvm_mmu_invalidate_zap_all_pages(kvm);
         }
-
-       kvm_mmu_commit_zap_page(kvm, &invalid_list);
-       spin_unlock(&kvm->mmu_lock);
  }
  
  static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
@@ -4317,15 +4421,23 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
                  * want to shrink a VM that only started to populate its MMU
                  * anyway.
                  */
-               if (!kvm->arch.n_used_mmu_pages)
+               if (!kvm->arch.n_used_mmu_pages &&
+                     !kvm_has_zapped_obsolete_pages(kvm))
                         continue;
  
                 idx = srcu_read_lock(&kvm->srcu);
                 spin_lock(&kvm->mmu_lock);
  
+               if (kvm_has_zapped_obsolete_pages(kvm)) {
+                       kvm_mmu_commit_zap_page(kvm,
+                             &kvm->arch.zapped_obsolete_pages);
+                       goto unlock;
+               }
+
                 prepare_zap_oldest_mmu_page(kvm, &invalid_list);
                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
  
+unlock:
                 spin_unlock(&kvm->mmu_lock);
                 srcu_read_unlock(&kvm->srcu, idx);