diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 064d0be67ecc23734aa465541138d9b5be295277..1f1da43ff2a2ca66a137c434cf738dbf7a03e704 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -373,6 +373,7 @@ struct nested_vmx {
         * we must keep them pinned while L2 runs.
         */
        struct page *apic_access_page;
+       u64 msr_ia32_feature_control;
 };
 
 #define POSTED_INTR_ON  0
@@ -711,10 +712,10 @@ static void nested_release_page_clean(struct page *page)
        kvm_release_page_clean(page);
 }
 
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
@@ -1039,12 +1040,16 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
                (vmcs12->secondary_vm_exec_control & bit);
 }
 
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
-       struct kvm_vcpu *vcpu)
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
 {
        return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
+{
+       return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
+}
+
 static inline bool is_exception(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -2155,6 +2160,7 @@ static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
 static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
 static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
 static u32 nested_vmx_misc_low, nested_vmx_misc_high;
+static u32 nested_vmx_ept_caps;
 static __init void nested_vmx_setup_ctls_msrs(void)
 {
        /*
@@ -2190,14 +2196,17 @@ static __init void nested_vmx_setup_ctls_msrs(void)
         * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and
         * 17 must be 1.
         */
+       rdmsr(MSR_IA32_VMX_EXIT_CTLS,
+               nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high);
        nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
        /* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
+       nested_vmx_exit_ctls_high &=
 #ifdef CONFIG_X86_64
-       nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#else
-       nested_vmx_exit_ctls_high = 0;
+               VM_EXIT_HOST_ADDR_SPACE_SIZE |
 #endif
-       nested_vmx_exit_ctls_high |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+               VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+       nested_vmx_exit_ctls_high |= (VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
+                                     VM_EXIT_LOAD_IA32_EFER);
 
        /* entry controls */
        rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
@@ -2205,8 +2214,12 @@ static __init void nested_vmx_setup_ctls_msrs(void)
        /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */
        nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
        nested_vmx_entry_ctls_high &=
-               VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
-       nested_vmx_entry_ctls_high |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+#ifdef CONFIG_X86_64
+               VM_ENTRY_IA32E_MODE |
+#endif
+               VM_ENTRY_LOAD_IA32_PAT;
+       nested_vmx_entry_ctls_high |= (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR |
+                                      VM_ENTRY_LOAD_IA32_EFER);
 
        /* cpu-based controls */
        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
@@ -2241,6 +2254,22 @@ static __init void nested_vmx_setup_ctls_msrs(void)
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                SECONDARY_EXEC_WBINVD_EXITING;
 
+       if (enable_ept) {
+               /* nested EPT: emulate EPT also to L1 */
+               nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+               nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
+                        VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
+               nested_vmx_ept_caps &= vmx_capability.ept;
+               /*
+                * Since invept is completely emulated, we support both global
+                * and context invalidation independent of what the host CPU
+                * supports
+                */
+               nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
+                       VMX_EPT_EXTENT_CONTEXT_BIT;
+       } else
+               nested_vmx_ept_caps = 0;
+
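
Every control-MSR block in this function follows the same pattern: rdmsr() returns the allowed-0 ("must be 1") bits in the low word and the allowed-1 ("may be 1") bits in the high word; the high word is then trimmed to the controls KVM can emulate for L1, and the low word is forced to the always-on set (some blocks, such as the exit controls above, assign that value outright instead of ORing it in). A minimal sketch of the pattern, using a hypothetical helper name:

	static void nested_mask_ctl_msr(u32 msr, u32 *low, u32 *high,
					u32 always_on, u32 emulatable)
	{
		u32 lo, hi;

		rdmsr(msr, lo, hi);		/* lo = allowed-0, hi = allowed-1 */
		*high = hi & emulatable;	/* never advertise what we cannot emulate */
		*low  = lo | always_on;		/* bits L1 must keep set */
	}
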
        /* miscellaneous data */
        rdmsr(MSR_IA32_VMX_MISC, nested_vmx_misc_low, nested_vmx_misc_high);
        nested_vmx_misc_low &= VMX_MISC_PREEMPTION_TIMER_RATE_MASK |
@@ -2282,8 +2311,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 
        switch (msr_index) {
        case MSR_IA32_FEATURE_CONTROL:
-               *pdata = 0;
-               break;
+               if (nested_vmx_allowed(vcpu)) {
+                       *pdata = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+                       break;
+               }
+               return 0;
        case MSR_IA32_VMX_BASIC:
                /*
                 * This MSR reports some information about VMX support. We
@@ -2346,8 +2378,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                                        nested_vmx_secondary_ctls_high);
                break;
        case MSR_IA32_VMX_EPT_VPID_CAP:
-               /* Currently, no nested ept or nested vpid */
-               *pdata = 0;
+               /* Currently, no nested vpid support */
+               *pdata = nested_vmx_ept_caps;
                break;
        default:
                return 0;
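
When L1 reads one of these capability MSRs, each low/high pair set up in nested_vmx_setup_ctls_msrs() is packed into a single 64-bit value, allowed-0 bits in the low half and allowed-1 bits in the high half; a helper of roughly this shape (vmx_control_msr()) already exists in this file. The new MSR_IA32_VMX_EPT_VPID_CAP case is simpler because it is a plain capability bitmap with no allowed-0 half, so nested_vmx_ept_caps is returned directly.

	static inline u64 vmx_control_msr(u32 low, u32 high)
	{
		return low | ((u64)high << 32);
	}

	/* e.g.: *pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
	 *                                nested_vmx_entry_ctls_high); */
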
@@ -2356,14 +2388,24 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
        return 1;
 }
 
-static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
+       u32 msr_index = msr_info->index;
+       u64 data = msr_info->data;
+       bool host_initialized = msr_info->host_initiated;
+
        if (!nested_vmx_allowed(vcpu))
                return 0;
 
-       if (msr_index == MSR_IA32_FEATURE_CONTROL)
-               /* TODO: the right thing. */
+       if (msr_index == MSR_IA32_FEATURE_CONTROL) {
+               if (!host_initialized &&
+                               to_vmx(vcpu)->nested.msr_ia32_feature_control
+                               & FEATURE_CONTROL_LOCKED)
+                       return 0;
+               to_vmx(vcpu)->nested.msr_ia32_feature_control = data;
                return 1;
+       }
+
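
The host_initiated flag matters mostly for save/restore: when userspace replays a migrated guest's MSRs through KVM_SET_MSRS, the write must be accepted even though the saved value already carries FEATURE_CONTROL_LOCKED, while the same value arriving via a guest WRMSR after locking is refused. A rough host-side sketch (the concrete values are only an example):

	struct msr_data msr = {
		.host_initiated	= true,		/* ioctl path, not a guest WRMSR */
		.index		= MSR_IA32_FEATURE_CONTROL,
		.data		= FEATURE_CONTROL_LOCKED |
				  FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX,
	};

	vmx_set_vmx_msr(vcpu, &msr);		/* accepted despite the lock bit */
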
        /*
         * No need to treat VMX capability MSRs specially: If we don't handle
         * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
@@ -2494,7 +2536,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        return 1;
                /* Otherwise falls through */
        default:
-               if (vmx_set_vmx_msr(vcpu, msr_index, data))
+               if (vmx_set_vmx_msr(vcpu, msr_info))
                        break;
                msr = find_msr_entry(vmx, msr_index);
                if (msr) {
@@ -5302,9 +5344,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
        /* It is a write fault? */
        error_code = exit_qualification & (1U << 1);
+       /* It is a fetch fault? */
+       error_code |= (exit_qualification & (1U << 2)) << 2;
        /* ept page table is present? */
        error_code |= (exit_qualification >> 3) & 0x1;
 
+       vcpu->arch.exit_qualification = exit_qualification;
+
        return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
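
The shifts above only move bits between two different layouts: the EPT exit qualification on one side and the x86 page-fault error code expected by kvm_mmu_page_fault() on the other. Assuming the usual PFERR_* positions (present = bit 0, write = bit 1, fetch = bit 4), the computation is equivalent to this sketch:

	u32 error_code = 0;

	if (exit_qualification & (1U << 1))		/* write access */
		error_code |= PFERR_WRITE_MASK;		/* bit 1 */
	if (exit_qualification & (1U << 2))		/* instruction fetch */
		error_code |= PFERR_FETCH_MASK;		/* bit 4 */
	if (exit_qualification & (1U << 3))		/* EPT entry present */
		error_code |= PFERR_PRESENT_MASK;	/* bit 0 */
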
 
@@ -5438,7 +5484,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 
                err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
 
-               if (err == EMULATE_DO_MMIO) {
+               if (err == EMULATE_USER_EXIT) {
+                       ++vcpu->stat.mmio_exits;
                        ret = 0;
                        goto out;
                }
@@ -5567,8 +5614,47 @@ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
                free_loaded_vmcs(&vmx->vmcs01);
 }
 
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions".
+ */
+static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+       vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+                       & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+                           X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+}
+
+static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+       vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+                       & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+                           X86_EFLAGS_SF | X86_EFLAGS_OF))
+                       | X86_EFLAGS_CF);
+}
+
 static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-                                u32 vm_instruction_error);
+                                       u32 vm_instruction_error)
+{
+       if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
+               /*
+                * failValid writes the error number to the current VMCS, which
+                * can't be done if there isn't a current VMCS.
+                */
+               nested_vmx_failInvalid(vcpu);
+               return;
+       }
+       vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+                       & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+                           X86_EFLAGS_SF | X86_EFLAGS_OF))
+                       | X86_EFLAGS_ZF);
+       get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+       /*
+        * We don't need to force a shadow sync because
+        * VM_INSTRUCTION_ERROR is not shadowed
+        */
+}
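
These three helpers implement the flag convention from the SDM: all arithmetic flags clear means VMsucceed, CF alone means VMfailInvalid (there is no current VMCS, so no error number can be stored), and ZF alone means VMfailValid with the error number in VM_INSTRUCTION_ERROR. A sketch of how an L1 hypervisor would typically decode the result (the helper is hypothetical):

	static int vmx_insn_result(unsigned long rflags, u32 insn_error)
	{
		if (rflags & X86_EFLAGS_CF)
			return -1;		/* VMfailInvalid: no error number available */
		if (rflags & X86_EFLAGS_ZF)
			return insn_error;	/* VMfailValid: VM_INSTRUCTION_ERROR value */
		return 0;			/* VMsucceed */
	}
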
 
 /*
  * Emulate the VMXON instruction.
@@ -5583,6 +5669,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
        struct kvm_segment cs;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs *shadow_vmcs;
+       const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
+               | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 
        /* The Intel VMX Instruction Reference lists a bunch of bits that
         * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5611,6 +5699,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
                skip_emulated_instruction(vcpu);
                return 1;
        }
+
+       if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+                       != VMXON_NEEDED_FEATURES) {
+               kvm_inject_gp(vcpu, 0);
+               return 1;
+       }
+
        if (enable_shadow_vmcs) {
                shadow_vmcs = alloc_vmcs();
                if (!shadow_vmcs)
@@ -5628,6 +5723,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
        vmx->nested.vmxon = true;
 
        skip_emulated_instruction(vcpu);
+       nested_vmx_succeed(vcpu);
        return 1;
 }
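
With the VMXON_NEEDED_FEATURES check above, an L1 kernel must program the emulated feature-control MSR the same way firmware does on bare metal before its VMXON can succeed. A guest-side sketch, assuming the standard msr-index.h names:

	u64 fc;

	rdmsrl(MSR_IA32_FEATURE_CONTROL, fc);
	if (!(fc & FEATURE_CONTROL_LOCKED)) {
		fc |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX |
		      FEATURE_CONTROL_LOCKED;
		wrmsrl(MSR_IA32_FEATURE_CONTROL, fc);	/* allowed while still unlocked */
	}
	/* VMXON now takes #GP(0) unless both bits ended up set */
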
 
@@ -5712,6 +5808,7 @@ static int handle_vmoff(struct kvm_vcpu *vcpu)
                return 1;
        free_nested(to_vmx(vcpu));
        skip_emulated_instruction(vcpu);
+       nested_vmx_succeed(vcpu);
        return 1;
 }
 
@@ -5768,48 +5865,6 @@ static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
        return 0;
 }
 
-/*
- * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
- * set the success or error code of an emulated VMX instruction, as specified
- * by Vol 2B, VMX Instruction Reference, "Conventions".
- */
-static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
-{
-       vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
-                       & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
-                           X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
-}
-
-static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
-{
-       vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
-                       & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
-                           X86_EFLAGS_SF | X86_EFLAGS_OF))
-                       | X86_EFLAGS_CF);
-}
-
-static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
-                                       u32 vm_instruction_error)
-{
-       if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
-               /*
-                * failValid writes the error number to the current VMCS, which
-                * can't be done there isn't a current VMCS.
-                */
-               nested_vmx_failInvalid(vcpu);
-               return;
-       }
-       vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
-                       & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
-                           X86_EFLAGS_SF | X86_EFLAGS_OF))
-                       | X86_EFLAGS_ZF);
-       get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
-       /*
-        * We don't need to force a shadow sync because
-        * VM_INSTRUCTION_ERROR is not shadowed
-        */
-}
-
 /* Emulate the VMCLEAR instruction */
 static int handle_vmclear(struct kvm_vcpu *vcpu)
 {
@@ -5972,8 +6027,8 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
        unsigned long field;
        u64 field_value;
        struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
-       unsigned long *fields = (unsigned long *)shadow_read_write_fields;
-       int num_fields = max_shadow_read_write_fields;
+       const unsigned long *fields = shadow_read_write_fields;
+       const int num_fields = max_shadow_read_write_fields;
 
        vmcs_load(shadow_vmcs);
 
@@ -6002,12 +6057,11 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 {
-       unsigned long *fields[] = {
-               (unsigned long *)shadow_read_write_fields,
-               (unsigned long *)shadow_read_only_fields
+       const unsigned long *fields[] = {
+               shadow_read_write_fields,
+               shadow_read_only_fields
        };
-       int num_lists =  ARRAY_SIZE(fields);
-       int max_fields[] = {
+       const int max_fields[] = {
                max_shadow_read_write_fields,
                max_shadow_read_only_fields
        };
@@ -6018,7 +6072,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 
        vmcs_load(shadow_vmcs);
 
-       for (q = 0; q < num_lists; q++) {
+       for (q = 0; q < ARRAY_SIZE(fields); q++) {
                for (i = 0; i < max_fields[q]; i++) {
                        field = fields[q][i];
                        vmcs12_read_any(&vmx->vcpu, field, &field_value);
@@ -6248,6 +6302,74 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+/* Emulate the INVEPT instruction */
+static int handle_invept(struct kvm_vcpu *vcpu)
+{
+       u32 vmx_instruction_info, types;
+       unsigned long type;
+       gva_t gva;
+       struct x86_exception e;
+       struct {
+               u64 eptp, gpa;
+       } operand;
+       u64 eptp_mask = ((1ull << 51) - 1) & PAGE_MASK;
+
+       if (!(nested_vmx_secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) ||
+           !(nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       if (!nested_vmx_check_permission(vcpu))
+               return 1;
+
+       if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+       type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+       types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
+
+       if (!(types & (1UL << type))) {
+               nested_vmx_failValid(vcpu,
+                               VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+               return 1;
+       }
+
+       /* According to the Intel VMX instruction reference, the memory
+        * operand is read even if it isn't needed (e.g., for type==global)
+        */
+       if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+                       vmx_instruction_info, &gva))
+               return 1;
+       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
+                               sizeof(operand), &e)) {
+               kvm_inject_page_fault(vcpu, &e);
+               return 1;
+       }
+
+       switch (type) {
+       case VMX_EPT_EXTENT_CONTEXT:
+               if ((operand.eptp & eptp_mask) !=
+                               (nested_ept_get_cr3(vcpu) & eptp_mask))
+                       break;
+       case VMX_EPT_EXTENT_GLOBAL:
+               kvm_mmu_sync_roots(vcpu);
+               kvm_mmu_flush_tlb(vcpu);
+               nested_vmx_succeed(vcpu);
+               break;
+       default:
+               BUG_ON(1);
+               break;
+       }
+
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
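
The types check relies on how the INVEPT capability bits line up with the INVEPT type encodings; a worked example under the constants this file uses (VMX_EPT_EXTENT_SHIFT is 24, with the single-context and global capability bits at positions 25 and 26):

	/*
	 * With both extents advertised, nested_vmx_ept_caps has bits 25
	 * (single-context) and 26 (global) set, so
	 *     types = (caps >> VMX_EPT_EXTENT_SHIFT) & 6 = 0b110.
	 * VMX_EPT_EXTENT_CONTEXT == 1 tests 1UL << 1 = 0b010 -> allowed;
	 * VMX_EPT_EXTENT_GLOBAL  == 2 tests 1UL << 2 = 0b100 -> allowed;
	 * any other type misses the mask and fails with
	 * VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID.
	 */
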
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -6292,6 +6414,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
        [EXIT_REASON_MWAIT_INSTRUCTION]       = handle_invalid_op,
        [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_invalid_op,
+       [EXIT_REASON_INVEPT]                  = handle_invept,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -6518,6 +6641,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
        case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
        case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
        case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+       case EXIT_REASON_INVEPT:
                /*
                 * VMX instructions trap unconditionally. This allows L1 to
                 * emulate them for its L2 guest, i.e., allows 3-level nesting!
@@ -6550,7 +6674,20 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                return nested_cpu_has2(vmcs12,
                        SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
        case EXIT_REASON_EPT_VIOLATION:
+               /*
+                * L0 always deals with the EPT violation. If nested EPT is
+                * used, and the nested mmu code discovers that the address is
+                * missing in the guest EPT table (EPT12), the EPT violation
+                * will be injected with nested_ept_inject_page_fault()
+                */
+               return 0;
        case EXIT_REASON_EPT_MISCONFIG:
+               /*
+                * L2 never directly uses L1's EPT, but rather L0's own EPT
+                * table (shadow on EPT) or a merged EPT table that L0 built
+                * (EPT on EPT). So any problem with the structure of the
+                * table is L0's fault.
+                */
                return 0;
        case EXIT_REASON_PREEMPTION_TIMER:
                return vmcs12->pin_based_vm_exec_control &
@@ -6638,7 +6775,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 
        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
            !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-                                       get_vmcs12(vcpu), vcpu)))) {
+                                       get_vmcs12(vcpu))))) {
                if (vmx_interrupt_allowed(vcpu)) {
                        vmx->soft_vnmi_blocked = 0;
                } else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -7326,6 +7463,48 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
                entry->ecx |= bit(X86_FEATURE_VMX);
 }
 
+static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
+               struct x86_exception *fault)
+{
+       struct vmcs12 *vmcs12;
+       nested_vmx_vmexit(vcpu);
+       vmcs12 = get_vmcs12(vcpu);
+
+       if (fault->error_code & PFERR_RSVD_MASK)
+               vmcs12->vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
+       else
+               vmcs12->vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
+       vmcs12->exit_qualification = vcpu->arch.exit_qualification;
+       vmcs12->guest_physical_address = fault->address;
+}
+
+/* Callbacks for nested_ept_init_mmu_context: */
+
+static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+{
+       /* return the page table to be shadowed - in our case, EPT12 */
+       return get_vmcs12(vcpu)->ept_pointer;
+}
+
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+{
+       int r = kvm_init_shadow_ept_mmu(vcpu, &vcpu->arch.mmu,
+                       nested_vmx_ept_caps & VMX_EPT_EXECUTE_ONLY_BIT);
+
+       vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
+       vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
+       vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
+
+       vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+
+       return r;
+}
+
+static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
+{
+       vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+}
+
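
Taken together, these callbacks make vcpu->arch.mmu shadow L1's EPT12 while L2 runs: get_cr3() hands the paging code vmcs12->ept_pointer to walk, set_cr3() loads the resulting shadow-EPT root into hardware, and a guest-physical address that EPT12 cannot translate is reflected back to L1 instead of being fixed up by L0. A rough sketch of that reflection path (the local variables are purely illustrative):

	gpa_t l2_gpa = 0x1000;			/* example GPA missing from EPT12 */
	struct x86_exception fault = {
		.address	= l2_gpa,
		.error_code	= 0,		/* no PFERR_RSVD -> EPT_VIOLATION */
	};

	/* lands in nested_ept_inject_page_fault(), which performs a nested
	 * vmexit and fills vmcs12->exit_qualification and
	 * vmcs12->guest_physical_address for L1 */
	vcpu->arch.mmu.inject_page_fault(vcpu, &fault);
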
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -7388,7 +7567,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                vmcs12->guest_interruptibility_info);
        vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
        kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
-       vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
+       vmx_set_rflags(vcpu, vmcs12->guest_rflags);
        vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
                vmcs12->guest_pending_dbg_exceptions);
        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
@@ -7508,15 +7687,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
        vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 
-       /* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
-       vmcs_write32(VM_EXIT_CONTROLS,
-               vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
-       vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+       /* L2->L1 exit controls are emulated - the hardware exit is to L0 so
+        * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
+        * bits are further modified by vmx_set_efer() below.
+        */
+       vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
+
+       /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
+        * emulated by vmx_set_efer(), below.
+        */
+       vmcs_write32(VM_ENTRY_CONTROLS,
+               (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
+                       ~VM_ENTRY_IA32E_MODE) |
                (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
 
-       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
+       if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
                vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
-       else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+               vcpu->arch.pat = vmcs12->guest_ia32_pat;
+       } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
                vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
 
 
@@ -7538,6 +7726,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                vmx_flush_tlb(vcpu);
        }
 
+       if (nested_cpu_has_ept(vmcs12)) {
+               kvm_mmu_unload(vcpu);
+               nested_ept_init_mmu_context(vcpu);
+       }
+
        if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
                vcpu->arch.efer = vmcs12->guest_ia32_efer;
        else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
@@ -7565,6 +7758,16 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        kvm_set_cr3(vcpu, vmcs12->guest_cr3);
        kvm_mmu_reset_context(vcpu);
 
+       /*
+        * L1 may access L2's PDPTRs, so save them to construct vmcs12
+        */
+       if (enable_ept) {
+               vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
+               vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
+               vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
+               vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
+       }
+
        kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
        kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
 }
@@ -7887,6 +8090,22 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs12->guest_pending_dbg_exceptions =
                vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 
+       /*
+        * In some cases (usually, nested EPT), L2 is allowed to change its
+        * own CR3 without exiting. If it has changed it, we must keep it.
+        * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
+        * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
+        *
+        * Additionally, restore L2's PDPTR to vmcs12.
+        */
+       if (enable_ept) {
+               vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
+               vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
+               vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
+               vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
+               vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
+       }
+
        vmcs12->vm_entry_controls =
                (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
                (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
@@ -7948,6 +8167,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
                                   struct vmcs12 *vmcs12)
 {
+       struct kvm_segment seg;
+
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
                vcpu->arch.efer = vmcs12->host_ia32_efer;
        else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
@@ -7982,7 +8203,9 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
        kvm_set_cr4(vcpu, vmcs12->host_cr4);
 
-       /* shadow page tables on either EPT or shadow page tables */
+       if (nested_cpu_has_ept(vmcs12))
+               nested_ept_uninit_mmu_context(vcpu);
+
        kvm_set_cr3(vcpu, vmcs12->host_cr3);
        kvm_mmu_reset_context(vcpu);
 
@@ -8001,23 +8224,61 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
        vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
        vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
        vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
-       vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
-       vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
-       vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
-       vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
-       vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
-       vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
-       vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
-       vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
-       vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
-       vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
-
-       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
+
+       if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
                vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+               vcpu->arch.pat = vmcs12->host_ia32_pat;
+       }
        if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
                vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
                        vmcs12->host_ia32_perf_global_ctrl);
 
+       /* Set L1 segment info according to Intel SDM
+           27.5.2 Loading Host Segment and Descriptor-Table Registers */
+       seg = (struct kvm_segment) {
+               .base = 0,
+               .limit = 0xFFFFFFFF,
+               .selector = vmcs12->host_cs_selector,
+               .type = 11,
+               .present = 1,
+               .s = 1,
+               .g = 1
+       };
+       if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+               seg.l = 1;
+       else
+               seg.db = 1;
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
+       seg = (struct kvm_segment) {
+               .base = 0,
+               .limit = 0xFFFFFFFF,
+               .type = 3,
+               .present = 1,
+               .s = 1,
+               .db = 1,
+               .g = 1
+       };
+       seg.selector = vmcs12->host_ds_selector;
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
+       seg.selector = vmcs12->host_es_selector;
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
+       seg.selector = vmcs12->host_ss_selector;
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
+       seg.selector = vmcs12->host_fs_selector;
+       seg.base = vmcs12->host_fs_base;
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
+       seg.selector = vmcs12->host_gs_selector;
+       seg.base = vmcs12->host_gs_base;
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
+       seg = (struct kvm_segment) {
+               .base = vmcs12->host_tr_base,
+               .limit = 0x67,
+               .selector = vmcs12->host_tr_selector,
+               .type = 11,
+               .present = 1
+       };
+       vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
+
        kvm_set_dr(vcpu, 7, 0x400);
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
 }