1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "irq.h"
19 #include "mmu.h"
20
21 #include <linux/kvm_host.h>
22 #include <linux/module.h>
23 #include <linux/kernel.h>
24 #include <linux/mm.h>
25 #include <linux/highmem.h>
26 #include <linux/sched.h>
27 #include <linux/moduleparam.h>
28 #include "kvm_cache_regs.h"
29 #include "x86.h"
30
31 #include <asm/io.h>
32 #include <asm/desc.h>
33 #include <asm/vmx.h>
34 #include <asm/virtext.h>
35 #include <asm/mce.h>
36
37 #define __ex(x) __kvm_handle_fault_on_reboot(x)
38
39 MODULE_AUTHOR("Qumranet");
40 MODULE_LICENSE("GPL");
41
42 static int __read_mostly bypass_guest_pf = 1;
43 module_param(bypass_guest_pf, bool, S_IRUGO);
44
45 static int __read_mostly enable_vpid = 1;
46 module_param_named(vpid, enable_vpid, bool, 0444);
47
48 static int __read_mostly flexpriority_enabled = 1;
49 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
50
51 static int __read_mostly enable_ept = 1;
52 module_param_named(ept, enable_ept, bool, S_IRUGO);
53
54 static int __read_mostly emulate_invalid_guest_state = 0;
55 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
56
57 struct vmcs {
58         u32 revision_id;
59         u32 abort;
60         char data[0];
61 };
62
63 struct vcpu_vmx {
64         struct kvm_vcpu       vcpu;
65         struct list_head      local_vcpus_link;
66         unsigned long         host_rsp;
67         int                   launched;
68         u8                    fail;
69         u32                   idt_vectoring_info;
70         struct kvm_msr_entry *guest_msrs;
71         struct kvm_msr_entry *host_msrs;
72         int                   nmsrs;
73         int                   save_nmsrs;
74         int                   msr_offset_efer;
75 #ifdef CONFIG_X86_64
76         int                   msr_offset_kernel_gs_base;
77 #endif
78         struct vmcs          *vmcs;
79         struct {
80                 int           loaded;
81                 u16           fs_sel, gs_sel, ldt_sel;
82                 int           gs_ldt_reload_needed;
83                 int           fs_reload_needed;
84                 int           guest_efer_loaded;
85         } host_state;
86         struct {
87                 struct {
88                         bool pending;
89                         u8 vector;
90                         unsigned rip;
91                 } irq;
92         } rmode;
93         int vpid;
94         bool emulation_required;
95         enum emulation_result invalid_state_emulation_result;
96
97         /* Support for vnmi-less CPUs */
98         int soft_vnmi_blocked;
99         ktime_t entry_time;
100         s64 vnmi_blocked_time;
101         u32 exit_reason;
102 };
103
104 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
105 {
106         return container_of(vcpu, struct vcpu_vmx, vcpu);
107 }
108
109 static int init_rmode(struct kvm *kvm);
110 static u64 construct_eptp(unsigned long root_hpa);
111
112 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
113 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
114 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
115
116 static unsigned long *vmx_io_bitmap_a;
117 static unsigned long *vmx_io_bitmap_b;
118 static unsigned long *vmx_msr_bitmap_legacy;
119 static unsigned long *vmx_msr_bitmap_longmode;
120
121 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
122 static DEFINE_SPINLOCK(vmx_vpid_lock);
123
124 static struct vmcs_config {
125         int size;
126         int order;
127         u32 revision_id;
128         u32 pin_based_exec_ctrl;
129         u32 cpu_based_exec_ctrl;
130         u32 cpu_based_2nd_exec_ctrl;
131         u32 vmexit_ctrl;
132         u32 vmentry_ctrl;
133 } vmcs_config;
134
135 static struct vmx_capability {
136         u32 ept;
137         u32 vpid;
138 } vmx_capability;
139
140 #define VMX_SEGMENT_FIELD(seg)                                  \
141         [VCPU_SREG_##seg] = {                                   \
142                 .selector = GUEST_##seg##_SELECTOR,             \
143                 .base = GUEST_##seg##_BASE,                     \
144                 .limit = GUEST_##seg##_LIMIT,                   \
145                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
146         }
147
148 static struct kvm_vmx_segment_field {
149         unsigned selector;
150         unsigned base;
151         unsigned limit;
152         unsigned ar_bytes;
153 } kvm_vmx_segment_fields[] = {
154         VMX_SEGMENT_FIELD(CS),
155         VMX_SEGMENT_FIELD(DS),
156         VMX_SEGMENT_FIELD(ES),
157         VMX_SEGMENT_FIELD(FS),
158         VMX_SEGMENT_FIELD(GS),
159         VMX_SEGMENT_FIELD(SS),
160         VMX_SEGMENT_FIELD(TR),
161         VMX_SEGMENT_FIELD(LDTR),
162 };
163
164 static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
165
166 /*
167  * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it
168  * away by decrementing the array size.
169  */
170 static const u32 vmx_msr_index[] = {
171 #ifdef CONFIG_X86_64
172         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
173 #endif
174         MSR_EFER, MSR_K6_STAR,
175 };
176 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
177
178 static void load_msrs(struct kvm_msr_entry *e, int n)
179 {
180         int i;
181
182         for (i = 0; i < n; ++i)
183                 wrmsrl(e[i].index, e[i].data);
184 }
185
186 static void save_msrs(struct kvm_msr_entry *e, int n)
187 {
188         int i;
189
190         for (i = 0; i < n; ++i)
191                 rdmsrl(e[i].index, e[i].data);
192 }
193
194 static inline int is_page_fault(u32 intr_info)
195 {
196         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
197                              INTR_INFO_VALID_MASK)) ==
198                 (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
199 }
200
201 static inline int is_no_device(u32 intr_info)
202 {
203         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
204                              INTR_INFO_VALID_MASK)) ==
205                 (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
206 }
207
208 static inline int is_invalid_opcode(u32 intr_info)
209 {
210         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
211                              INTR_INFO_VALID_MASK)) ==
212                 (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
213 }
214
215 static inline int is_external_interrupt(u32 intr_info)
216 {
217         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
218                 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
219 }
220
221 static inline int is_machine_check(u32 intr_info)
222 {
223         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
224                              INTR_INFO_VALID_MASK)) ==
225                 (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
226 }
227
228 static inline int cpu_has_vmx_msr_bitmap(void)
229 {
230         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
231 }
232
233 static inline int cpu_has_vmx_tpr_shadow(void)
234 {
235         return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
236 }
237
238 static inline int vm_need_tpr_shadow(struct kvm *kvm)
239 {
240         return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
241 }
242
243 static inline int cpu_has_secondary_exec_ctrls(void)
244 {
245         return vmcs_config.cpu_based_exec_ctrl &
246                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
247 }
248
249 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
250 {
251         return vmcs_config.cpu_based_2nd_exec_ctrl &
252                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
253 }
254
255 static inline bool cpu_has_vmx_flexpriority(void)
256 {
257         return cpu_has_vmx_tpr_shadow() &&
258                 cpu_has_vmx_virtualize_apic_accesses();
259 }
260
261 static inline int cpu_has_vmx_invept_individual_addr(void)
262 {
263         return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
264 }
265
266 static inline int cpu_has_vmx_invept_context(void)
267 {
268         return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
269 }
270
271 static inline int cpu_has_vmx_invept_global(void)
272 {
273         return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
274 }
275
276 static inline int cpu_has_vmx_ept(void)
277 {
278         return vmcs_config.cpu_based_2nd_exec_ctrl &
279                 SECONDARY_EXEC_ENABLE_EPT;
280 }
281
282 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
283 {
284         return flexpriority_enabled &&
285                 (cpu_has_vmx_virtualize_apic_accesses()) &&
286                 (irqchip_in_kernel(kvm));
287 }
288
289 static inline int cpu_has_vmx_vpid(void)
290 {
291         return vmcs_config.cpu_based_2nd_exec_ctrl &
292                 SECONDARY_EXEC_ENABLE_VPID;
293 }
294
295 static inline int cpu_has_virtual_nmis(void)
296 {
297         return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
298 }
299
300 static inline bool report_flexpriority(void)
301 {
302         return flexpriority_enabled;
303 }
304
305 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
306 {
307         int i;
308
309         for (i = 0; i < vmx->nmsrs; ++i)
310                 if (vmx->guest_msrs[i].index == msr)
311                         return i;
312         return -1;
313 }
314
315 static inline void __invvpid(int ext, u16 vpid, gva_t gva)
316 {
317         struct {
318                 u64 vpid : 16;
319                 u64 rsvd : 48;
320                 u64 gva;
321         } operand = { vpid, 0, gva };
322
323         asm volatile (__ex(ASM_VMX_INVVPID)
324                       /* CF==1 or ZF==1 --> rc = -1 */
325                       "; ja 1f ; ud2 ; 1:"
326                       : : "a"(&operand), "c"(ext) : "cc", "memory");
327 }
328
329 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
330 {
331         struct {
332                 u64 eptp, gpa;
333         } operand = {eptp, gpa};
334
335         asm volatile (__ex(ASM_VMX_INVEPT)
336                         /* CF==1 or ZF==1 --> rc = -1 */
337                         "; ja 1f ; ud2 ; 1:\n"
338                         : : "a" (&operand), "c" (ext) : "cc", "memory");
339 }
340
341 static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
342 {
343         int i;
344
345         i = __find_msr_index(vmx, msr);
346         if (i >= 0)
347                 return &vmx->guest_msrs[i];
348         return NULL;
349 }
350
351 static void vmcs_clear(struct vmcs *vmcs)
352 {
353         u64 phys_addr = __pa(vmcs);
354         u8 error;
355
356         asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
357                       : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
358                       : "cc", "memory");
359         if (error)
360                 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
361                        vmcs, phys_addr);
362 }
363
364 static void __vcpu_clear(void *arg)
365 {
366         struct vcpu_vmx *vmx = arg;
367         int cpu = raw_smp_processor_id();
368
369         if (vmx->vcpu.cpu == cpu)
370                 vmcs_clear(vmx->vmcs);
371         if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
372                 per_cpu(current_vmcs, cpu) = NULL;
373         rdtscll(vmx->vcpu.arch.host_tsc);
374         list_del(&vmx->local_vcpus_link);
375         vmx->vcpu.cpu = -1;
376         vmx->launched = 0;
377 }
378
379 static void vcpu_clear(struct vcpu_vmx *vmx)
380 {
381         if (vmx->vcpu.cpu == -1)
382                 return;
383         smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
384 }
385
386 static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
387 {
388         if (vmx->vpid == 0)
389                 return;
390
391         __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
392 }
393
394 static inline void ept_sync_global(void)
395 {
396         if (cpu_has_vmx_invept_global())
397                 __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
398 }
399
400 static inline void ept_sync_context(u64 eptp)
401 {
402         if (enable_ept) {
403                 if (cpu_has_vmx_invept_context())
404                         __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
405                 else
406                         ept_sync_global();
407         }
408 }
409
410 static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
411 {
412         if (enable_ept) {
413                 if (cpu_has_vmx_invept_individual_addr())
414                         __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
415                                         eptp, gpa);
416                 else
417                         ept_sync_context(eptp);
418         }
419 }
420
421 static unsigned long vmcs_readl(unsigned long field)
422 {
423         unsigned long value;
424
425         asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
426                       : "=a"(value) : "d"(field) : "cc");
427         return value;
428 }
429
430 static u16 vmcs_read16(unsigned long field)
431 {
432         return vmcs_readl(field);
433 }
434
435 static u32 vmcs_read32(unsigned long field)
436 {
437         return vmcs_readl(field);
438 }
439
440 static u64 vmcs_read64(unsigned long field)
441 {
442 #ifdef CONFIG_X86_64
443         return vmcs_readl(field);
444 #else
445         return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
446 #endif
447 }
448
449 static noinline void vmwrite_error(unsigned long field, unsigned long value)
450 {
451         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
452                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
453         dump_stack();
454 }
455
456 static void vmcs_writel(unsigned long field, unsigned long value)
457 {
458         u8 error;
459
460         asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
461                        : "=q"(error) : "a"(value), "d"(field) : "cc");
462         if (unlikely(error))
463                 vmwrite_error(field, value);
464 }
465
466 static void vmcs_write16(unsigned long field, u16 value)
467 {
468         vmcs_writel(field, value);
469 }
470
471 static void vmcs_write32(unsigned long field, u32 value)
472 {
473         vmcs_writel(field, value);
474 }
475
476 static void vmcs_write64(unsigned long field, u64 value)
477 {
478         vmcs_writel(field, value);
479 #ifndef CONFIG_X86_64
480         asm volatile ("");
481         vmcs_writel(field+1, value >> 32);
482 #endif
483 }
484
485 static void vmcs_clear_bits(unsigned long field, u32 mask)
486 {
487         vmcs_writel(field, vmcs_readl(field) & ~mask);
488 }
489
490 static void vmcs_set_bits(unsigned long field, u32 mask)
491 {
492         vmcs_writel(field, vmcs_readl(field) | mask);
493 }
494
495 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
496 {
497         u32 eb;
498
499         eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
500         if (!vcpu->fpu_active)
501                 eb |= 1u << NM_VECTOR;
502         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
503                 if (vcpu->guest_debug &
504                     (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
505                         eb |= 1u << DB_VECTOR;
506                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
507                         eb |= 1u << BP_VECTOR;
508         }
509         if (vcpu->arch.rmode.vm86_active)
510                 eb = ~0;
511         if (enable_ept)
512                 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
513         vmcs_write32(EXCEPTION_BITMAP, eb);
514 }
515
516 static void reload_tss(void)
517 {
518         /*
519          * VT restores TR but not its size.  Useless.
520          */
521         struct descriptor_table gdt;
522         struct desc_struct *descs;
523
524         kvm_get_gdt(&gdt);
525         descs = (void *)gdt.base;
526         descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
527         load_TR_desc();
528 }
529
530 static void load_transition_efer(struct vcpu_vmx *vmx)
531 {
532         int efer_offset = vmx->msr_offset_efer;
533         u64 host_efer = vmx->host_msrs[efer_offset].data;
534         u64 guest_efer = vmx->guest_msrs[efer_offset].data;
535         u64 ignore_bits;
536
537         if (efer_offset < 0)
538                 return;
539         /*
540          * NX is emulated; LMA and LME handled by hardware; SCE meaningless
541          * outside long mode
542          */
543         ignore_bits = EFER_NX | EFER_SCE;
544 #ifdef CONFIG_X86_64
545         ignore_bits |= EFER_LMA | EFER_LME;
546         /* SCE is meaningful only in long mode on Intel */
547         if (guest_efer & EFER_LMA)
548                 ignore_bits &= ~(u64)EFER_SCE;
549 #endif
550         if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
551                 return;
552
553         vmx->host_state.guest_efer_loaded = 1;
554         guest_efer &= ~ignore_bits;
555         guest_efer |= host_efer & ignore_bits;
556         wrmsrl(MSR_EFER, guest_efer);
557         vmx->vcpu.stat.efer_reload++;
558 }
559
560 static void reload_host_efer(struct vcpu_vmx *vmx)
561 {
562         if (vmx->host_state.guest_efer_loaded) {
563                 vmx->host_state.guest_efer_loaded = 0;
564                 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
565         }
566 }
567
568 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
569 {
570         struct vcpu_vmx *vmx = to_vmx(vcpu);
571
572         if (vmx->host_state.loaded)
573                 return;
574
575         vmx->host_state.loaded = 1;
576         /*
577          * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
578          * allow segment selectors with cpl > 0 or ti == 1.
579          */
580         vmx->host_state.ldt_sel = kvm_read_ldt();
581         vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
582         vmx->host_state.fs_sel = kvm_read_fs();
583         if (!(vmx->host_state.fs_sel & 7)) {
584                 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
585                 vmx->host_state.fs_reload_needed = 0;
586         } else {
587                 vmcs_write16(HOST_FS_SELECTOR, 0);
588                 vmx->host_state.fs_reload_needed = 1;
589         }
590         vmx->host_state.gs_sel = kvm_read_gs();
591         if (!(vmx->host_state.gs_sel & 7))
592                 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
593         else {
594                 vmcs_write16(HOST_GS_SELECTOR, 0);
595                 vmx->host_state.gs_ldt_reload_needed = 1;
596         }
597
598 #ifdef CONFIG_X86_64
599         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
600         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
601 #else
602         vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
603         vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
604 #endif
605
606 #ifdef CONFIG_X86_64
607         if (is_long_mode(&vmx->vcpu))
608                 save_msrs(vmx->host_msrs +
609                           vmx->msr_offset_kernel_gs_base, 1);
610
611 #endif
612         load_msrs(vmx->guest_msrs, vmx->save_nmsrs);
613         load_transition_efer(vmx);
614 }
615
616 static void __vmx_load_host_state(struct vcpu_vmx *vmx)
617 {
618         unsigned long flags;
619
620         if (!vmx->host_state.loaded)
621                 return;
622
623         ++vmx->vcpu.stat.host_state_reload;
624         vmx->host_state.loaded = 0;
625         if (vmx->host_state.fs_reload_needed)
626                 kvm_load_fs(vmx->host_state.fs_sel);
627         if (vmx->host_state.gs_ldt_reload_needed) {
628                 kvm_load_ldt(vmx->host_state.ldt_sel);
629                 /*
630                  * If we have to reload gs, we must take care to
631                  * preserve our gs base.
632                  */
633                 local_irq_save(flags);
634                 kvm_load_gs(vmx->host_state.gs_sel);
635 #ifdef CONFIG_X86_64
636                 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
637 #endif
638                 local_irq_restore(flags);
639         }
640         reload_tss();
641         save_msrs(vmx->guest_msrs, vmx->save_nmsrs);
642         load_msrs(vmx->host_msrs, vmx->save_nmsrs);
643         reload_host_efer(vmx);
644 }
645
646 static void vmx_load_host_state(struct vcpu_vmx *vmx)
647 {
648         preempt_disable();
649         __vmx_load_host_state(vmx);
650         preempt_enable();
651 }
652
653 /*
654  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
655  * vcpu mutex is already taken.
656  */
657 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
658 {
659         struct vcpu_vmx *vmx = to_vmx(vcpu);
660         u64 phys_addr = __pa(vmx->vmcs);
661         u64 tsc_this, delta, new_offset;
662
663         if (vcpu->cpu != cpu) {
664                 vcpu_clear(vmx);
665                 kvm_migrate_timers(vcpu);
666                 vpid_sync_vcpu_all(vmx);
667                 local_irq_disable();
668                 list_add(&vmx->local_vcpus_link,
669                          &per_cpu(vcpus_on_cpu, cpu));
670                 local_irq_enable();
671         }
672
673         if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
674                 u8 error;
675
676                 per_cpu(current_vmcs, cpu) = vmx->vmcs;
677                 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
678                               : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
679                               : "cc");
680                 if (error)
681                         printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
682                                vmx->vmcs, phys_addr);
683         }
684
685         if (vcpu->cpu != cpu) {
686                 struct descriptor_table dt;
687                 unsigned long sysenter_esp;
688
689                 vcpu->cpu = cpu;
690                 /*
691                  * Linux uses per-cpu TSS and GDT, so set these when switching
692                  * processors.
693                  */
694                 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
695                 kvm_get_gdt(&dt);
696                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
697
698                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
699                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
700
701                 /*
702                  * Make sure the time stamp counter is monotonic.
703                  */
704                 rdtscll(tsc_this);
705                 if (tsc_this < vcpu->arch.host_tsc) {
706                         delta = vcpu->arch.host_tsc - tsc_this;
707                         new_offset = vmcs_read64(TSC_OFFSET) + delta;
708                         vmcs_write64(TSC_OFFSET, new_offset);
709                 }
710         }
711 }
712
713 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
714 {
715         __vmx_load_host_state(to_vmx(vcpu));
716 }
717
718 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
719 {
720         if (vcpu->fpu_active)
721                 return;
722         vcpu->fpu_active = 1;
723         vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
724         if (vcpu->arch.cr0 & X86_CR0_TS)
725                 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
726         update_exception_bitmap(vcpu);
727 }
728
729 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
730 {
731         if (!vcpu->fpu_active)
732                 return;
733         vcpu->fpu_active = 0;
734         vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
735         update_exception_bitmap(vcpu);
736 }
737
738 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
739 {
740         return vmcs_readl(GUEST_RFLAGS);
741 }
742
743 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
744 {
745         if (vcpu->arch.rmode.vm86_active)
746                 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
747         vmcs_writel(GUEST_RFLAGS, rflags);
748 }
749
750 static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
751 {
752         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
753         int ret = 0;
754
755         if (interruptibility & GUEST_INTR_STATE_STI)
756                 ret |= X86_SHADOW_INT_STI;
757         if (interruptibility & GUEST_INTR_STATE_MOV_SS)
758                 ret |= X86_SHADOW_INT_MOV_SS;
759
760         return ret & mask;
761 }
762
763 static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
764 {
765         u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
766         u32 interruptibility = interruptibility_old;
767
768         interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
769
770         if (mask & X86_SHADOW_INT_MOV_SS)
771                 interruptibility |= GUEST_INTR_STATE_MOV_SS;
772         if (mask & X86_SHADOW_INT_STI)
773                 interruptibility |= GUEST_INTR_STATE_STI;
774
775         if (interruptibility != interruptibility_old)
776                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
777 }
778
779 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
780 {
781         unsigned long rip;
782
783         rip = kvm_rip_read(vcpu);
784         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
785         kvm_rip_write(vcpu, rip);
786
787         /* skipping an emulated instruction also counts */
788         vmx_set_interrupt_shadow(vcpu, 0);
789 }
790
791 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
792                                 bool has_error_code, u32 error_code)
793 {
794         struct vcpu_vmx *vmx = to_vmx(vcpu);
795         u32 intr_info = nr | INTR_INFO_VALID_MASK;
796
797         if (has_error_code) {
798                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
799                 intr_info |= INTR_INFO_DELIVER_CODE_MASK;
800         }
801
802         if (vcpu->arch.rmode.vm86_active) {
803                 vmx->rmode.irq.pending = true;
804                 vmx->rmode.irq.vector = nr;
805                 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
806                 if (kvm_exception_is_soft(nr))
807                         vmx->rmode.irq.rip +=
808                                 vmx->vcpu.arch.event_exit_inst_len;
809                 intr_info |= INTR_TYPE_SOFT_INTR;
810                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
811                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
812                 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
813                 return;
814         }
815
816         if (kvm_exception_is_soft(nr)) {
817                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
818                              vmx->vcpu.arch.event_exit_inst_len);
819                 intr_info |= INTR_TYPE_SOFT_EXCEPTION;
820         } else
821                 intr_info |= INTR_TYPE_HARD_EXCEPTION;
822
823         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
824 }
825
826 /*
827  * Swap MSR entry in host/guest MSR entry array.
828  */
829 #ifdef CONFIG_X86_64
830 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
831 {
832         struct kvm_msr_entry tmp;
833
834         tmp = vmx->guest_msrs[to];
835         vmx->guest_msrs[to] = vmx->guest_msrs[from];
836         vmx->guest_msrs[from] = tmp;
837         tmp = vmx->host_msrs[to];
838         vmx->host_msrs[to] = vmx->host_msrs[from];
839         vmx->host_msrs[from] = tmp;
840 }
841 #endif
842
843 /*
844  * Set up the vmcs to automatically save and restore system
845  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
846  * mode, as fiddling with msrs is very expensive.
847  */
848 static void setup_msrs(struct vcpu_vmx *vmx)
849 {
850         int save_nmsrs;
851         unsigned long *msr_bitmap;
852
853         vmx_load_host_state(vmx);
854         save_nmsrs = 0;
855 #ifdef CONFIG_X86_64
856         if (is_long_mode(&vmx->vcpu)) {
857                 int index;
858
859                 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
860                 if (index >= 0)
861                         move_msr_up(vmx, index, save_nmsrs++);
862                 index = __find_msr_index(vmx, MSR_LSTAR);
863                 if (index >= 0)
864                         move_msr_up(vmx, index, save_nmsrs++);
865                 index = __find_msr_index(vmx, MSR_CSTAR);
866                 if (index >= 0)
867                         move_msr_up(vmx, index, save_nmsrs++);
868                 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
869                 if (index >= 0)
870                         move_msr_up(vmx, index, save_nmsrs++);
871                 /*
872                  * MSR_K6_STAR is only needed on long mode guests, and only
873                  * if efer.sce is enabled.
874                  */
875                 index = __find_msr_index(vmx, MSR_K6_STAR);
876                 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE))
877                         move_msr_up(vmx, index, save_nmsrs++);
878         }
879 #endif
880         vmx->save_nmsrs = save_nmsrs;
881
882 #ifdef CONFIG_X86_64
883         vmx->msr_offset_kernel_gs_base =
884                 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
885 #endif
886         vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
887
888         if (cpu_has_vmx_msr_bitmap()) {
889                 if (is_long_mode(&vmx->vcpu))
890                         msr_bitmap = vmx_msr_bitmap_longmode;
891                 else
892                         msr_bitmap = vmx_msr_bitmap_legacy;
893
894                 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
895         }
896 }
897
898 /*
899  * reads and returns guest's timestamp counter "register"
900  * guest_tsc = host_tsc + tsc_offset    -- 21.3
901  */
902 static u64 guest_read_tsc(void)
903 {
904         u64 host_tsc, tsc_offset;
905
906         rdtscll(host_tsc);
907         tsc_offset = vmcs_read64(TSC_OFFSET);
908         return host_tsc + tsc_offset;
909 }
910
911 /*
912  * writes 'guest_tsc' into guest's timestamp counter "register"
913  * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
914  */
915 static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
916 {
917         vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
918 }
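
/*
 * Example (made-up numbers): if the host TSC currently reads 1,000,000 and
 * the guest is supposed to see 250,000, guest_write_tsc() stores
 * TSC_OFFSET = 250,000 - 1,000,000 = -750,000.  A guest RDTSC executed when
 * the host TSC reads 1,200,000 is then reported as
 * 1,200,000 + (-750,000) = 450,000, i.e. the guest TSC keeps advancing at
 * the host rate from the value the guest last wrote.
 */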
919
920 /*
921  * Reads an msr value (of 'msr_index') into 'pdata'.
922  * Returns 0 on success, non-0 otherwise.
923  * Assumes vcpu_load() was already called.
924  */
925 static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
926 {
927         u64 data;
928         struct kvm_msr_entry *msr;
929
930         if (!pdata) {
931                 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
932                 return -EINVAL;
933         }
934
935         switch (msr_index) {
936 #ifdef CONFIG_X86_64
937         case MSR_FS_BASE:
938                 data = vmcs_readl(GUEST_FS_BASE);
939                 break;
940         case MSR_GS_BASE:
941                 data = vmcs_readl(GUEST_GS_BASE);
942                 break;
943         case MSR_EFER:
944                 return kvm_get_msr_common(vcpu, msr_index, pdata);
945 #endif
946         case MSR_IA32_TSC:
947                 data = guest_read_tsc();
948                 break;
949         case MSR_IA32_SYSENTER_CS:
950                 data = vmcs_read32(GUEST_SYSENTER_CS);
951                 break;
952         case MSR_IA32_SYSENTER_EIP:
953                 data = vmcs_readl(GUEST_SYSENTER_EIP);
954                 break;
955         case MSR_IA32_SYSENTER_ESP:
956                 data = vmcs_readl(GUEST_SYSENTER_ESP);
957                 break;
958         default:
959                 vmx_load_host_state(to_vmx(vcpu));
960                 msr = find_msr_entry(to_vmx(vcpu), msr_index);
961                 if (msr) {
962                         data = msr->data;
963                         break;
964                 }
965                 return kvm_get_msr_common(vcpu, msr_index, pdata);
966         }
967
968         *pdata = data;
969         return 0;
970 }
971
972 /*
973  * Writes msr value into the appropriate "register".
974  * Returns 0 on success, non-0 otherwise.
975  * Assumes vcpu_load() was already called.
976  */
977 static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
978 {
979         struct vcpu_vmx *vmx = to_vmx(vcpu);
980         struct kvm_msr_entry *msr;
981         u64 host_tsc;
982         int ret = 0;
983
984         switch (msr_index) {
985         case MSR_EFER:
986                 vmx_load_host_state(vmx);
987                 ret = kvm_set_msr_common(vcpu, msr_index, data);
988                 break;
989 #ifdef CONFIG_X86_64
990         case MSR_FS_BASE:
991                 vmcs_writel(GUEST_FS_BASE, data);
992                 break;
993         case MSR_GS_BASE:
994                 vmcs_writel(GUEST_GS_BASE, data);
995                 break;
996 #endif
997         case MSR_IA32_SYSENTER_CS:
998                 vmcs_write32(GUEST_SYSENTER_CS, data);
999                 break;
1000         case MSR_IA32_SYSENTER_EIP:
1001                 vmcs_writel(GUEST_SYSENTER_EIP, data);
1002                 break;
1003         case MSR_IA32_SYSENTER_ESP:
1004                 vmcs_writel(GUEST_SYSENTER_ESP, data);
1005                 break;
1006         case MSR_IA32_TSC:
1007                 rdtscll(host_tsc);
1008                 guest_write_tsc(data, host_tsc);
1009                 break;
1010         case MSR_P6_PERFCTR0:
1011         case MSR_P6_PERFCTR1:
1012         case MSR_P6_EVNTSEL0:
1013         case MSR_P6_EVNTSEL1:
1014                 /*
1015                  * Just discard all writes to the performance counters; this
1016                  * should keep both older linux and windows 64-bit guests
1017                  * happy
1018                  */
1019                 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
1020
1021                 break;
1022         case MSR_IA32_CR_PAT:
1023                 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
1024                         vmcs_write64(GUEST_IA32_PAT, data);
1025                         vcpu->arch.pat = data;
1026                         break;
1027                 }
1028                 /* Otherwise falls through to kvm_set_msr_common */
1029         default:
1030                 vmx_load_host_state(vmx);
1031                 msr = find_msr_entry(vmx, msr_index);
1032                 if (msr) {
1033                         msr->data = data;
1034                         break;
1035                 }
1036                 ret = kvm_set_msr_common(vcpu, msr_index, data);
1037         }
1038
1039         return ret;
1040 }
1041
1042 static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1043 {
1044         __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
1045         switch (reg) {
1046         case VCPU_REGS_RSP:
1047                 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
1048                 break;
1049         case VCPU_REGS_RIP:
1050                 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
1051                 break;
1052         case VCPU_EXREG_PDPTR:
1053                 if (enable_ept)
1054                         ept_save_pdptrs(vcpu);
1055                 break;
1056         default:
1057                 break;
1058         }
1059 }
1060
1061 static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1062 {
1063         int old_debug = vcpu->guest_debug;
1064         unsigned long flags;
1065
1066         vcpu->guest_debug = dbg->control;
1067         if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
1068                 vcpu->guest_debug = 0;
1069
1070         if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1071                 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
1072         else
1073                 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
1074
1075         flags = vmcs_readl(GUEST_RFLAGS);
1076         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
1077                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1078         else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1079                 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1080         vmcs_writel(GUEST_RFLAGS, flags);
1081
1082         update_exception_bitmap(vcpu);
1083
1084         return 0;
1085 }
1086
1087 static __init int cpu_has_kvm_support(void)
1088 {
1089         return cpu_has_vmx();
1090 }
1091
1092 static __init int vmx_disabled_by_bios(void)
1093 {
1094         u64 msr;
1095
1096         rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1097         return (msr & (FEATURE_CONTROL_LOCKED |
1098                        FEATURE_CONTROL_VMXON_ENABLED))
1099             == FEATURE_CONTROL_LOCKED;
1100         /* locked but not enabled */
1101 }
1102
1103 static void hardware_enable(void *garbage)
1104 {
1105         int cpu = raw_smp_processor_id();
1106         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1107         u64 old;
1108
1109         INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1110         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1111         if ((old & (FEATURE_CONTROL_LOCKED |
1112                     FEATURE_CONTROL_VMXON_ENABLED))
1113             != (FEATURE_CONTROL_LOCKED |
1114                 FEATURE_CONTROL_VMXON_ENABLED))
1115                 /* enable and lock */
1116                 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
1117                        FEATURE_CONTROL_LOCKED |
1118                        FEATURE_CONTROL_VMXON_ENABLED);
1119         write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1120         asm volatile (ASM_VMX_VMXON_RAX
1121                       : : "a"(&phys_addr), "m"(phys_addr)
1122                       : "memory", "cc");
1123 }
1124
1125 static void vmclear_local_vcpus(void)
1126 {
1127         int cpu = raw_smp_processor_id();
1128         struct vcpu_vmx *vmx, *n;
1129
1130         list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
1131                                  local_vcpus_link)
1132                 __vcpu_clear(vmx);
1133 }
1134
1135
1136 /* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
1137  * tricks.
1138  */
1139 static void kvm_cpu_vmxoff(void)
1140 {
1141         asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1142         write_cr4(read_cr4() & ~X86_CR4_VMXE);
1143 }
1144
1145 static void hardware_disable(void *garbage)
1146 {
1147         vmclear_local_vcpus();
1148         kvm_cpu_vmxoff();
1149 }
1150
1151 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1152                                       u32 msr, u32 *result)
1153 {
1154         u32 vmx_msr_low, vmx_msr_high;
1155         u32 ctl = ctl_min | ctl_opt;
1156
1157         rdmsr(msr, vmx_msr_low, vmx_msr_high);
1158
1159         ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
1160         ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
1161
1162         /* Ensure minimum (required) set of control bits are supported. */
1163         if (ctl_min & ~ctl)
1164                 return -EIO;
1165
1166         *result = ctl;
1167         return 0;
1168 }
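
/*
 * Worked example with hypothetical MSR values: suppose ctl_min | ctl_opt
 * requests 0x136, and rdmsr returns vmx_msr_low = 0x016 (bits the CPU forces
 * to 1) and vmx_msr_high = 0xfffffeff (bits the CPU allows to be 1; bit 8
 * unsupported).  "ctl &= vmx_msr_high" drops the unsupported bit 8, giving
 * 0x036, and "ctl |= vmx_msr_low" keeps the always-required bits set.  Had
 * bit 8 been part of ctl_min rather than ctl_opt, "ctl_min & ~ctl" would be
 * non-zero and the function would return -EIO.
 */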
1169
1170 static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1171 {
1172         u32 vmx_msr_low, vmx_msr_high;
1173         u32 min, opt, min2, opt2;
1174         u32 _pin_based_exec_control = 0;
1175         u32 _cpu_based_exec_control = 0;
1176         u32 _cpu_based_2nd_exec_control = 0;
1177         u32 _vmexit_control = 0;
1178         u32 _vmentry_control = 0;
1179
1180         min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
1181         opt = PIN_BASED_VIRTUAL_NMIS;
1182         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
1183                                 &_pin_based_exec_control) < 0)
1184                 return -EIO;
1185
1186         min = CPU_BASED_HLT_EXITING |
1187 #ifdef CONFIG_X86_64
1188               CPU_BASED_CR8_LOAD_EXITING |
1189               CPU_BASED_CR8_STORE_EXITING |
1190 #endif
1191               CPU_BASED_CR3_LOAD_EXITING |
1192               CPU_BASED_CR3_STORE_EXITING |
1193               CPU_BASED_USE_IO_BITMAPS |
1194               CPU_BASED_MOV_DR_EXITING |
1195               CPU_BASED_USE_TSC_OFFSETING |
1196               CPU_BASED_INVLPG_EXITING;
1197         opt = CPU_BASED_TPR_SHADOW |
1198               CPU_BASED_USE_MSR_BITMAPS |
1199               CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
1200         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1201                                 &_cpu_based_exec_control) < 0)
1202                 return -EIO;
1203 #ifdef CONFIG_X86_64
1204         if (_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)
1205                 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
1206                                            ~CPU_BASED_CR8_STORE_EXITING;
1207 #endif
1208         if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
1209                 min2 = 0;
1210                 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
1211                         SECONDARY_EXEC_WBINVD_EXITING |
1212                         SECONDARY_EXEC_ENABLE_VPID |
1213                         SECONDARY_EXEC_ENABLE_EPT;
1214                 if (adjust_vmx_controls(min2, opt2,
1215                                         MSR_IA32_VMX_PROCBASED_CTLS2,
1216                                         &_cpu_based_2nd_exec_control) < 0)
1217                         return -EIO;
1218         }
1219 #ifndef CONFIG_X86_64
1220         if (!(_cpu_based_2nd_exec_control &
1221                                 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
1222                 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
1223 #endif
1224         if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
1225                 /* CR3 accesses and invlpg don't need to cause VM Exits
1226                    when EPT is enabled */
1227                 min &= ~(CPU_BASED_CR3_LOAD_EXITING |
1228                          CPU_BASED_CR3_STORE_EXITING |
1229                          CPU_BASED_INVLPG_EXITING);
1230                 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1231                                         &_cpu_based_exec_control) < 0)
1232                         return -EIO;
1233                 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
1234                       vmx_capability.ept, vmx_capability.vpid);
1235         }
1236
1237         min = 0;
1238 #ifdef CONFIG_X86_64
1239         min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
1240 #endif
1241         opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
1242         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
1243                                 &_vmexit_control) < 0)
1244                 return -EIO;
1245
1246         min = 0;
1247         opt = VM_ENTRY_LOAD_IA32_PAT;
1248         if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
1249                                 &_vmentry_control) < 0)
1250                 return -EIO;
1251
1252         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
1253
1254         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
1255         if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
1256                 return -EIO;
1257
1258 #ifdef CONFIG_X86_64
1259         /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
1260         if (vmx_msr_high & (1u<<16))
1261                 return -EIO;
1262 #endif
1263
1264         /* Require Write-Back (WB) memory type for VMCS accesses. */
1265         if (((vmx_msr_high >> 18) & 15) != 6)
1266                 return -EIO;
1267
1268         vmcs_conf->size = vmx_msr_high & 0x1fff;
1269         vmcs_conf->order = get_order(vmcs_config.size);
1270         vmcs_conf->revision_id = vmx_msr_low;
1271
1272         vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
1273         vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
1274         vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
1275         vmcs_conf->vmexit_ctrl         = _vmexit_control;
1276         vmcs_conf->vmentry_ctrl        = _vmentry_control;
1277
1278         return 0;
1279 }
1280
1281 static struct vmcs *alloc_vmcs_cpu(int cpu)
1282 {
1283         int node = cpu_to_node(cpu);
1284         struct page *pages;
1285         struct vmcs *vmcs;
1286
1287         pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
1288         if (!pages)
1289                 return NULL;
1290         vmcs = page_address(pages);
1291         memset(vmcs, 0, vmcs_config.size);
1292         vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
1293         return vmcs;
1294 }
1295
1296 static struct vmcs *alloc_vmcs(void)
1297 {
1298         return alloc_vmcs_cpu(raw_smp_processor_id());
1299 }
1300
1301 static void free_vmcs(struct vmcs *vmcs)
1302 {
1303         free_pages((unsigned long)vmcs, vmcs_config.order);
1304 }
1305
1306 static void free_kvm_area(void)
1307 {
1308         int cpu;
1309
1310         for_each_online_cpu(cpu)
1311                 free_vmcs(per_cpu(vmxarea, cpu));
1312 }
1313
1314 static __init int alloc_kvm_area(void)
1315 {
1316         int cpu;
1317
1318         for_each_online_cpu(cpu) {
1319                 struct vmcs *vmcs;
1320
1321                 vmcs = alloc_vmcs_cpu(cpu);
1322                 if (!vmcs) {
1323                         free_kvm_area();
1324                         return -ENOMEM;
1325                 }
1326
1327                 per_cpu(vmxarea, cpu) = vmcs;
1328         }
1329         return 0;
1330 }
1331
1332 static __init int hardware_setup(void)
1333 {
1334         if (setup_vmcs_config(&vmcs_config) < 0)
1335                 return -EIO;
1336
1337         if (boot_cpu_has(X86_FEATURE_NX))
1338                 kvm_enable_efer_bits(EFER_NX);
1339
1340         if (!cpu_has_vmx_vpid())
1341                 enable_vpid = 0;
1342
1343         if (!cpu_has_vmx_ept())
1344                 enable_ept = 0;
1345
1346         if (!cpu_has_vmx_flexpriority())
1347                 flexpriority_enabled = 0;
1348
1349         if (!cpu_has_vmx_tpr_shadow())
1350                 kvm_x86_ops->update_cr8_intercept = NULL;
1351
1352         return alloc_kvm_area();
1353 }
1354
1355 static __exit void hardware_unsetup(void)
1356 {
1357         free_kvm_area();
1358 }
1359
1360 static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1361 {
1362         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1363
1364         if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
1365                 vmcs_write16(sf->selector, save->selector);
1366                 vmcs_writel(sf->base, save->base);
1367                 vmcs_write32(sf->limit, save->limit);
1368                 vmcs_write32(sf->ar_bytes, save->ar);
1369         } else {
1370                 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
1371                         << AR_DPL_SHIFT;
1372                 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
1373         }
1374 }
1375
1376 static void enter_pmode(struct kvm_vcpu *vcpu)
1377 {
1378         unsigned long flags;
1379         struct vcpu_vmx *vmx = to_vmx(vcpu);
1380
1381         vmx->emulation_required = 1;
1382         vcpu->arch.rmode.vm86_active = 0;
1383
1384         vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
1385         vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
1386         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar);
1387
1388         flags = vmcs_readl(GUEST_RFLAGS);
1389         flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
1390         flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT);
1391         vmcs_writel(GUEST_RFLAGS, flags);
1392
1393         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
1394                         (vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
1395
1396         update_exception_bitmap(vcpu);
1397
1398         if (emulate_invalid_guest_state)
1399                 return;
1400
1401         fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1402         fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1403         fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1404         fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1405
1406         vmcs_write16(GUEST_SS_SELECTOR, 0);
1407         vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1408
1409         vmcs_write16(GUEST_CS_SELECTOR,
1410                      vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
1411         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1412 }
1413
1414 static gva_t rmode_tss_base(struct kvm *kvm)
1415 {
1416         if (!kvm->arch.tss_addr) {
1417                 gfn_t base_gfn = kvm->memslots[0].base_gfn +
1418                                  kvm->memslots[0].npages - 3;
1419                 return base_gfn << PAGE_SHIFT;
1420         }
1421         return kvm->arch.tss_addr;
1422 }
1423
1424 static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1425 {
1426         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1427
1428         save->selector = vmcs_read16(sf->selector);
1429         save->base = vmcs_readl(sf->base);
1430         save->limit = vmcs_read32(sf->limit);
1431         save->ar = vmcs_read32(sf->ar_bytes);
1432         vmcs_write16(sf->selector, save->base >> 4);
1433         vmcs_write32(sf->base, save->base & 0xfffff);
1434         vmcs_write32(sf->limit, 0xffff);
1435         vmcs_write32(sf->ar_bytes, 0xf3);
1436 }
1437
1438 static void enter_rmode(struct kvm_vcpu *vcpu)
1439 {
1440         unsigned long flags;
1441         struct vcpu_vmx *vmx = to_vmx(vcpu);
1442
1443         vmx->emulation_required = 1;
1444         vcpu->arch.rmode.vm86_active = 1;
1445
1446         vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1447         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1448
1449         vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
1450         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
1451
1452         vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
1453         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1454
1455         flags = vmcs_readl(GUEST_RFLAGS);
1456         vcpu->arch.rmode.save_iopl
1457                 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1458
1459         flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1460
1461         vmcs_writel(GUEST_RFLAGS, flags);
1462         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1463         update_exception_bitmap(vcpu);
1464
1465         if (emulate_invalid_guest_state)
1466                 goto continue_rmode;
1467
1468         vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1469         vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1470         vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
1471
1472         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
1473         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1474         if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
1475                 vmcs_writel(GUEST_CS_BASE, 0xf0000);
1476         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
1477
1478         fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1479         fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1480         fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1481         fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1482
1483 continue_rmode:
1484         kvm_mmu_reset_context(vcpu);
1485         init_rmode(vcpu->kvm);
1486 }
1487
1488 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1489 {
1490         struct vcpu_vmx *vmx = to_vmx(vcpu);
1491         struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1492
1493         vcpu->arch.shadow_efer = efer;
1494         if (!msr)
1495                 return;
1496         if (efer & EFER_LMA) {
1497                 vmcs_write32(VM_ENTRY_CONTROLS,
1498                              vmcs_read32(VM_ENTRY_CONTROLS) |
1499                              VM_ENTRY_IA32E_MODE);
1500                 msr->data = efer;
1501         } else {
1502                 vmcs_write32(VM_ENTRY_CONTROLS,
1503                              vmcs_read32(VM_ENTRY_CONTROLS) &
1504                              ~VM_ENTRY_IA32E_MODE);
1505
1506                 msr->data = efer & ~EFER_LME;
1507         }
1508         setup_msrs(vmx);
1509 }
1510
1511 #ifdef CONFIG_X86_64
1512
1513 static void enter_lmode(struct kvm_vcpu *vcpu)
1514 {
1515         u32 guest_tr_ar;
1516
1517         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1518         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1519                 printk(KERN_DEBUG "%s: tss fixup for long mode.\n",
1520                        __func__);
1521                 vmcs_write32(GUEST_TR_AR_BYTES,
1522                              (guest_tr_ar & ~AR_TYPE_MASK)
1523                              | AR_TYPE_BUSY_64_TSS);
1524         }
1525         vcpu->arch.shadow_efer |= EFER_LMA;
1526         vmx_set_efer(vcpu, vcpu->arch.shadow_efer);
1527 }
1528
1529 static void exit_lmode(struct kvm_vcpu *vcpu)
1530 {
1531         vcpu->arch.shadow_efer &= ~EFER_LMA;
1532
1533         vmcs_write32(VM_ENTRY_CONTROLS,
1534                      vmcs_read32(VM_ENTRY_CONTROLS)
1535                      & ~VM_ENTRY_IA32E_MODE);
1536 }
1537
1538 #endif
1539
1540 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1541 {
1542         vpid_sync_vcpu_all(to_vmx(vcpu));
1543         if (enable_ept)
1544                 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1545 }
1546
1547 static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1548 {
1549         vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
1550         vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
1551 }
1552
1553 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1554 {
1555         if (!test_bit(VCPU_EXREG_PDPTR,
1556                       (unsigned long *)&vcpu->arch.regs_dirty))
1557                 return;
1558
1559         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1560                 vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
1561                 vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
1562                 vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
1563                 vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
1564         }
1565 }
1566
1567 static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
1568 {
1569         if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1570                 vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
1571                 vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
1572                 vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
1573                 vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
1574         }
1575
1576         __set_bit(VCPU_EXREG_PDPTR,
1577                   (unsigned long *)&vcpu->arch.regs_avail);
1578         __set_bit(VCPU_EXREG_PDPTR,
1579                   (unsigned long *)&vcpu->arch.regs_dirty);
1580 }
1581
1582 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
1583
1584 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1585                                         unsigned long cr0,
1586                                         struct kvm_vcpu *vcpu)
1587 {
1588         if (!(cr0 & X86_CR0_PG)) {
1589                 /* From paging/starting to nonpaging */
1590                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1591                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
1592                              (CPU_BASED_CR3_LOAD_EXITING |
1593                               CPU_BASED_CR3_STORE_EXITING));
1594                 vcpu->arch.cr0 = cr0;
1595                 vmx_set_cr4(vcpu, vcpu->arch.cr4);
1596                 *hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
1597                 *hw_cr0 &= ~X86_CR0_WP;
1598         } else if (!is_paging(vcpu)) {
1599                 /* From nonpaging to paging */
1600                 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1601                              vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
1602                              ~(CPU_BASED_CR3_LOAD_EXITING |
1603                                CPU_BASED_CR3_STORE_EXITING));
1604                 vcpu->arch.cr0 = cr0;
1605                 vmx_set_cr4(vcpu, vcpu->arch.cr4);
1606                 if (!(vcpu->arch.cr0 & X86_CR0_WP))
1607                         *hw_cr0 &= ~X86_CR0_WP;
1608         }
1609 }
1610
1611 static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
1612                                         struct kvm_vcpu *vcpu)
1613 {
1614         if (!is_paging(vcpu)) {
1615                 *hw_cr4 &= ~X86_CR4_PAE;
1616                 *hw_cr4 |= X86_CR4_PSE;
1617         } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
1618                 *hw_cr4 &= ~X86_CR4_PAE;
1619 }
1620
1621 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1622 {
1623         unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
1624                                 KVM_VM_CR0_ALWAYS_ON;
1625
1626         vmx_fpu_deactivate(vcpu);
1627
1628         if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
1629                 enter_pmode(vcpu);
1630
1631         if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
1632                 enter_rmode(vcpu);
1633
1634 #ifdef CONFIG_X86_64
1635         if (vcpu->arch.shadow_efer & EFER_LME) {
1636                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1637                         enter_lmode(vcpu);
1638                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
1639                         exit_lmode(vcpu);
1640         }
1641 #endif
1642
1643         if (enable_ept)
1644                 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1645
1646         vmcs_writel(CR0_READ_SHADOW, cr0);
1647         vmcs_writel(GUEST_CR0, hw_cr0);
1648         vcpu->arch.cr0 = cr0;
1649
1650         if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1651                 vmx_fpu_activate(vcpu);
1652 }
1653
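     /*
      * Build the EPT pointer from the default memory type, the default
      * guest address width and the page-aligned host-physical address
      * of the EPT root table.
      */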
1654 static u64 construct_eptp(unsigned long root_hpa)
1655 {
1656         u64 eptp;
1657
1658         /* TODO write the value reading from MSR */
1659         eptp = VMX_EPT_DEFAULT_MT |
1660                 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
1661         eptp |= (root_hpa & PAGE_MASK);
1662
1663         return eptp;
1664 }
1665
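     /*
      * Load a new guest CR3. With EPT the EPT pointer is rewritten as
      * well, and an unpaged guest is pointed at the identity-mapped
      * page table instead of its own CR3.
      */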
1666 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1667 {
1668         unsigned long guest_cr3;
1669         u64 eptp;
1670
1671         guest_cr3 = cr3;
1672         if (enable_ept) {
1673                 eptp = construct_eptp(cr3);
1674                 vmcs_write64(EPT_POINTER, eptp);
1675                 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
1676                         VMX_EPT_IDENTITY_PAGETABLE_ADDR;
1677         }
1678
1679         vmx_flush_tlb(vcpu);
1680         vmcs_writel(GUEST_CR3, guest_cr3);
1681         if (vcpu->arch.cr0 & X86_CR0_PE)
1682                 vmx_fpu_deactivate(vcpu);
1683 }
1684
1685 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1686 {
1687         unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
1688                     KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1689
1690         vcpu->arch.cr4 = cr4;
1691         if (enable_ept)
1692                 ept_update_paging_mode_cr4(&hw_cr4, vcpu);
1693
1694         vmcs_writel(CR4_READ_SHADOW, cr4);
1695         vmcs_writel(GUEST_CR4, hw_cr4);
1696 }
1697
1698 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1699 {
1700         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1701
1702         return vmcs_readl(sf->base);
1703 }
1704
1705 static void vmx_get_segment(struct kvm_vcpu *vcpu,
1706                             struct kvm_segment *var, int seg)
1707 {
1708         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1709         u32 ar;
1710
1711         var->base = vmcs_readl(sf->base);
1712         var->limit = vmcs_read32(sf->limit);
1713         var->selector = vmcs_read16(sf->selector);
1714         ar = vmcs_read32(sf->ar_bytes);
1715         if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
1716                 ar = 0;
1717         var->type = ar & 15;
1718         var->s = (ar >> 4) & 1;
1719         var->dpl = (ar >> 5) & 3;
1720         var->present = (ar >> 7) & 1;
1721         var->avl = (ar >> 12) & 1;
1722         var->l = (ar >> 13) & 1;
1723         var->db = (ar >> 14) & 1;
1724         var->g = (ar >> 15) & 1;
1725         var->unusable = (ar >> 16) & 1;
1726 }
1727
1728 static int vmx_get_cpl(struct kvm_vcpu *vcpu)
1729 {
1730         struct kvm_segment kvm_seg;
1731
1732         if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
1733                 return 0;
1734
1735         if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
1736                 return 3;
1737
1738         vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS);
1739         return kvm_seg.selector & 3;
1740 }
1741
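     /* Pack a struct kvm_segment into the VMX access-rights (AR bytes) format. */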
1742 static u32 vmx_segment_access_rights(struct kvm_segment *var)
1743 {
1744         u32 ar;
1745
1746         if (var->unusable)
1747                 ar = 1 << 16;
1748         else {
1749                 ar = var->type & 15;
1750                 ar |= (var->s & 1) << 4;
1751                 ar |= (var->dpl & 3) << 5;
1752                 ar |= (var->present & 1) << 7;
1753                 ar |= (var->avl & 1) << 12;
1754                 ar |= (var->l & 1) << 13;
1755                 ar |= (var->db & 1) << 14;
1756                 ar |= (var->g & 1) << 15;
1757         }
1758         if (ar == 0) /* a 0 value means unusable */
1759                 ar = AR_UNUSABLE_MASK;
1760
1761         return ar;
1762 }
1763
1764 static void vmx_set_segment(struct kvm_vcpu *vcpu,
1765                             struct kvm_segment *var, int seg)
1766 {
1767         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1768         u32 ar;
1769
1770         if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
1771                 vcpu->arch.rmode.tr.selector = var->selector;
1772                 vcpu->arch.rmode.tr.base = var->base;
1773                 vcpu->arch.rmode.tr.limit = var->limit;
1774                 vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var);
1775                 return;
1776         }
1777         vmcs_writel(sf->base, var->base);
1778         vmcs_write32(sf->limit, var->limit);
1779         vmcs_write16(sf->selector, var->selector);
1780         if (vcpu->arch.rmode.vm86_active && var->s) {
1781                 /*
1782                  * Hack real-mode segments into vm86 compatibility.
1783                  */
1784                 if (var->base == 0xffff0000 && var->selector == 0xf000)
1785                         vmcs_writel(sf->base, 0xf0000);
1786                 ar = 0xf3;
1787         } else
1788                 ar = vmx_segment_access_rights(var);
1789         vmcs_write32(sf->ar_bytes, ar);
1790 }
1791
1792 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
1793 {
1794         u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
1795
1796         *db = (ar >> 14) & 1;
1797         *l = (ar >> 13) & 1;
1798 }
1799
1800 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1801 {
1802         dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
1803         dt->base = vmcs_readl(GUEST_IDTR_BASE);
1804 }
1805
1806 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1807 {
1808         vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
1809         vmcs_writel(GUEST_IDTR_BASE, dt->base);
1810 }
1811
1812 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1813 {
1814         dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
1815         dt->base = vmcs_readl(GUEST_GDTR_BASE);
1816 }
1817
1818 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1819 {
1820         vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
1821         vmcs_writel(GUEST_GDTR_BASE, dt->base);
1822 }
1823
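     /*
      * A segment is usable for vm86 real-mode emulation only if its base
      * equals selector << 4, its limit is 0xffff and its access rights
      * are the vm86 value 0xf3.
      */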
1824 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
1825 {
1826         struct kvm_segment var;
1827         u32 ar;
1828
1829         vmx_get_segment(vcpu, &var, seg);
1830         ar = vmx_segment_access_rights(&var);
1831
1832         if (var.base != (var.selector << 4))
1833                 return false;
1834         if (var.limit != 0xffff)
1835                 return false;
1836         if (ar != 0xf3)
1837                 return false;
1838
1839         return true;
1840 }
1841
1842 static bool code_segment_valid(struct kvm_vcpu *vcpu)
1843 {
1844         struct kvm_segment cs;
1845         unsigned int cs_rpl;
1846
1847         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1848         cs_rpl = cs.selector & SELECTOR_RPL_MASK;
1849
1850         if (cs.unusable)
1851                 return false;
1852         if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
1853                 return false;
1854         if (!cs.s)
1855                 return false;
1856         if (cs.type & AR_TYPE_WRITEABLE_MASK) {
1857                 if (cs.dpl > cs_rpl)
1858                         return false;
1859         } else {
1860                 if (cs.dpl != cs_rpl)
1861                         return false;
1862         }
1863         if (!cs.present)
1864                 return false;
1865
1866         /* TODO: Add a reserved-field check; this will require a new member in the kvm_segment_field structure */
1867         return true;
1868 }
1869
1870 static bool stack_segment_valid(struct kvm_vcpu *vcpu)
1871 {
1872         struct kvm_segment ss;
1873         unsigned int ss_rpl;
1874
1875         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1876         ss_rpl = ss.selector & SELECTOR_RPL_MASK;
1877
1878         if (ss.unusable)
1879                 return true;
1880         if (ss.type != 3 && ss.type != 7)
1881                 return false;
1882         if (!ss.s)
1883                 return false;
1884         if (ss.dpl != ss_rpl) /* DPL != RPL */
1885                 return false;
1886         if (!ss.present)
1887                 return false;
1888
1889         return true;
1890 }
1891
1892 static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
1893 {
1894         struct kvm_segment var;
1895         unsigned int rpl;
1896
1897         vmx_get_segment(vcpu, &var, seg);
1898         rpl = var.selector & SELECTOR_RPL_MASK;
1899
1900         if (var.unusable)
1901                 return true;
1902         if (!var.s)
1903                 return false;
1904         if (!var.present)
1905                 return false;
1906         if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
1907                 if (var.dpl < rpl) /* DPL < RPL */
1908                         return false;
1909         }
1910
1911         /* TODO: Add other members to kvm_segment_field to allow checking for other access
1912          * rights flags
1913          */
1914         return true;
1915 }
1916
1917 static bool tr_valid(struct kvm_vcpu *vcpu)
1918 {
1919         struct kvm_segment tr;
1920
1921         vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
1922
1923         if (tr.unusable)
1924                 return false;
1925         if (tr.selector & SELECTOR_TI_MASK)     /* TI = 1 */
1926                 return false;
1927         if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
1928                 return false;
1929         if (!tr.present)
1930                 return false;
1931
1932         return true;
1933 }
1934
1935 static bool ldtr_valid(struct kvm_vcpu *vcpu)
1936 {
1937         struct kvm_segment ldtr;
1938
1939         vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
1940
1941         if (ldtr.unusable)
1942                 return true;
1943         if (ldtr.selector & SELECTOR_TI_MASK)   /* TI = 1 */
1944                 return false;
1945         if (ldtr.type != 2)
1946                 return false;
1947         if (!ldtr.present)
1948                 return false;
1949
1950         return true;
1951 }
1952
1953 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
1954 {
1955         struct kvm_segment cs, ss;
1956
1957         vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1958         vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1959
1960         return ((cs.selector & SELECTOR_RPL_MASK) ==
1961                  (ss.selector & SELECTOR_RPL_MASK));
1962 }
1963
1964 /*
1965  * Check if guest state is valid. Returns true if valid, false if
1966  * not.
1967  * We assume that registers are always usable
1968  */
1969 static bool guest_state_valid(struct kvm_vcpu *vcpu)
1970 {
1971         /* real mode guest state checks */
1972         if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
1973                 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
1974                         return false;
1975                 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
1976                         return false;
1977                 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
1978                         return false;
1979                 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
1980                         return false;
1981                 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
1982                         return false;
1983                 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
1984                         return false;
1985         } else {
1986         /* protected mode guest state checks */
1987                 if (!cs_ss_rpl_check(vcpu))
1988                         return false;
1989                 if (!code_segment_valid(vcpu))
1990                         return false;
1991                 if (!stack_segment_valid(vcpu))
1992                         return false;
1993                 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
1994                         return false;
1995                 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
1996                         return false;
1997                 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
1998                         return false;
1999                 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
2000                         return false;
2001                 if (!tr_valid(vcpu))
2002                         return false;
2003                 if (!ldtr_valid(vcpu))
2004                         return false;
2005         }
2006         /* TODO:
2007          * - Add checks on RIP
2008          * - Add checks on RFLAGS
2009          */
2010
2011         return true;
2012 }
2013
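     /*
      * Set up the fake TSS used while the guest runs in vm86-emulated
      * real mode: clear the three TSS pages, point the I/O bitmap base
      * past the interrupt redirection map and terminate the bitmap with
      * an all-ones byte.
      */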
2014 static int init_rmode_tss(struct kvm *kvm)
2015 {
2016         gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
2017         u16 data = 0;
2018         int ret = 0;
2019         int r;
2020
2021         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2022         if (r < 0)
2023                 goto out;
2024         data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
2025         r = kvm_write_guest_page(kvm, fn++, &data,
2026                         TSS_IOPB_BASE_OFFSET, sizeof(u16));
2027         if (r < 0)
2028                 goto out;
2029         r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
2030         if (r < 0)
2031                 goto out;
2032         r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2033         if (r < 0)
2034                 goto out;
2035         data = ~0;
2036         r = kvm_write_guest_page(kvm, fn, &data,
2037                                  RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
2038                                  sizeof(u8));
2039         if (r < 0)
2040                 goto out;
2041
2042         ret = 1;
2043 out:
2044         return ret;
2045 }
2046
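     /*
      * Build the 4MB-page identity map that EPT guests execute on while
      * they are still in real mode; done only once per VM.
      */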
2047 static int init_rmode_identity_map(struct kvm *kvm)
2048 {
2049         int i, r, ret;
2050         pfn_t identity_map_pfn;
2051         u32 tmp;
2052
2053         if (!enable_ept)
2054                 return 1;
2055         if (unlikely(!kvm->arch.ept_identity_pagetable)) {
2056                 printk(KERN_ERR "EPT: identity-mapping pagetable "
2057                         "hasn't been allocated!\n");
2058                 return 0;
2059         }
2060         if (likely(kvm->arch.ept_identity_pagetable_done))
2061                 return 1;
2062         ret = 0;
2063         identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT;
2064         r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2065         if (r < 0)
2066                 goto out;
2067         /* Set up identity-mapping pagetable for EPT in real mode */
2068         for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
2069                 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
2070                         _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
2071                 r = kvm_write_guest_page(kvm, identity_map_pfn,
2072                                 &tmp, i * sizeof(tmp), sizeof(tmp));
2073                 if (r < 0)
2074                         goto out;
2075         }
2076         kvm->arch.ept_identity_pagetable_done = true;
2077         ret = 1;
2078 out:
2079         return ret;
2080 }
2081
2082 static void seg_setup(int seg)
2083 {
2084         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2085
2086         vmcs_write16(sf->selector, 0);
2087         vmcs_writel(sf->base, 0);
2088         vmcs_write32(sf->limit, 0xffff);
2089         vmcs_write32(sf->ar_bytes, 0xf3);
2090 }
2091
2092 static int alloc_apic_access_page(struct kvm *kvm)
2093 {
2094         struct kvm_userspace_memory_region kvm_userspace_mem;
2095         int r = 0;
2096
2097         down_write(&kvm->slots_lock);
2098         if (kvm->arch.apic_access_page)
2099                 goto out;
2100         kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
2101         kvm_userspace_mem.flags = 0;
2102         kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL;
2103         kvm_userspace_mem.memory_size = PAGE_SIZE;
2104         r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
2105         if (r)
2106                 goto out;
2107
2108         kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
2109 out:
2110         up_write(&kvm->slots_lock);
2111         return r;
2112 }
2113
2114 static int alloc_identity_pagetable(struct kvm *kvm)
2115 {
2116         struct kvm_userspace_memory_region kvm_userspace_mem;
2117         int r = 0;
2118
2119         down_write(&kvm->slots_lock);
2120         if (kvm->arch.ept_identity_pagetable)
2121                 goto out;
2122         kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
2123         kvm_userspace_mem.flags = 0;
2124         kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
2125         kvm_userspace_mem.memory_size = PAGE_SIZE;
2126         r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0);
2127         if (r)
2128                 goto out;
2129
2130         kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
2131                         VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
2132 out:
2133         up_write(&kvm->slots_lock);
2134         return r;
2135 }
2136
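     /*
      * Reserve a VPID for this vCPU from the global bitmap; vmx->vpid
      * stays 0 when VPID is disabled or no free IDs remain.
      */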
2137 static void allocate_vpid(struct vcpu_vmx *vmx)
2138 {
2139         int vpid;
2140
2141         vmx->vpid = 0;
2142         if (!enable_vpid)
2143                 return;
2144         spin_lock(&vmx_vpid_lock);
2145         vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
2146         if (vpid < VMX_NR_VPIDS) {
2147                 vmx->vpid = vpid;
2148                 __set_bit(vpid, vmx_vpid_bitmap);
2149         }
2150         spin_unlock(&vmx_vpid_lock);
2151 }
2152
2153 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
2154 {
2155         int f = sizeof(unsigned long);
2156
2157         if (!cpu_has_vmx_msr_bitmap())
2158                 return;
2159
2160         /*
2161          * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
2162          * have the write-low and read-high bitmap offsets the wrong way round.
2163          * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
2164          */
2165         if (msr <= 0x1fff) {
2166                 __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
2167                 __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
2168         } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
2169                 msr &= 0x1fff;
2170                 __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
2171                 __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
2172         }
2173 }
2174
2175 static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
2176 {
2177         if (!longmode_only)
2178                 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
2179         __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
2180 }
2181
2182 /*
2183  * Sets up the vmcs for emulated real mode.
2184  */
2185 static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2186 {
2187         u32 host_sysenter_cs, msr_low, msr_high;
2188         u32 junk;
2189         u64 host_pat, tsc_this, tsc_base;
2190         unsigned long a;
2191         struct descriptor_table dt;
2192         int i;
2193         unsigned long kvm_vmx_return;
2194         u32 exec_control;
2195
2196         /* I/O */
2197         vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
2198         vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
2199
2200         if (cpu_has_vmx_msr_bitmap())
2201                 vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
2202
2203         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
2204
2205         /* Control */
2206         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
2207                 vmcs_config.pin_based_exec_ctrl);
2208
2209         exec_control = vmcs_config.cpu_based_exec_ctrl;
2210         if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
2211                 exec_control &= ~CPU_BASED_TPR_SHADOW;
2212 #ifdef CONFIG_X86_64
2213                 exec_control |= CPU_BASED_CR8_STORE_EXITING |
2214                                 CPU_BASED_CR8_LOAD_EXITING;
2215 #endif
2216         }
2217         if (!enable_ept)
2218                 exec_control |= CPU_BASED_CR3_STORE_EXITING |
2219                                 CPU_BASED_CR3_LOAD_EXITING  |
2220                                 CPU_BASED_INVLPG_EXITING;
2221         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
2222
2223         if (cpu_has_secondary_exec_ctrls()) {
2224                 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
2225                 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
2226                         exec_control &=
2227                                 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2228                 if (vmx->vpid == 0)
2229                         exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2230                 if (!enable_ept)
2231                         exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2232                 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2233         }
2234
2235         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
2236         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2237         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
2238
2239         vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
2240         vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
2241         vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
2242
2243         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
2244         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
2245         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
2246         vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs());    /* 22.2.4 */
2247         vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs());    /* 22.2.4 */
2248         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
2249 #ifdef CONFIG_X86_64
2250         rdmsrl(MSR_FS_BASE, a);
2251         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
2252         rdmsrl(MSR_GS_BASE, a);
2253         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
2254 #else
2255         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
2256         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
2257 #endif
2258
2259         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
2260
2261         kvm_get_idt(&dt);
2262         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
2263
2264         asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
2265         vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
2266         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
2267         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2268         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2269
2270         rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
2271         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
2272         rdmsrl(MSR_IA32_SYSENTER_ESP, a);
2273         vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
2274         rdmsrl(MSR_IA32_SYSENTER_EIP, a);
2275         vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
2276
2277         if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
2278                 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2279                 host_pat = msr_low | ((u64) msr_high << 32);
2280                 vmcs_write64(HOST_IA32_PAT, host_pat);
2281         }
2282         if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2283                 rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
2284                 host_pat = msr_low | ((u64) msr_high << 32);
2285                 /* Write the default value, following the host PAT */
2286                 vmcs_write64(GUEST_IA32_PAT, host_pat);
2287                 /* Keep arch.pat in sync with GUEST_IA32_PAT */
2288                 vmx->vcpu.arch.pat = host_pat;
2289         }
2290
2291         for (i = 0; i < NR_VMX_MSR; ++i) {
2292                 u32 index = vmx_msr_index[i];
2293                 u32 data_low, data_high;
2294                 u64 data;
2295                 int j = vmx->nmsrs;
2296
2297                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
2298                         continue;
2299                 if (wrmsr_safe(index, data_low, data_high) < 0)
2300                         continue;
2301                 data = data_low | ((u64)data_high << 32);
2302                 vmx->host_msrs[j].index = index;
2303                 vmx->host_msrs[j].reserved = 0;
2304                 vmx->host_msrs[j].data = data;
2305                 vmx->guest_msrs[j] = vmx->host_msrs[j];
2306                 ++vmx->nmsrs;
2307         }
2308
2309         vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
2310
2311         /* 22.2.1, 20.8.1 */
2312         vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2313
2314         vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2315         vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
2316
2317         tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
2318         rdtscll(tsc_this);
2319         if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
2320                 tsc_base = tsc_this;
2321
2322         guest_write_tsc(0, tsc_base);
2323
2324         return 0;
2325 }
2326
2327 static int init_rmode(struct kvm *kvm)
2328 {
2329         if (!init_rmode_tss(kvm))
2330                 return 0;
2331         if (!init_rmode_identity_map(kvm))
2332                 return 0;
2333         return 1;
2334 }
2335
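     /*
      * Put the vCPU into its power-on state: real mode, flat 64KB
      * segments, CS/RIP at the reset vector for the BSP or derived from
      * the SIPI vector for APs, and architectural reset values for the
      * remaining control state.
      */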
2336 static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2337 {
2338         struct vcpu_vmx *vmx = to_vmx(vcpu);
2339         u64 msr;
2340         int ret;
2341
2342         vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2343         down_read(&vcpu->kvm->slots_lock);
2344         if (!init_rmode(vmx->vcpu.kvm)) {
2345                 ret = -ENOMEM;
2346                 goto out;
2347         }
2348
2349         vmx->vcpu.arch.rmode.vm86_active = 0;
2350
2351         vmx->soft_vnmi_blocked = 0;
2352
2353         vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
2354         kvm_set_cr8(&vmx->vcpu, 0);
2355         msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
2356         if (vmx->vcpu.vcpu_id == 0)
2357                 msr |= MSR_IA32_APICBASE_BSP;
2358         kvm_set_apic_base(&vmx->vcpu, msr);
2359
2360         fx_init(&vmx->vcpu);
2361
2362         seg_setup(VCPU_SREG_CS);
2363         /*
2364          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
2365          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
2366          */
2367         if (vmx->vcpu.vcpu_id == 0) {
2368                 vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
2369                 vmcs_writel(GUEST_CS_BASE, 0x000f0000);
2370         } else {
2371                 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
2372                 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
2373         }
2374
2375         seg_setup(VCPU_SREG_DS);
2376         seg_setup(VCPU_SREG_ES);
2377         seg_setup(VCPU_SREG_FS);
2378         seg_setup(VCPU_SREG_GS);
2379         seg_setup(VCPU_SREG_SS);
2380
2381         vmcs_write16(GUEST_TR_SELECTOR, 0);
2382         vmcs_writel(GUEST_TR_BASE, 0);
2383         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
2384         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
2385
2386         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
2387         vmcs_writel(GUEST_LDTR_BASE, 0);
2388         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
2389         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
2390
2391         vmcs_write32(GUEST_SYSENTER_CS, 0);
2392         vmcs_writel(GUEST_SYSENTER_ESP, 0);
2393         vmcs_writel(GUEST_SYSENTER_EIP, 0);
2394
2395         vmcs_writel(GUEST_RFLAGS, 0x02);
2396         if (vmx->vcpu.vcpu_id == 0)
2397                 kvm_rip_write(vcpu, 0xfff0);
2398         else
2399                 kvm_rip_write(vcpu, 0);
2400         kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
2401
2402         vmcs_writel(GUEST_DR7, 0x400);
2403
2404         vmcs_writel(GUEST_GDTR_BASE, 0);
2405         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
2406
2407         vmcs_writel(GUEST_IDTR_BASE, 0);
2408         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
2409
2410         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
2411         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
2412         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
2413
2414         /* Special registers */
2415         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
2416
2417         setup_msrs(vmx);
2418
2419         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
2420
2421         if (cpu_has_vmx_tpr_shadow()) {
2422                 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
2423                 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
2424                         vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
2425                                 page_to_phys(vmx->vcpu.arch.apic->regs_page));
2426                 vmcs_write32(TPR_THRESHOLD, 0);
2427         }
2428
2429         if (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
2430                 vmcs_write64(APIC_ACCESS_ADDR,
2431                              page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
2432
2433         if (vmx->vpid != 0)
2434                 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2435
2436         vmx->vcpu.arch.cr0 = 0x60000010;
2437         vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
2438         vmx_set_cr4(&vmx->vcpu, 0);
2439         vmx_set_efer(&vmx->vcpu, 0);
2440         vmx_fpu_activate(&vmx->vcpu);
2441         update_exception_bitmap(&vmx->vcpu);
2442
2443         vpid_sync_vcpu_all(vmx);
2444
2445         ret = 0;
2446
2447         /* HACK: Don't enable emulation on guest boot/reset */
2448         vmx->emulation_required = 0;
2449
2450 out:
2451         up_read(&vcpu->kvm->slots_lock);
2452         return ret;
2453 }
2454
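     /*
      * Request a vmexit as soon as the guest is able to accept an
      * external interrupt (interrupt-window exiting).
      */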
2455 static void enable_irq_window(struct kvm_vcpu *vcpu)
2456 {
2457         u32 cpu_based_vm_exec_control;
2458
2459         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2460         cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
2461         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2462 }
2463
2464 static void enable_nmi_window(struct kvm_vcpu *vcpu)
2465 {
2466         u32 cpu_based_vm_exec_control;
2467
2468         if (!cpu_has_virtual_nmis()) {
2469                 enable_irq_window(vcpu);
2470                 return;
2471         }
2472
2473         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2474         cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2475         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2476 }
2477
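     /*
      * Inject the queued interrupt. For a vm86-emulated real-mode guest
      * the event is recorded in vmx->rmode.irq and delivered as a soft
      * interrupt with instruction length 1, with RIP wound back by one
      * to compensate for the declared instruction length.
      */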
2478 static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2479 {
2480         struct vcpu_vmx *vmx = to_vmx(vcpu);
2481         uint32_t intr;
2482         int irq = vcpu->arch.interrupt.nr;
2483
2484         KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
2485
2486         ++vcpu->stat.irq_injections;
2487         if (vcpu->arch.rmode.vm86_active) {
2488                 vmx->rmode.irq.pending = true;
2489                 vmx->rmode.irq.vector = irq;
2490                 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2491                 if (vcpu->arch.interrupt.soft)
2492                         vmx->rmode.irq.rip +=
2493                                 vmx->vcpu.arch.event_exit_inst_len;
2494                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2495                              irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2496                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2497                 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2498                 return;
2499         }
2500         intr = irq | INTR_INFO_VALID_MASK;
2501         if (vcpu->arch.interrupt.soft) {
2502                 intr |= INTR_TYPE_SOFT_INTR;
2503                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2504                              vmx->vcpu.arch.event_exit_inst_len);
2505         } else
2506                 intr |= INTR_TYPE_EXT_INTR;
2507         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
2508 }
2509
2510 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2511 {
2512         struct vcpu_vmx *vmx = to_vmx(vcpu);
2513
2514         if (!cpu_has_virtual_nmis()) {
2515                 /*
2516                  * Tracking the NMI-blocked state in software is built upon
2517                  * finding the next open IRQ window. This, in turn, depends on
2518                  * well-behaving guests: They have to keep IRQs disabled at
2519                  * least as long as the NMI handler runs. Otherwise we may
2520                  * cause NMI nesting, maybe breaking the guest. But as this is
2521                  * highly unlikely, we can live with the residual risk.
2522                  */
2523                 vmx->soft_vnmi_blocked = 1;
2524                 vmx->vnmi_blocked_time = 0;
2525         }
2526
2527         ++vcpu->stat.nmi_injections;
2528         if (vcpu->arch.rmode.vm86_active) {
2529                 vmx->rmode.irq.pending = true;
2530                 vmx->rmode.irq.vector = NMI_VECTOR;
2531                 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2532                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2533                              NMI_VECTOR | INTR_TYPE_SOFT_INTR |
2534                              INTR_INFO_VALID_MASK);
2535                 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2536                 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2537                 return;
2538         }
2539         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2540                         INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2541 }
2542
2543 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2544 {
2545         if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
2546                 return 0;
2547
2548         return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2549                         (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
2550                                 GUEST_INTR_STATE_NMI));
2551 }
2552
2553 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
2554 {
2555         return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
2556                 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2557                         (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
2558 }
2559
2560 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2561 {
2562         int ret;
2563         struct kvm_userspace_memory_region tss_mem = {
2564                 .slot = TSS_PRIVATE_MEMSLOT,
2565                 .guest_phys_addr = addr,
2566                 .memory_size = PAGE_SIZE * 3,
2567                 .flags = 0,
2568         };
2569
2570         ret = kvm_set_memory_region(kvm, &tss_mem, 0);
2571         if (ret)
2572                 return ret;
2573         kvm->arch.tss_addr = addr;
2574         return 0;
2575 }
2576
2577 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2578                                   int vec, u32 err_code)
2579 {
2580         /*
2581          * Instructions with the address-size override prefix (opcode 0x67)
2582          * cause a #SS fault with error code 0 in VM86 mode.
2583          */
2584         if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2585                 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
2586                         return 1;
2587         /*
2588          * Forward all other exceptions that are valid in real mode.
2589          * FIXME: Breaks guest debugging in real mode, needs to be fixed with
2590          *        the required debugging infrastructure rework.
2591          */
2592         switch (vec) {
2593         case DB_VECTOR:
2594                 if (vcpu->guest_debug &
2595                     (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
2596                         return 0;
2597                 kvm_queue_exception(vcpu, vec);
2598                 return 1;
2599         case BP_VECTOR:
2600                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2601                         return 0;
2602                 /* fall through */
2603         case DE_VECTOR:
2604         case OF_VECTOR:
2605         case BR_VECTOR:
2606         case UD_VECTOR:
2607         case DF_VECTOR:
2608         case SS_VECTOR:
2609         case GP_VECTOR:
2610         case MF_VECTOR:
2611                 kvm_queue_exception(vcpu, vec);
2612                 return 1;
2613         }
2614         return 0;
2615 }
2616
2617 /*
2618  * Trigger machine check on the host. We assume all the MSRs are already set up
2619  * by the CPU and that we still run on the same CPU as the MCE occurred on.
2620  * We pass a fake environment to the machine check handler because we want
2621  * the guest to always be treated like user space, no matter what context
2622  * it used internally.
2623  */
2624 static void kvm_machine_check(void)
2625 {
2626 #if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
2627         struct pt_regs regs = {
2628                 .cs = 3, /* Fake ring 3 no matter what the guest ran on */
2629                 .flags = X86_EFLAGS_IF,
2630         };
2631
2632         do_machine_check(&regs, 0);
2633 #endif
2634 }
2635
2636 static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2637 {
2638         /* already handled by vcpu_run */
2639         return 1;
2640 }
2641
2642 static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2643 {
2644         struct vcpu_vmx *vmx = to_vmx(vcpu);
2645         u32 intr_info, ex_no, error_code;
2646         unsigned long cr2, rip, dr6;
2647         u32 vect_info;
2648         enum emulation_result er;
2649
2650         vect_info = vmx->idt_vectoring_info;
2651         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2652
2653         if (is_machine_check(intr_info))
2654                 return handle_machine_check(vcpu, kvm_run);
2655
2656         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2657                                                 !is_page_fault(intr_info))
2658                 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
2659                        "intr info 0x%x\n", __func__, vect_info, intr_info);
2660
2661         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2662                 return 1;  /* already handled by vmx_vcpu_run() */
2663
2664         if (is_no_device(intr_info)) {
2665                 vmx_fpu_activate(vcpu);
2666                 return 1;
2667         }
2668
2669         if (is_invalid_opcode(intr_info)) {
2670                 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
2671                 if (er != EMULATE_DONE)
2672                         kvm_queue_exception(vcpu, UD_VECTOR);
2673                 return 1;
2674         }
2675
2676         error_code = 0;
2677         rip = kvm_rip_read(vcpu);
2678         if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
2679                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2680         if (is_page_fault(intr_info)) {
2681                 /* EPT won't cause page fault directly */
2682                 if (enable_ept)
2683                         BUG();
2684                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2685                 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2686                             (u32)((u64)cr2 >> 32), handler);
2687                 if (kvm_event_needs_reinjection(vcpu))
2688                         kvm_mmu_unprotect_page_virt(vcpu, cr2);
2689                 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2690         }
2691
2692         if (vcpu->arch.rmode.vm86_active &&
2693             handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
2694                                                                 error_code)) {
2695                 if (vcpu->arch.halt_request) {
2696                         vcpu->arch.halt_request = 0;
2697                         return kvm_emulate_halt(vcpu);
2698                 }
2699                 return 1;
2700         }
2701
2702         ex_no = intr_info & INTR_INFO_VECTOR_MASK;
2703         switch (ex_no) {
2704         case DB_VECTOR:
2705                 dr6 = vmcs_readl(EXIT_QUALIFICATION);
2706                 if (!(vcpu->guest_debug &
2707                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
2708                         vcpu->arch.dr6 = dr6 | DR6_FIXED_1;
2709                         kvm_queue_exception(vcpu, DB_VECTOR);
2710                         return 1;
2711                 }
2712                 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
2713                 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
2714                 /* fall through */
2715         case BP_VECTOR:
2716                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2717                 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
2718                 kvm_run->debug.arch.exception = ex_no;
2719                 break;
2720         default:
2721                 kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
2722                 kvm_run->ex.exception = ex_no;
2723                 kvm_run->ex.error_code = error_code;
2724                 break;
2725         }
2726         return 0;
2727 }
2728
2729 static int handle_external_interrupt(struct kvm_vcpu *vcpu,
2730                                      struct kvm_run *kvm_run)
2731 {
2732         ++vcpu->stat.irq_exits;
2733         KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
2734         return 1;
2735 }
2736
2737 static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2738 {
2739         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2740         return 0;
2741 }
2742
2743 static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2744 {
2745         unsigned long exit_qualification;
2746         int size, in, string;
2747         unsigned port;
2748
2749         ++vcpu->stat.io_exits;
2750         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2751         string = (exit_qualification & 16) != 0;
2752
2753         if (string) {
2754                 if (emulate_instruction(vcpu,
2755                                         kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
2756                         return 0;
2757                 return 1;
2758         }
2759
2760         size = (exit_qualification & 7) + 1;
2761         in = (exit_qualification & 8) != 0;
2762         port = exit_qualification >> 16;
2763
2764         skip_emulated_instruction(vcpu);
2765         return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
2766 }
2767
2768 static void
2769 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2770 {
2771         /*
2772          * Patch in the VMCALL instruction:
2773          */
2774         hypercall[0] = 0x0f;
2775         hypercall[1] = 0x01;
2776         hypercall[2] = 0xc1;
2777 }
2778
2779 static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2780 {
2781         unsigned long exit_qualification;
2782         int cr;
2783         int reg;
2784
2785         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2786         cr = exit_qualification & 15;
2787         reg = (exit_qualification >> 8) & 15;
2788         switch ((exit_qualification >> 4) & 3) {
2789         case 0: /* mov to cr */
2790                 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
2791                             (u32)kvm_register_read(vcpu, reg),
2792                             (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2793                             handler);
2794                 switch (cr) {
2795                 case 0:
2796                         kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
2797                         skip_emulated_instruction(vcpu);
2798                         return 1;
2799                 case 3:
2800                         kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
2801                         skip_emulated_instruction(vcpu);
2802                         return 1;
2803                 case 4:
2804                         kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
2805                         skip_emulated_instruction(vcpu);
2806                         return 1;
2807                 case 8: {
2808                                 u8 cr8_prev = kvm_get_cr8(vcpu);
2809                                 u8 cr8 = kvm_register_read(vcpu, reg);
2810                                 kvm_set_cr8(vcpu, cr8);
2811                                 skip_emulated_instruction(vcpu);
2812                                 if (irqchip_in_kernel(vcpu->kvm))
2813                                         return 1;
2814                                 if (cr8_prev <= cr8)
2815                                         return 1;
2816                                 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2817                                 return 0;
2818                         }
2819                 };
2820                 break;
2821         case 2: /* clts */
2822                 vmx_fpu_deactivate(vcpu);
2823                 vcpu->arch.cr0 &= ~X86_CR0_TS;
2824                 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2825                 vmx_fpu_activate(vcpu);
2826                 KVMTRACE_0D(CLTS, vcpu, handler);
2827                 skip_emulated_instruction(vcpu);
2828                 return 1;
2829         case 1: /*mov from cr*/
2830                 switch (cr) {
2831                 case 3:
2832                         kvm_register_write(vcpu, reg, vcpu->arch.cr3);
2833                         KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
2834                                     (u32)kvm_register_read(vcpu, reg),
2835                                     (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2836                                     handler);
2837                         skip_emulated_instruction(vcpu);
2838                         return 1;
2839                 case 8:
2840                         kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
2841                         KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
2842                                     (u32)kvm_register_read(vcpu, reg), handler);
2843                         skip_emulated_instruction(vcpu);
2844                         return 1;
2845                 }
2846                 break;
2847         case 3: /* lmsw */
2848                 kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
2849
2850                 skip_emulated_instruction(vcpu);
2851                 return 1;
2852         default:
2853                 break;
2854         }
2855         kvm_run->exit_reason = 0;
2856         pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2857                (int)(exit_qualification >> 4) & 3, cr);
2858         return 0;
2859 }
2860
2861 static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2862 {
2863         unsigned long exit_qualification;
2864         unsigned long val;
2865         int dr, reg;
2866
2867         dr = vmcs_readl(GUEST_DR7);
2868         if (dr & DR7_GD) {
2869                 /*
2870                  * As the vm-exit takes precedence over the debug trap, we
2871                  * need to emulate the latter, either for the host or the
2872                  * guest debugging itself.
2873                  */
2874                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
2875                         kvm_run->debug.arch.dr6 = vcpu->arch.dr6;
2876                         kvm_run->debug.arch.dr7 = dr;
2877                         kvm_run->debug.arch.pc =
2878                                 vmcs_readl(GUEST_CS_BASE) +
2879                                 vmcs_readl(GUEST_RIP);
2880                         kvm_run->debug.arch.exception = DB_VECTOR;
2881                         kvm_run->exit_reason = KVM_EXIT_DEBUG;
2882                         return 0;
2883                 } else {
2884                         vcpu->arch.dr7 &= ~DR7_GD;
2885                         vcpu->arch.dr6 |= DR6_BD;
2886                         vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
2887                         kvm_queue_exception(vcpu, DB_VECTOR);
2888                         return 1;
2889                 }
2890         }
2891
2892         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2893         dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
2894         reg = DEBUG_REG_ACCESS_REG(exit_qualification);
2895         if (exit_qualification & TYPE_MOV_FROM_DR) {
2896                 switch (dr) {
2897                 case 0 ... 3:
2898                         val = vcpu->arch.db[dr];
2899                         break;
2900                 case 6:
2901                         val = vcpu->arch.dr6;
2902                         break;
2903                 case 7:
2904                         val = vcpu->arch.dr7;
2905                         break;
2906                 default:
2907                         val = 0;
2908                 }
2909                 kvm_register_write(vcpu, reg, val);
2910                 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
2911         } else {
2912                 val = vcpu->arch.regs[reg];
2913                 switch (dr) {
2914                 case 0 ... 3:
2915                         vcpu->arch.db[dr] = val;
2916                         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
2917                                 vcpu->arch.eff_db[dr] = val;
2918                         break;
2919                 case 4 ... 5:
2920                         if (vcpu->arch.cr4 & X86_CR4_DE)
2921                                 kvm_queue_exception(vcpu, UD_VECTOR);
2922                         break;
2923                 case 6:
2924                         if (val & 0xffffffff00000000ULL) {
2925                                 kvm_queue_exception(vcpu, GP_VECTOR);
2926                                 break;
2927                         }
2928                         vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
2929                         break;
2930                 case 7:
2931                         if (val & 0xffffffff00000000ULL) {
2932                                 kvm_queue_exception(vcpu, GP_VECTOR);
2933                                 break;
2934                         }
2935                         vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
2936                         if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
2937                                 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
2938                                 vcpu->arch.switch_db_regs =
2939                                         (val & DR7_BP_EN_MASK);
2940                         }
2941                         break;
2942                 }
2943                 KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler);
2944         }
2945         skip_emulated_instruction(vcpu);
2946         return 1;
2947 }
2948
2949 static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2950 {
2951         kvm_emulate_cpuid(vcpu);
2952         return 1;
2953 }
2954
2955 static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2956 {
2957         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2958         u64 data;
2959
2960         if (vmx_get_msr(vcpu, ecx, &data)) {
2961                 kvm_inject_gp(vcpu, 0);
2962                 return 1;
2963         }
2964
2965         KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32),
2966                     handler);
2967
2968         /* FIXME: handling of bits 32:63 of rax, rdx */
2969         vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
2970         vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
2971         skip_emulated_instruction(vcpu);
2972         return 1;
2973 }
2974
2975 static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2976 {
2977         u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
2978         u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
2979                 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2980
2981         KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32),
2982                     handler);
2983
2984         if (vmx_set_msr(vcpu, ecx, data) != 0) {
2985                 kvm_inject_gp(vcpu, 0);
2986                 return 1;
2987         }
2988
2989         skip_emulated_instruction(vcpu);
2990         return 1;
2991 }
2992
2993 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu,
2994                                       struct kvm_run *kvm_run)
2995 {
2996         return 1;
2997 }
2998
2999 static int handle_interrupt_window(struct kvm_vcpu *vcpu,
3000                                    struct kvm_run *kvm_run)
3001 {
3002         u32 cpu_based_vm_exec_control;
3003
3004         /* clear pending irq */
3005         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3006         cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
3007         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3008
3009         KVMTRACE_0D(PEND_INTR, vcpu, handler);
3010         ++vcpu->stat.irq_window_exits;
3011
3012         /*
3013          * If user space is waiting to inject interrupts, exit as soon as
3014          * possible.
3015          */
3016         if (!irqchip_in_kernel(vcpu->kvm) &&
3017             kvm_run->request_interrupt_window &&
3018             !kvm_cpu_has_interrupt(vcpu)) {
3019                 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3020                 return 0;
3021         }
3022         return 1;
3023 }
3024
3025 static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3026 {
3027         skip_emulated_instruction(vcpu);
3028         return kvm_emulate_halt(vcpu);
3029 }
3030
3031 static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3032 {
3033         skip_emulated_instruction(vcpu);
3034         kvm_emulate_hypercall(vcpu);
3035         return 1;
3036 }
3037
3038 static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3039 {
3040         kvm_queue_exception(vcpu, UD_VECTOR);
3041         return 1;
3042 }
3043
3044 static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3045 {
3046         unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3047
3048         kvm_mmu_invlpg(vcpu, exit_qualification);
3049         skip_emulated_instruction(vcpu);
3050         return 1;
3051 }
3052
3053 static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3054 {
3055         skip_emulated_instruction(vcpu);
3056         /* TODO: Add support for VT-d/pass-through device */
3057         return 1;
3058 }
3059
3060 static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3061 {
3062         unsigned long exit_qualification;
3063         enum emulation_result er;
3064         unsigned long offset;
3065
3066         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3067         offset = exit_qualification & 0xffful;
3068
3069         er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
3070
3071         if (er != EMULATE_DONE) {
3072                 printk(KERN_ERR
3073                        "Failed to handle APIC access vmexit! Offset is 0x%lx\n",
3074                        offset);
3075                 return -ENOTSUPP;
3076         }
3077         return 1;
3078 }
3079
3080 static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3081 {
3082         struct vcpu_vmx *vmx = to_vmx(vcpu);
3083         unsigned long exit_qualification;
3084         u16 tss_selector;
3085         int reason, type, idt_v;
3086
3087         idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
3088         type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
3089
3090         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3091
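        /*
         * Exit qualification layout for task-switch exits (Intel SDM):
         * bits 15:0 hold the selector of the new TSS and bits 31:30 encode
         * the source of the switch (CALL, IRET, JMP or a task gate in the
         * IDT), which is what "reason" and TASK_SWITCH_GATE below refer to.
         */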
3092         reason = (u32)exit_qualification >> 30;
3093         if (reason == TASK_SWITCH_GATE && idt_v) {
3094                 switch (type) {
3095                 case INTR_TYPE_NMI_INTR:
3096                         vcpu->arch.nmi_injected = false;
3097                         if (cpu_has_virtual_nmis())
3098                                 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3099                                               GUEST_INTR_STATE_NMI);
3100                         break;
3101                 case INTR_TYPE_EXT_INTR:
3102                 case INTR_TYPE_SOFT_INTR:
3103                         kvm_clear_interrupt_queue(vcpu);
3104                         break;
3105                 case INTR_TYPE_HARD_EXCEPTION:
3106                 case INTR_TYPE_SOFT_EXCEPTION:
3107                         kvm_clear_exception_queue(vcpu);
3108                         break;
3109                 default:
3110                         break;
3111                 }
3112         }
3113         tss_selector = exit_qualification;
3114
3115         if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
3116                        type != INTR_TYPE_EXT_INTR &&
3117                        type != INTR_TYPE_NMI_INTR))
3118                 skip_emulated_instruction(vcpu);
3119
3120         if (!kvm_task_switch(vcpu, tss_selector, reason))
3121                 return 0;
3122
3123         /* clear all local breakpoint enable flags (L0-L3, DR7 bits 0/2/4/6) */
3124         vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
3125
3126         /*
3127          * TODO: What about debug traps on tss switch?
3128          *       Are we supposed to inject them and update dr6?
3129          */
3130
3131         return 1;
3132 }
3133
3134 static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3135 {
3136         unsigned long exit_qualification;
3137         gpa_t gpa;
3138         int gla_validity;
3139
3140         exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3141
3142         if (exit_qualification & (1 << 6)) {
3143                 printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
3144                 return -ENOTSUPP;
3145         }
3146
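        /*
         * Bits 8:7 of the exit qualification describe the guest linear
         * address: per the SDM, bit 7 tells whether the GLA field is valid
         * and bit 8 whether the access was to the final translation rather
         * than to a guest paging-structure entry.  Combinations other than
         * those accepted below are unexpected and are punted to userspace as
         * KVM_EXIT_UNKNOWN.
         */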
3147         gla_validity = (exit_qualification >> 7) & 0x3;
3148         if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
3149                 printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
3150                 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
3151                         (unsigned long)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
3152                         vmcs_readl(GUEST_LINEAR_ADDRESS));
3153                 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3154                         (unsigned long)exit_qualification);
3155                 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
3156                 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
3157                 return 0;
3158         }
3159
3160         gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3161         return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
3162 }
3163
3164 static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3165 {
3166         u32 cpu_based_vm_exec_control;
3167
3168         /* clear pending NMI */
3169         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3170         cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
3171         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3172         ++vcpu->stat.nmi_window_exits;
3173
3174         return 1;
3175 }
3176
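/*
 * Emulate the guest instruction by instruction for as long as its state is
 * invalid for VMX (e.g. real-mode segment state that cannot be represented in
 * the VMCS).  Interrupts and preemption are temporarily re-enabled because
 * emulation may sleep; an MMIO request or an emulation failure breaks the
 * loop, and the outcome is reported through invalid_state_emulation_result.
 */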
3177 static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3178                                 struct kvm_run *kvm_run)
3179 {
3180         struct vcpu_vmx *vmx = to_vmx(vcpu);
3181         enum emulation_result err = EMULATE_DONE;
3182
3183         local_irq_enable();
3184         preempt_enable();
3185
3186         while (!guest_state_valid(vcpu)) {
3187                 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
3188
3189                 if (err == EMULATE_DO_MMIO)
3190                         break;
3191
3192                 if (err != EMULATE_DONE) {
3193                         kvm_report_emulation_failure(vcpu, "emulation failure");
3194                         break;
3195                 }
3196
3197                 if (signal_pending(current))
3198                         break;
3199                 if (need_resched())
3200                         schedule();
3201         }
3202
3203         preempt_disable();
3204         local_irq_disable();
3205
3206         vmx->invalid_state_emulation_result = err;
3207 }
3208
3209 /*
3210  * The exit handlers return 1 if the exit was handled fully and guest execution
3211  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
3212  * to be done to userspace and return 0.
3213  */
3214 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3215                                       struct kvm_run *kvm_run) = {
3216         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
3217         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
3218         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
3219         [EXIT_REASON_NMI_WINDOW]              = handle_nmi_window,
3220         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
3221         [EXIT_REASON_CR_ACCESS]               = handle_cr,
3222         [EXIT_REASON_DR_ACCESS]               = handle_dr,
3223         [EXIT_REASON_CPUID]                   = handle_cpuid,
3224         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
3225         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
3226         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
3227         [EXIT_REASON_HLT]                     = handle_halt,
3228         [EXIT_REASON_INVLPG]                  = handle_invlpg,
3229         [EXIT_REASON_VMCALL]                  = handle_vmcall,
3230         [EXIT_REASON_VMCLEAR]                 = handle_vmx_insn,
3231         [EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
3232         [EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
3233         [EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
3234         [EXIT_REASON_VMREAD]                  = handle_vmx_insn,
3235         [EXIT_REASON_VMRESUME]                = handle_vmx_insn,
3236         [EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
3237         [EXIT_REASON_VMOFF]                   = handle_vmx_insn,
3238         [EXIT_REASON_VMON]                    = handle_vmx_insn,
3239         [EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
3240         [EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
3241         [EXIT_REASON_WBINVD]                  = handle_wbinvd,
3242         [EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
3243         [EXIT_REASON_EPT_VIOLATION]           = handle_ept_violation,
3244         [EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
3245 };
3246
3247 static const int kvm_vmx_max_exit_handlers =
3248         ARRAY_SIZE(kvm_vmx_exit_handlers);
3249
3250 /*
3251  * The guest has exited.  See if we can fix it or if we need userspace
3252  * assistance.
3253  */
3254 static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3255 {
3256         struct vcpu_vmx *vmx = to_vmx(vcpu);
3257         u32 exit_reason = vmx->exit_reason;
3258         u32 vectoring_info = vmx->idt_vectoring_info;
3259
3260         KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
3261                     (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
3262
3263         /* If we need to emulate an MMIO from handle_invalid_guest_state,
3264          * we just return 0. */
3265         if (vmx->emulation_required && emulate_invalid_guest_state) {
3266                 if (guest_state_valid(vcpu))
3267                         vmx->emulation_required = 0;
3268                 return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
3269         }
3270
3271         /* CR3 accesses don't cause a VM exit in paging mode, so we need
3272          * to sync with the guest's real CR3. */
3273         if (enable_ept && is_paging(vcpu))
3274                 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3275
3276         if (unlikely(vmx->fail)) {
3277                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3278                 kvm_run->fail_entry.hardware_entry_failure_reason
3279                         = vmcs_read32(VM_INSTRUCTION_ERROR);
3280                 return 0;
3281         }
3282
3283         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
3284                         (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
3285                         exit_reason != EXIT_REASON_EPT_VIOLATION &&
3286                         exit_reason != EXIT_REASON_TASK_SWITCH))
3287                 printk(KERN_WARNING "%s: unexpected, valid vectoring info "
3288                        "(0x%x) and exit reason is 0x%x\n",
3289                        __func__, vectoring_info, exit_reason);
3290
3291         if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
3292                 if (vmx_interrupt_allowed(vcpu)) {
3293                         vmx->soft_vnmi_blocked = 0;
3294                 } else if (vmx->vnmi_blocked_time > 1000000000LL &&
3295                            vcpu->arch.nmi_pending) {
3296                         /*
3297                          * This CPU doesn't help us find the end of an
3298                          * NMI-blocked window if the guest runs with IRQs
3299                          * disabled. So we pull the trigger after 1 s of
3300                          * futile waiting, but inform the user about this.
3301                          */
3302                         printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
3303                                "state on VCPU %d after 1 s timeout\n",
3304                                __func__, vcpu->vcpu_id);
3305                         vmx->soft_vnmi_blocked = 0;
3306                 }
3307         }
3308
3309         if (exit_reason < kvm_vmx_max_exit_handlers
3310             && kvm_vmx_exit_handlers[exit_reason])
3311                 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
3312         else {
3313                 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
3314                 kvm_run->hw.hardware_exit_reason = exit_reason;
3315         }
3316         return 0;
3317 }
3318
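/*
 * Program the TPR threshold from the highest-priority pending interrupt (irr)
 * so that the guest exits with TPR_BELOW_THRESHOLD once it lowers its TPR far
 * enough for that interrupt to become deliverable.  With no pending interrupt
 * (irr == -1), or one that is already deliverable (tpr < irr), the threshold
 * is left at zero.
 */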
3319 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3320 {
3321         if (irr == -1 || tpr < irr) {
3322                 vmcs_write32(TPR_THRESHOLD, 0);
3323                 return;
3324         }
3325
3326         vmcs_write32(TPR_THRESHOLD, irr);
3327 }
3328
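/*
 * Runs on the exit path with interrupts still disabled: latch the exit
 * reason, hand machine checks and NMIs to the host right away, update the
 * virtual-NMI blocking state, and re-queue any event (NMI, exception or
 * interrupt) whose delivery was interrupted by the exit, based on
 * IDT_VECTORING_INFO_FIELD, so that it is injected again on the next entry.
 */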
3329 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3330 {
3331         u32 exit_intr_info;
3332         u32 idt_vectoring_info = vmx->idt_vectoring_info;
3333         bool unblock_nmi;
3334         u8 vector;
3335         int type;
3336         bool idtv_info_valid;
3337
3338         exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3339
3340         vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
3341
3342         /* Handle machine checks before interrupts are enabled */
3343         if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
3344             || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
3345                 && is_machine_check(exit_intr_info)))
3346                 kvm_machine_check();
3347
3348         /* We need to handle NMIs before interrupts are enabled */
3349         if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
3350             (exit_intr_info & INTR_INFO_VALID_MASK)) {
3351                 KVMTRACE_0D(NMI, &vmx->vcpu, handler);
3352                 asm("int $2");
3353         }
3354
3355         idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3356
3357         if (cpu_has_virtual_nmis()) {
3358                 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3359                 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3360                 /*
3361                  * SDM 3: 27.7.1.2 (September 2008)
3362                  * Re-set bit "block by NMI" before VM entry if the vmexit
3363                  * was caused by a guest IRET fault.
3364                  * SDM 3: 23.2.2 (September 2008)
3365                  * Bit 12 is undefined in any of the following cases:
3366                  *  If the VM exit sets the valid bit in the IDT-vectoring
3367                  *   information field.
3368                  *  If the VM exit is due to a double fault.
3369                  */
3370                 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
3371                     vector != DF_VECTOR && !idtv_info_valid)
3372                         vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3373                                       GUEST_INTR_STATE_NMI);
3374         } else if (unlikely(vmx->soft_vnmi_blocked))
3375                 vmx->vnmi_blocked_time +=
3376                         ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
3377
3378         vmx->vcpu.arch.nmi_injected = false;
3379         kvm_clear_exception_queue(&vmx->vcpu);
3380         kvm_clear_interrupt_queue(&vmx->vcpu);
3381
3382         if (!idtv_info_valid)
3383                 return;
3384
3385         vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3386         type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3387
3388         switch (type) {
3389         case INTR_TYPE_NMI_INTR:
3390                 vmx->vcpu.arch.nmi_injected = true;
3391                 /*
3392                  * SDM 3: 27.7.1.2 (September 2008)
3393                  * Clear bit "block by NMI" before VM entry if an NMI
3394                  * delivery faulted.
3395                  */
3396                 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3397                                 GUEST_INTR_STATE_NMI);
3398                 break;
3399         case INTR_TYPE_SOFT_EXCEPTION:
3400                 vmx->vcpu.arch.event_exit_inst_len =
3401                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3402                 /* fall through */
3403         case INTR_TYPE_HARD_EXCEPTION:
3404                 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3405                         u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
3406                         kvm_queue_exception_e(&vmx->vcpu, vector, err);
3407                 } else
3408                         kvm_queue_exception(&vmx->vcpu, vector);
3409                 break;
3410         case INTR_TYPE_SOFT_INTR:
3411                 vmx->vcpu.arch.event_exit_inst_len =
3412                         vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3413                 /* fall through */
3414         case INTR_TYPE_EXT_INTR:
3415                 kvm_queue_interrupt(&vmx->vcpu, vector,
3416                         type == INTR_TYPE_SOFT_INTR);
3417                 break;
3418         default:
3419                 break;
3420         }
3421 }
3422
3423 /*
3424  * Failure to inject an interrupt should give us the information
3425  * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
3426  * when fetching the interrupt redirection bitmap in the real-mode
3427  * tss, this doesn't happen.  So we do it ourselves.
3428  */
3429 static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3430 {
3431         vmx->rmode.irq.pending = 0;
3432         if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
3433                 return;
3434         kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
3435         if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
3436                 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
3437                 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
3438                 return;
3439         }
3440         vmx->idt_vectoring_info =
3441                 VECTORING_INFO_VALID_MASK
3442                 | INTR_TYPE_EXT_INTR
3443                 | vmx->rmode.irq.vector;
3444 }
3445
3446 #ifdef CONFIG_X86_64
3447 #define R "r"
3448 #define Q "q"
3449 #else
3450 #define R "e"
3451 #define Q "l"
3452 #endif
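
/*
 * R and Q select the register-name prefix ("r"ax vs. "e"ax) and the
 * operand-size suffix ("q" vs. "l"), so that the inline assembly below
 * assembles correctly on both 64-bit and 32-bit kernels.
 */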
3453
3454 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3455 {
3456         struct vcpu_vmx *vmx = to_vmx(vcpu);
3457
3458         if (enable_ept && is_paging(vcpu)) {
3459                 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3460                 ept_load_pdptrs(vcpu);
3461         }
3462         /* Record the guest's net vcpu time for enforced NMI injections. */
3463         if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3464                 vmx->entry_time = ktime_get();
3465
3466         /* Handle invalid guest state instead of entering VMX */
3467         if (vmx->emulation_required && emulate_invalid_guest_state) {
3468                 handle_invalid_guest_state(vcpu, kvm_run);
3469                 return;
3470         }
3471
3472         if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3473                 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
3474         if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
3475                 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
3476
3477         /* When single-stepping over STI and MOV SS, we must clear the
3478          * corresponding interruptibility bits in the guest state. Otherwise
3479          * vmentry fails, as it then expects bit 14 (BS) to be set in the
3480          * pending debug exceptions field, which is not correct for the guest
3481          * debugging case. */
3482         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3483                 vmx_set_interrupt_shadow(vcpu, 0);
3484
3485         /*
3486          * Loading guest fpu may have cleared host cr0.ts
3487          */
3488         vmcs_writel(HOST_CR0, read_cr0());
3489
3490         set_debugreg(vcpu->arch.dr6, 6);
3491
3492         asm(
3493                 /* Store host registers */
3494                 "push %%"R"dx; push %%"R"bp;"
3495                 "push %%"R"cx \n\t"
3496                 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
3497                 "je 1f \n\t"
3498                 "mov %%"R"sp, %c[host_rsp](%0) \n\t"
3499                 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
3500                 "1: \n\t"
3501                 /* Check if vmlaunch or vmresume is needed */
3502                 "cmpl $0, %c[launched](%0) \n\t"
3503                 /* Load guest registers.  Don't clobber flags. */
3504                 "mov %c[cr2](%0), %%"R"ax \n\t"
3505                 "mov %%"R"ax, %%cr2 \n\t"
3506                 "mov %c[rax](%0), %%"R"ax \n\t"
3507                 "mov %c[rbx](%0), %%"R"bx \n\t"
3508                 "mov %c[rdx](%0), %%"R"dx \n\t"
3509                 "mov %c[rsi](%0), %%"R"si \n\t"
3510                 "mov %c[rdi](%0), %%"R"di \n\t"
3511                 "mov %c[rbp](%0), %%"R"bp \n\t"
3512 #ifdef CONFIG_X86_64
3513                 "mov %c[r8](%0),  %%r8  \n\t"
3514                 "mov %c[r9](%0),  %%r9  \n\t"
3515                 "mov %c[r10](%0), %%r10 \n\t"
3516                 "mov %c[r11](%0), %%r11 \n\t"
3517                 "mov %c[r12](%0), %%r12 \n\t"
3518                 "mov %c[r13](%0), %%r13 \n\t"
3519                 "mov %c[r14](%0), %%r14 \n\t"
3520                 "mov %c[r15](%0), %%r15 \n\t"
3521 #endif
3522                 "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
3523
3524                 /* Enter guest mode */
3525                 "jne .Llaunched \n\t"
3526                 __ex(ASM_VMX_VMLAUNCH) "\n\t"
3527                 "jmp .Lkvm_vmx_return \n\t"
3528                 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
3529                 ".Lkvm_vmx_return: "
3530                 /* Save guest registers, load host registers, keep flags */
3531                 "xchg %0,     (%%"R"sp) \n\t"
3532                 "mov %%"R"ax, %c[rax](%0) \n\t"
3533                 "mov %%"R"bx, %c[rbx](%0) \n\t"
3534                 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
3535                 "mov %%"R"dx, %c[rdx](%0) \n\t"
3536                 "mov %%"R"si, %c[rsi](%0) \n\t"
3537                 "mov %%"R"di, %c[rdi](%0) \n\t"
3538                 "mov %%"R"bp, %c[rbp](%0) \n\t"
3539 #ifdef CONFIG_X86_64
3540                 "mov %%r8,  %c[r8](%0) \n\t"
3541                 "mov %%r9,  %c[r9](%0) \n\t"
3542                 "mov %%r10, %c[r10](%0) \n\t"
3543                 "mov %%r11, %c[r11](%0) \n\t"
3544                 "mov %%r12, %c[r12](%0) \n\t"
3545                 "mov %%r13, %c[r13](%0) \n\t"
3546                 "mov %%r14, %c[r14](%0) \n\t"
3547                 "mov %%r15, %c[r15](%0) \n\t"
3548 #endif
3549                 "mov %%cr2, %%"R"ax   \n\t"
3550                 "mov %%"R"ax, %c[cr2](%0) \n\t"
3551
3552                 "pop  %%"R"bp; pop  %%"R"bp; pop  %%"R"dx \n\t"
3553                 "setbe %c[fail](%0) \n\t"
3554               : : "c"(vmx), "d"((unsigned long)HOST_RSP),
3555                 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
3556                 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
3557                 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
3558                 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
3559                 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
3560                 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
3561                 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
3562                 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
3563                 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
3564                 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
3565 #ifdef CONFIG_X86_64
3566                 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
3567                 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
3568                 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
3569                 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
3570                 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
3571                 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
3572                 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
3573                 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
3574 #endif
3575                 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
3576               : "cc", "memory"
3577                 , R"bx", R"di", R"si"
3578 #ifdef CONFIG_X86_64
3579                 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
3580 #endif
3581               );
3582
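        /*
         * RSP, RIP and the PDPTRs now live in the VMCS/hardware state, so
         * mark them unavailable; they are read back lazily through
         * vmx_cache_reg() only when actually needed.
         */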
3583         vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
3584                                   | (1 << VCPU_EXREG_PDPTR));
3585         vcpu->arch.regs_dirty = 0;
3586
3587         get_debugreg(vcpu->arch.dr6, 6);
3588
3589         vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3590         if (vmx->rmode.irq.pending)
3591                 fixup_rmode_irq(vmx);
3592
3593         asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
3594         vmx->launched = 1;
3595
3596         vmx_complete_interrupts(vmx);
3597 }
3598
3599 #undef R
3600 #undef Q
3601
3602 static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
3603 {
3604         struct vcpu_vmx *vmx = to_vmx(vcpu);
3605
3606         if (vmx->vmcs) {
3607                 vcpu_clear(vmx);
3608                 free_vmcs(vmx->vmcs);
3609                 vmx->vmcs = NULL;
3610         }
3611 }
3612
3613 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3614 {
3615         struct vcpu_vmx *vmx = to_vmx(vcpu);
3616
3617         spin_lock(&vmx_vpid_lock);
3618         if (vmx->vpid != 0)
3619                 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3620         spin_unlock(&vmx_vpid_lock);
3621         vmx_free_vmcs(vcpu);
3622         kfree(vmx->host_msrs);
3623         kfree(vmx->guest_msrs);
3624         kvm_vcpu_uninit(vcpu);
3625         kmem_cache_free(kvm_vcpu_cache, vmx);
3626 }
3627
3628 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3629 {
3630         int err;
3631         struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
3632         int cpu;
3633
3634         if (!vmx)
3635                 return ERR_PTR(-ENOMEM);
3636
3637         allocate_vpid(vmx);
3638
3639         err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
3640         if (err)
3641                 goto free_vcpu;
3642
3643         vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3644         if (!vmx->guest_msrs) {
3645                 err = -ENOMEM;
3646                 goto uninit_vcpu;
3647         }
3648
3649         vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3650         if (!vmx->host_msrs)
3651                 goto free_guest_msrs;
3652
3653         vmx->vmcs = alloc_vmcs();
3654         if (!vmx->vmcs)
3655                 goto free_msrs;
3656
3657         vmcs_clear(vmx->vmcs);
3658
3659         cpu = get_cpu();
3660         vmx_vcpu_load(&vmx->vcpu, cpu);
3661         err = vmx_vcpu_setup(vmx);
3662         vmx_vcpu_put(&vmx->vcpu);
3663         put_cpu();
3664         if (err)
3665                 goto free_vmcs;
3666         if (vm_need_virtualize_apic_accesses(kvm))
3667                 if (alloc_apic_access_page(kvm) != 0)
3668                         goto free_vmcs;
3669
3670         if (enable_ept)
3671                 if (alloc_identity_pagetable(kvm) != 0)
3672                         goto free_vmcs;
3673
3674         return &vmx->vcpu;
3675
3676 free_vmcs:
3677         free_vmcs(vmx->vmcs);
3678 free_msrs:
3679         kfree(vmx->host_msrs);
3680 free_guest_msrs:
3681         kfree(vmx->guest_msrs);
3682 uninit_vcpu:
3683         kvm_vcpu_uninit(&vmx->vcpu);
3684 free_vcpu:
3685         kmem_cache_free(kvm_vcpu_cache, vmx);
3686         return ERR_PTR(err);
3687 }
3688
3689 static void __init vmx_check_processor_compat(void *rtn)
3690 {
3691         struct vmcs_config vmcs_conf;
3692
3693         *(int *)rtn = 0;
3694         if (setup_vmcs_config(&vmcs_conf) < 0)
3695                 *(int *)rtn = -EIO;
3696         if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
3697                 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
3698                                 smp_processor_id());
3699                 *(int *)rtn = -EIO;
3700         }
3701 }
3702
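/*
 * VMX_EPT_DEFAULT_GAW is the EPT page-walk length minus one, as programmed
 * into the EPTP, so GAW + 1 gives the number of paging levels (four here)
 * that the TDP MMU should build.
 */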
3703 static int get_ept_level(void)
3704 {
3705         return VMX_EPT_DEFAULT_GAW + 1;
3706 }
3707
3708 static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3709 {
3710         u64 ret;
3711
3712         /* For the VT-d and EPT combination:
3713          * 1. MMIO: always map as UC.
3714          * 2. EPT with VT-d:
3715          *   a. VT-d without the snooping control feature: we can't guarantee
3716          *      the result, so try to trust the guest.
3717          *   b. VT-d with the snooping control feature: snooping control
3718          *      guarantees cache correctness, so just map as WB to stay
3719          *      consistent with the host. Same as item 3.
3720          * 3. EPT without VT-d: always map as WB and set IGMT=1 to stay
3721          *    consistent with the host MTRRs.
3722          */
3723         if (is_mmio)
3724                 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
3725         else if (vcpu->kvm->arch.iommu_domain &&
3726                 !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
3727                 ret = kvm_get_guest_memory_type(vcpu, gfn) <<
3728                       VMX_EPT_MT_EPTE_SHIFT;
3729         else
3730                 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
3731                         | VMX_EPT_IGMT_BIT;
3732
3733         return ret;
3734 }
3735
3736 static struct kvm_x86_ops vmx_x86_ops = {
3737         .cpu_has_kvm_support = cpu_has_kvm_support,
3738         .disabled_by_bios = vmx_disabled_by_bios,
3739         .hardware_setup = hardware_setup,
3740         .hardware_unsetup = hardware_unsetup,
3741         .check_processor_compatibility = vmx_check_processor_compat,
3742         .hardware_enable = hardware_enable,
3743         .hardware_disable = hardware_disable,
3744         .cpu_has_accelerated_tpr = report_flexpriority,
3745
3746         .vcpu_create = vmx_create_vcpu,
3747         .vcpu_free = vmx_free_vcpu,
3748         .vcpu_reset = vmx_vcpu_reset,
3749
3750         .prepare_guest_switch = vmx_save_host_state,
3751         .vcpu_load = vmx_vcpu_load,
3752         .vcpu_put = vmx_vcpu_put,
3753
3754         .set_guest_debug = set_guest_debug,
3755         .get_msr = vmx_get_msr,
3756         .set_msr = vmx_set_msr,
3757         .get_segment_base = vmx_get_segment_base,
3758         .get_segment = vmx_get_segment,
3759         .set_segment = vmx_set_segment,
3760         .get_cpl = vmx_get_cpl,
3761         .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
3762         .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
3763         .set_cr0 = vmx_set_cr0,
3764         .set_cr3 = vmx_set_cr3,
3765         .set_cr4 = vmx_set_cr4,
3766         .set_efer = vmx_set_efer,
3767         .get_idt = vmx_get_idt,
3768         .set_idt = vmx_set_idt,
3769         .get_gdt = vmx_get_gdt,
3770         .set_gdt = vmx_set_gdt,
3771         .cache_reg = vmx_cache_reg,
3772         .get_rflags = vmx_get_rflags,
3773         .set_rflags = vmx_set_rflags,
3774
3775         .tlb_flush = vmx_flush_tlb,
3776
3777         .run = vmx_vcpu_run,
3778         .handle_exit = vmx_handle_exit,
3779         .skip_emulated_instruction = skip_emulated_instruction,
3780         .set_interrupt_shadow = vmx_set_interrupt_shadow,
3781         .get_interrupt_shadow = vmx_get_interrupt_shadow,
3782         .patch_hypercall = vmx_patch_hypercall,
3783         .set_irq = vmx_inject_irq,
3784         .set_nmi = vmx_inject_nmi,
3785         .queue_exception = vmx_queue_exception,
3786         .interrupt_allowed = vmx_interrupt_allowed,
3787         .nmi_allowed = vmx_nmi_allowed,
3788         .enable_nmi_window = enable_nmi_window,
3789         .enable_irq_window = enable_irq_window,
3790         .update_cr8_intercept = update_cr8_intercept,
3791
3792         .set_tss_addr = vmx_set_tss_addr,
3793         .get_tdp_level = get_ept_level,
3794         .get_mt_mask = vmx_get_mt_mask,
3795 };
3796
3797 static int __init vmx_init(void)
3798 {
3799         int r;
3800
3801         vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3802         if (!vmx_io_bitmap_a)
3803                 return -ENOMEM;
3804
3805         vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
3806         if (!vmx_io_bitmap_b) {
3807                 r = -ENOMEM;
3808                 goto out;
3809         }
3810
3811         vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
3812         if (!vmx_msr_bitmap_legacy) {
3813                 r = -ENOMEM;
3814                 goto out1;
3815         }
3816
3817         vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
3818         if (!vmx_msr_bitmap_longmode) {
3819                 r = -ENOMEM;
3820                 goto out2;
3821         }
3822
3823         /*
3824          * Allow direct access to the PC debug port (it is often used for I/O
3825          * delays, but the vmexits simply slow things down).
3826          */
3827         memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
3828         clear_bit(0x80, vmx_io_bitmap_a);
3829
3830         memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
3831
3832         memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
3833         memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
3834
3835         set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
3836
3837         r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
3838         if (r)
3839                 goto out3;
3840
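        /*
         * These MSRs are accessed constantly by guests but are already
         * context-switched or shadowed elsewhere, so clear their MSR-bitmap
         * intercepts to avoid pointless exits.  The boolean argument selects
         * long-mode-only treatment, hence true for MSR_KERNEL_GS_BASE.
         */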
3841         vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
3842         vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
3843         vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
3844         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
3845         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
3846         vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
3847
3848         if (enable_ept) {
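                /*
                 * With EPT, guest page faults are handled entirely inside the
                 * guest and never reach KVM, so the bypass_guest_pf
                 * shadow-paging optimization has nothing to optimize and is
                 * disabled here.
                 */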
3849                 bypass_guest_pf = 0;
3850                 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3851                         VMX_EPT_WRITABLE_MASK);
3852                 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3853                                 VMX_EPT_EXECUTABLE_MASK);
3854                 kvm_enable_tdp();
3855         } else
3856                 kvm_disable_tdp();
3857
3858         if (bypass_guest_pf)
3859                 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
3860
3861         ept_sync_global();
3862
3863         return 0;
3864
3865 out3:
3866         free_page((unsigned long)vmx_msr_bitmap_longmode);
3867 out2:
3868         free_page((unsigned long)vmx_msr_bitmap_legacy);
3869 out1:
3870         free_page((unsigned long)vmx_io_bitmap_b);
3871 out:
3872         free_page((unsigned long)vmx_io_bitmap_a);
3873         return r;
3874 }
3875
3876 static void __exit vmx_exit(void)
3877 {
3878         free_page((unsigned long)vmx_msr_bitmap_legacy);
3879         free_page((unsigned long)vmx_msr_bitmap_longmode);
3880         free_page((unsigned long)vmx_io_bitmap_b);
3881         free_page((unsigned long)vmx_io_bitmap_a);
3882
3883         kvm_exit();
3884 }
3885
3886 module_init(vmx_init)
3887 module_exit(vmx_exit)