Pileus Git - ~andy/linux/blob - arch/powerpc/kvm/book3s_64_mmu_hv.c

   1 /*
   2  * This program is free software; you can redistribute it and/or modify
   3  * it under the terms of the GNU General Public License, version 2, as
   4  * published by the Free Software Foundation.
   5  *
   6  * This program is distributed in the hope that it will be useful,
   7  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   8  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   9  * GNU General Public License for more details.
  10  *
  11  * You should have received a copy of the GNU General Public License
  12  * along with this program; if not, write to the Free Software
  13  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  14  *
  15  * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  16  */
  17
  18 #include <linux/types.h>
  19 #include <linux/string.h>
  20 #include <linux/kvm.h>
  21 #include <linux/kvm_host.h>
  22 #include <linux/highmem.h>
  23 #include <linux/gfp.h>
  24 #include <linux/slab.h>
  25 #include <linux/hugetlb.h>
  26 #include <linux/vmalloc.h>
  27
  28 #include <asm/tlbflush.h>
  29 #include <asm/kvm_ppc.h>
  30 #include <asm/kvm_book3s.h>
  31 #include <asm/mmu-hash64.h>
  32 #include <asm/hvcall.h>
  33 #include <asm/synch.h>
  34 #include <asm/ppc-opcode.h>
  35 #include <asm/cputable.h>
  36
  37 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
  38 #define MAX_LPID_970    63
  39 #define NR_LPIDS        (LPID_RSVD + 1)
  40 unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
  41
  42 long kvmppc_alloc_hpt(struct kvm *kvm)
  43 {
  44         unsigned long hpt;
  45         unsigned long lpid;
  46         struct revmap_entry *rev;
  47         struct kvmppc_linear_info *li;
  48
  49         /* Allocate guest's hashed page table */
  50         li = kvm_alloc_hpt();
  51         if (li) {
  52                 /* using preallocated memory */
  53                 hpt = (ulong)li->base_virt;
  54                 kvm->arch.hpt_li = li;
  55         } else {
  56                 /* using dynamic memory */
  57                 hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
  58                                        __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT);
  59         }
  60
  61         if (!hpt) {
  62                 pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n");
  63                 return -ENOMEM;
  64         }
  65         kvm->arch.hpt_virt = hpt;
  66
  67         /* Allocate reverse map array */
  68         rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE);
  69         if (!rev) {
  70                 pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
  71                 goto out_freehpt;
  72         }
  73         kvm->arch.revmap = rev;
  74
  75         /* Allocate the guest's logical partition ID */
  76         do {
  77                 lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
  78                 if (lpid >= NR_LPIDS) {
  79                         pr_err("kvm_alloc_hpt: No LPIDs free\n");
  80                         goto out_freeboth;
  81                 }
  82         } while (test_and_set_bit(lpid, lpid_inuse));
  83
  84         kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
  85         kvm->arch.lpid = lpid;
  86
  87         pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid);
  88         return 0;
  89
  90  out_freeboth:
  91         vfree(rev);
  92  out_freehpt:
  93         free_pages(hpt, HPT_ORDER - PAGE_SHIFT);
  94         return -ENOMEM;
  95 }
  96
  97 void kvmppc_free_hpt(struct kvm *kvm)
  98 {
  99         clear_bit(kvm->arch.lpid, lpid_inuse);
 100         vfree(kvm->arch.revmap);
 101         if (kvm->arch.hpt_li)
 102                 kvm_release_hpt(kvm->arch.hpt_li);
 103         else
 104                 free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT);
 105 }
 106
 107 /* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
 108 static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
 109 {
 110         return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
 111 }
 112
 113 /* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
 114 static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
 115 {
 116         return (pgsize == 0x10000) ? 0x1000 : 0;
 117 }
 118
 119 void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 120                      unsigned long porder)
 121 {
 122         unsigned long i;
 123         unsigned long npages;
 124         unsigned long hp_v, hp_r;
 125         unsigned long addr, hash;
 126         unsigned long psize;
 127         unsigned long hp0, hp1;
 128         long ret;
 129
 130         psize = 1ul << porder;
 131         npages = memslot->npages >> (porder - PAGE_SHIFT);
 132
 133         /* VRMA can't be > 1TB */
 134         if (npages > 1ul << (40 - porder))
 135                 npages = 1ul << (40 - porder);
 136         /* Can't use more than 1 HPTE per HPTEG */
 137         if (npages > HPT_NPTEG)
 138                 npages = HPT_NPTEG;
 139
 140         hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
 141                 HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
 142         hp1 = hpte1_pgsize_encoding(psize) |
 143                 HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
 144
 145         for (i = 0; i < npages; ++i) {
 146                 addr = i << porder;
 147                 /* can't use hpt_hash since va > 64 bits */
 148                 hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK;
 149                 /*
 150                  * We assume that the hash table is empty and no
 151                  * vcpus are using it at this stage.  Since we create
 152                  * at most one HPTE per HPTEG, we just assume entry 7
 153                  * is available and use it.
 154                  */
 155                 hash = (hash << 3) + 7;
 156                 hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
 157                 hp_r = hp1 | addr;
 158                 ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r);
 159                 if (ret != H_SUCCESS) {
 160                         pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
 161                                addr, ret);
 162                         break;
 163                 }
 164         }
 165 }
 166
 167 int kvmppc_mmu_hv_init(void)
 168 {
 169         unsigned long host_lpid, rsvd_lpid;
 170
 171         if (!cpu_has_feature(CPU_FTR_HVMODE))
 172                 return -EINVAL;
 173
 174         memset(lpid_inuse, 0, sizeof(lpid_inuse));
 175
 176         if (cpu_has_feature(CPU_FTR_ARCH_206)) {
 177                 host_lpid = mfspr(SPRN_LPID);   /* POWER7 */
 178                 rsvd_lpid = LPID_RSVD;
 179         } else {
 180                 host_lpid = 0;                  /* PPC970 */
 181                 rsvd_lpid = MAX_LPID_970;
 182         }
 183
 184         set_bit(host_lpid, lpid_inuse);
 185         /* rsvd_lpid is reserved for use in partition switching */
 186         set_bit(rsvd_lpid, lpid_inuse);
 187
 188         return 0;
 189 }
 190
 191 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 192 {
 193 }
 194
 195 static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 196 {
 197         kvmppc_set_msr(vcpu, MSR_SF | MSR_ME);
 198 }
 199
 200 /*
 201  * This is called to get a reference to a guest page if there isn't
 202  * one already in the kvm->arch.slot_phys[][] arrays.
 203  */
 204 static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
 205                                   struct kvm_memory_slot *memslot,
 206                                   unsigned long psize)
 207 {
 208         unsigned long start;
 209         long np, err;
 210         struct page *page, *hpage, *pages[1];
 211         unsigned long s, pgsize;
 212         unsigned long *physp;
 213         unsigned int is_io, got, pgorder;
 214         struct vm_area_struct *vma;
 215         unsigned long pfn, i, npages;
 216
 217         physp = kvm->arch.slot_phys[memslot->id];
 218         if (!physp)
 219                 return -EINVAL;
 220         if (physp[gfn - memslot->base_gfn])
 221                 return 0;
 222
 223         is_io = 0;
 224         got = 0;
 225         page = NULL;
 226         pgsize = psize;
 227         err = -EINVAL;
 228         start = gfn_to_hva_memslot(memslot, gfn);
 229
 230         /* Instantiate and get the page we want access to */
 231         np = get_user_pages_fast(start, 1, 1, pages);
 232         if (np != 1) {
 233                 /* Look up the vma for the page */
 234                 down_read(&current->mm->mmap_sem);
 235                 vma = find_vma(current->mm, start);
 236                 if (!vma || vma->vm_start > start ||
 237                     start + psize > vma->vm_end ||
 238                     !(vma->vm_flags & VM_PFNMAP))
 239                         goto up_err;
 240                 is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
 241                 pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 242                 /* check alignment of pfn vs. requested page size */
 243                 if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1)))
 244                         goto up_err;
 245                 up_read(&current->mm->mmap_sem);
 246
 247         } else {
 248                 page = pages[0];
 249                 got = KVMPPC_GOT_PAGE;
 250
 251                 /* See if this is a large page */
 252                 s = PAGE_SIZE;
 253                 if (PageHuge(page)) {
 254                         hpage = compound_head(page);
 255                         s <<= compound_order(hpage);
 256                         /* Get the whole large page if slot alignment is ok */
 257                         if (s > psize && slot_is_aligned(memslot, s) &&
 258                             !(memslot->userspace_addr & (s - 1))) {
 259                                 start &= ~(s - 1);
 260                                 pgsize = s;
 261                                 page = hpage;
 262                         }
 263                 }
 264                 if (s < psize)
 265                         goto out;
 266                 pfn = page_to_pfn(page);
 267         }
 268
 269         npages = pgsize >> PAGE_SHIFT;
 270         pgorder = __ilog2(npages);
 271         physp += (gfn - memslot->base_gfn) & ~(npages - 1);
 272         spin_lock(&kvm->arch.slot_phys_lock);
 273         for (i = 0; i < npages; ++i) {
 274                 if (!physp[i]) {
 275                         physp[i] = ((pfn + i) << PAGE_SHIFT) +
 276                                 got + is_io + pgorder;
 277                         got = 0;
 278                 }
 279         }
 280         spin_unlock(&kvm->arch.slot_phys_lock);
 281         err = 0;
 282
 283  out:
 284         if (got) {
 285                 if (PageHuge(page))
 286                         page = compound_head(page);
 287                 put_page(page);
 288         }
 289         return err;
 290
 291  up_err:
 292         up_read(&current->mm->mmap_sem);
 293         return err;
 294 }
 295
 296 /*
 297  * We come here on a H_ENTER call from the guest when we are not
 298  * using mmu notifiers and we don't have the requested page pinned
 299  * already.
 300  */
 301 long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 302                         long pte_index, unsigned long pteh, unsigned long ptel)
 303 {
 304         struct kvm *kvm = vcpu->kvm;
 305         unsigned long psize, gpa, gfn;
 306         struct kvm_memory_slot *memslot;
 307         long ret;
 308
 309         if (kvm->arch.using_mmu_notifiers)
 310                 goto do_insert;
 311
 312         psize = hpte_page_size(pteh, ptel);
 313         if (!psize)
 314                 return H_PARAMETER;
 315
 316         pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
 317
 318         /* Find the memslot (if any) for this address */
 319         gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
 320         gfn = gpa >> PAGE_SHIFT;
 321         memslot = gfn_to_memslot(kvm, gfn);
 322         if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) {
 323                 if (!slot_is_aligned(memslot, psize))
 324                         return H_PARAMETER;
 325                 if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0)
 326                         return H_PARAMETER;
 327         }
 328
 329  do_insert:
 330         /* Protect linux PTE lookup from page table destruction */
 331         rcu_read_lock_sched();  /* this disables preemption too */
 332         vcpu->arch.pgdir = current->mm->pgd;
 333         ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel);
 334         rcu_read_unlock_sched();
 335         if (ret == H_TOO_HARD) {
 336                 /* this can't happen */
 337                 pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
 338                 ret = H_RESOURCE;       /* or something */
 339         }
 340         return ret;
 341
 342 }
 343
 344 static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
 345                                                          gva_t eaddr)
 346 {
 347         u64 mask;
 348         int i;
 349
 350         for (i = 0; i < vcpu->arch.slb_nr; i++) {
 351                 if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
 352                         continue;
 353
 354                 if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
 355                         mask = ESID_MASK_1T;
 356                 else
 357                         mask = ESID_MASK;
 358
 359                 if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
 360                         return &vcpu->arch.slb[i];
 361         }
 362         return NULL;
 363 }
 364
 365 static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
 366                         unsigned long ea)
 367 {
 368         unsigned long ra_mask;
 369
 370         ra_mask = hpte_page_size(v, r) - 1;
 371         return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
 372 }
 373
 374 static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 375                         struct kvmppc_pte *gpte, bool data)
 376 {
 377         struct kvm *kvm = vcpu->kvm;
 378         struct kvmppc_slb *slbe;
 379         unsigned long slb_v;
 380         unsigned long pp, key;
 381         unsigned long v, gr;
 382         unsigned long *hptep;
 383         int index;
 384         int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
 385
 386         /* Get SLB entry */
 387         if (virtmode) {
 388                 slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
 389                 if (!slbe)
 390                         return -EINVAL;
 391                 slb_v = slbe->origv;
 392         } else {
 393                 /* real mode access */
 394                 slb_v = vcpu->kvm->arch.vrma_slb_v;
 395         }
 396
 397         /* Find the HPTE in the hash table */
 398         index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
 399                                          HPTE_V_VALID | HPTE_V_ABSENT);
 400         if (index < 0)
 401                 return -ENOENT;
 402         hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
 403         v = hptep[0] & ~HPTE_V_HVLOCK;
 404         gr = kvm->arch.revmap[index].guest_rpte;
 405
 406         /* Unlock the HPTE */
 407         asm volatile("lwsync" : : : "memory");
 408         hptep[0] = v;
 409
 410         gpte->eaddr = eaddr;
 411         gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
 412
 413         /* Get PP bits and key for permission check */
 414         pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
 415         key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
 416         key &= slb_v;
 417
 418         /* Calculate permissions */
 419         gpte->may_read = hpte_read_permission(pp, key);
 420         gpte->may_write = hpte_write_permission(pp, key);
 421         gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));
 422
 423         /* Storage key permission check for POWER7 */
 424         if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) {
 425                 int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
 426                 if (amrfield & 1)
 427                         gpte->may_read = 0;
 428                 if (amrfield & 2)
 429                         gpte->may_write = 0;
 430         }
 431
 432         /* Get the guest physical address */
 433         gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
 434         return 0;
 435 }
 436
 437 /*
 438  * Quick test for whether an instruction is a load or a store.
 439  * If the instruction is a load or a store, then this will indicate
 440  * which it is, at least on server processors.  (Embedded processors
 441  * have some external PID instructions that don't follow the rule
 442  * embodied here.)  If the instruction isn't a load or store, then
 443  * this doesn't return anything useful.
 444  */
 445 static int instruction_is_store(unsigned int instr)
 446 {
 447         unsigned int mask;
 448
 449         mask = 0x10000000;
 450         if ((instr & 0xfc000000) == 0x7c000000)
 451                 mask = 0x100;           /* major opcode 31 */
 452         return (instr & mask) != 0;
 453 }
 454
 455 static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
 456                                   unsigned long gpa, int is_store)
 457 {
 458         int ret;
 459         u32 last_inst;
 460         unsigned long srr0 = kvmppc_get_pc(vcpu);
 461
 462         /* We try to load the last instruction.  We don't let
 463          * emulate_instruction do it as it doesn't check what
 464          * kvmppc_ld returns.
 465          * If we fail, we just return to the guest and try executing it again.
 466          */
 467         if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) {
 468                 ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false);
 469                 if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED)
 470                         return RESUME_GUEST;
 471                 vcpu->arch.last_inst = last_inst;
 472         }
 473
 474         /*
 475          * WARNING: We do not know for sure whether the instruction we just
 476          * read from memory is the same that caused the fault in the first
 477          * place.  If the instruction we read is neither an load or a store,
 478          * then it can't access memory, so we don't need to worry about
 479          * enforcing access permissions.  So, assuming it is a load or
 480          * store, we just check that its direction (load or store) is
 481          * consistent with the original fault, since that's what we
 482          * checked the access permissions against.  If there is a mismatch
 483          * we just return and retry the instruction.
 484          */
 485
 486         if (instruction_is_store(vcpu->arch.last_inst) != !!is_store)
 487                 return RESUME_GUEST;
 488
 489         /*
 490          * Emulated accesses are emulated by looking at the hash for
 491          * translation once, then performing the access later. The
 492          * translation could be invalidated in the meantime in which
 493          * point performing the subsequent memory access on the old
 494          * physical address could possibly be a security hole for the
 495          * guest (but not the host).
 496          *
 497          * This is less of an issue for MMIO stores since they aren't
 498          * globally visible. It could be an issue for MMIO loads to
 499          * a certain extent but we'll ignore it for now.
 500          */
 501
 502         vcpu->arch.paddr_accessed = gpa;
 503         return kvmppc_emulate_mmio(run, vcpu);
 504 }
 505
 506 int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 507                                 unsigned long ea, unsigned long dsisr)
 508 {
 509         struct kvm *kvm = vcpu->kvm;
 510         unsigned long *hptep, hpte[3], r;
 511         unsigned long mmu_seq, psize, pte_size;
 512         unsigned long gfn, hva, pfn;
 513         struct kvm_memory_slot *memslot;
 514         unsigned long *rmap;
 515         struct revmap_entry *rev;
 516         struct page *page, *pages[1];
 517         long index, ret, npages;
 518         unsigned long is_io;
 519         unsigned int writing, write_ok;
 520         struct vm_area_struct *vma;
 521         unsigned long rcbits;
 522
 523         /*
 524          * Real-mode code has already searched the HPT and found the
 525          * entry we're interested in.  Lock the entry and check that
 526          * it hasn't changed.  If it has, just return and re-execute the
 527          * instruction.
 528          */
 529         if (ea != vcpu->arch.pgfault_addr)
 530                 return RESUME_GUEST;
 531         index = vcpu->arch.pgfault_index;
 532         hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
 533         rev = &kvm->arch.revmap[index];
 534         preempt_disable();
 535         while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
 536                 cpu_relax();
 537         hpte[0] = hptep[0] & ~HPTE_V_HVLOCK;
 538         hpte[1] = hptep[1];
 539         hpte[2] = r = rev->guest_rpte;
 540         asm volatile("lwsync" : : : "memory");
 541         hptep[0] = hpte[0];
 542         preempt_enable();
 543
 544         if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
 545             hpte[1] != vcpu->arch.pgfault_hpte[1])
 546                 return RESUME_GUEST;
 547
 548         /* Translate the logical address and get the page */
 549         psize = hpte_page_size(hpte[0], r);
 550         gfn = hpte_rpn(r, psize);
 551         memslot = gfn_to_memslot(kvm, gfn);
 552
 553         /* No memslot means it's an emulated MMIO region */
 554         if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
 555                 unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
 556                 return kvmppc_hv_emulate_mmio(run, vcpu, gpa,
 557                                               dsisr & DSISR_ISSTORE);
 558         }
 559
 560         if (!kvm->arch.using_mmu_notifiers)
 561                 return -EFAULT;         /* should never get here */
 562
 563         /* used to check for invalidations in progress */
 564         mmu_seq = kvm->mmu_notifier_seq;
 565         smp_rmb();
 566
 567         is_io = 0;
 568         pfn = 0;
 569         page = NULL;
 570         pte_size = PAGE_SIZE;
 571         writing = (dsisr & DSISR_ISSTORE) != 0;
 572         /* If writing != 0, then the HPTE must allow writing, if we get here */
 573         write_ok = writing;
 574         hva = gfn_to_hva_memslot(memslot, gfn);
 575         npages = get_user_pages_fast(hva, 1, writing, pages);
 576         if (npages < 1) {
 577                 /* Check if it's an I/O mapping */
 578                 down_read(&current->mm->mmap_sem);
 579                 vma = find_vma(current->mm, hva);
 580                 if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
 581                     (vma->vm_flags & VM_PFNMAP)) {
 582                         pfn = vma->vm_pgoff +
 583                                 ((hva - vma->vm_start) >> PAGE_SHIFT);
 584                         pte_size = psize;
 585                         is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
 586                         write_ok = vma->vm_flags & VM_WRITE;
 587                 }
 588                 up_read(&current->mm->mmap_sem);
 589                 if (!pfn)
 590                         return -EFAULT;
 591         } else {
 592                 page = pages[0];
 593                 if (PageHuge(page)) {
 594                         page = compound_head(page);
 595                         pte_size <<= compound_order(page);
 596                 }
 597                 /* if the guest wants write access, see if that is OK */
 598                 if (!writing && hpte_is_writable(r)) {
 599                         pte_t *ptep, pte;
 600
 601                         /*
 602                          * We need to protect against page table destruction
 603                          * while looking up and updating the pte.
 604                          */
 605                         rcu_read_lock_sched();
 606                         ptep = find_linux_pte_or_hugepte(current->mm->pgd,
 607                                                          hva, NULL);
 608                         if (ptep && pte_present(*ptep)) {
 609                                 pte = kvmppc_read_update_linux_pte(ptep, 1);
 610                                 if (pte_write(pte))
 611                                         write_ok = 1;
 612                         }
 613                         rcu_read_unlock_sched();
 614                 }
 615                 pfn = page_to_pfn(page);
 616         }
 617
 618         ret = -EFAULT;
 619         if (psize > pte_size)
 620                 goto out_put;
 621
 622         /* Check WIMG vs. the actual page we're accessing */
 623         if (!hpte_cache_flags_ok(r, is_io)) {
 624                 if (is_io)
 625                         return -EFAULT;
 626                 /*
 627                  * Allow guest to map emulated device memory as
 628                  * uncacheable, but actually make it cacheable.
 629                  */
 630                 r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
 631         }
 632
 633         /* Set the HPTE to point to pfn */
 634         r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT);
 635         if (hpte_is_writable(r) && !write_ok)
 636                 r = hpte_make_readonly(r);
 637         ret = RESUME_GUEST;
 638         preempt_disable();
 639         while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
 640                 cpu_relax();
 641         if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] ||
 642             rev->guest_rpte != hpte[2])
 643                 /* HPTE has been changed under us; let the guest retry */
 644                 goto out_unlock;
 645         hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
 646
 647         rmap = &memslot->rmap[gfn - memslot->base_gfn];
 648         lock_rmap(rmap);
 649
 650         /* Check if we might have been invalidated; let the guest retry if so */
 651         ret = RESUME_GUEST;
 652         if (mmu_notifier_retry(vcpu, mmu_seq)) {
 653                 unlock_rmap(rmap);
 654                 goto out_unlock;
 655         }
 656
 657         /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
 658         rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
 659         r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
 660
 661         if (hptep[0] & HPTE_V_VALID) {
 662                 /* HPTE was previously valid, so we need to invalidate it */
 663                 unlock_rmap(rmap);
 664                 hptep[0] |= HPTE_V_ABSENT;
 665                 kvmppc_invalidate_hpte(kvm, hptep, index);
 666                 /* don't lose previous R and C bits */
 667                 r |= hptep[1] & (HPTE_R_R | HPTE_R_C);
 668         } else {
 669                 kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
 670         }
 671
 672         hptep[1] = r;
 673         eieio();
 674         hptep[0] = hpte[0];
 675         asm volatile("ptesync" : : : "memory");
 676         preempt_enable();
 677         if (page && hpte_is_writable(r))
 678                 SetPageDirty(page);
 679
 680  out_put:
 681         if (page)
 682                 put_page(page);
 683         return ret;
 684
 685  out_unlock:
 686         hptep[0] &= ~HPTE_V_HVLOCK;
 687         preempt_enable();
 688         goto out_put;
 689 }
 690
 691 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 692                           int (*handler)(struct kvm *kvm, unsigned long *rmapp,
 693                                          unsigned long gfn))
 694 {
 695         int ret;
 696         int retval = 0;
 697         struct kvm_memslots *slots;
 698         struct kvm_memory_slot *memslot;
 699
 700         slots = kvm_memslots(kvm);
 701         kvm_for_each_memslot(memslot, slots) {
 702                 unsigned long start = memslot->userspace_addr;
 703                 unsigned long end;
 704
 705                 end = start + (memslot->npages << PAGE_SHIFT);
 706                 if (hva >= start && hva < end) {
 707                         gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
 708
 709                         ret = handler(kvm, &memslot->rmap[gfn_offset],
 710                                       memslot->base_gfn + gfn_offset);
 711                         retval |= ret;
 712                 }
 713         }
 714
 715         return retval;
 716 }
 717
 718 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 719                            unsigned long gfn)
 720 {
 721         struct revmap_entry *rev = kvm->arch.revmap;
 722         unsigned long h, i, j;
 723         unsigned long *hptep;
 724         unsigned long ptel, psize, rcbits;
 725
 726         for (;;) {
 727                 lock_rmap(rmapp);
 728                 if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
 729                         unlock_rmap(rmapp);
 730                         break;
 731                 }
 732
 733                 /*
 734                  * To avoid an ABBA deadlock with the HPTE lock bit,
 735                  * we can't spin on the HPTE lock while holding the
 736                  * rmap chain lock.
 737                  */
 738                 i = *rmapp & KVMPPC_RMAP_INDEX;
 739                 hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
 740                 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
 741                         /* unlock rmap before spinning on the HPTE lock */
 742                         unlock_rmap(rmapp);
 743                         while (hptep[0] & HPTE_V_HVLOCK)
 744                                 cpu_relax();
 745                         continue;
 746                 }
 747                 j = rev[i].forw;
 748                 if (j == i) {
 749                         /* chain is now empty */
 750                         *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
 751                 } else {
 752                         /* remove i from chain */
 753                         h = rev[i].back;
 754                         rev[h].forw = j;
 755                         rev[j].back = h;
 756                         rev[i].forw = rev[i].back = i;
 757                         *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
 758                 }
 759
 760                 /* Now check and modify the HPTE */
 761                 ptel = rev[i].guest_rpte;
 762                 psize = hpte_page_size(hptep[0], ptel);
 763                 if ((hptep[0] & HPTE_V_VALID) &&
 764                     hpte_rpn(ptel, psize) == gfn) {
 765                         hptep[0] |= HPTE_V_ABSENT;
 766                         kvmppc_invalidate_hpte(kvm, hptep, i);
 767                         /* Harvest R and C */
 768                         rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
 769                         *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
 770                         rev[i].guest_rpte = ptel | rcbits;
 771                 }
 772                 unlock_rmap(rmapp);
 773                 hptep[0] &= ~HPTE_V_HVLOCK;
 774         }
 775         return 0;
 776 }
 777
 778 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
 779 {
             /*
              * Apply kvm_unmap_rmapp to @hva via kvm_handle_hva(), but only when
              * this VM has arch.using_mmu_notifiers set; otherwise do nothing.
              * Whatever kvm_handle_hva() returns is discarded: this function
              * always returns 0.
              */
 780         if (kvm->arch.using_mmu_notifiers)
 781                 kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
 782         return 0;
 783 }
 784
 785 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 786                          unsigned long gfn)
 787 {
             /*
              * Test-and-clear the "referenced" state for one rmap entry.
              *
              * Clears KVMPPC_RMAP_REFERENCED in *rmapp and, for every HPTE on
              * the chain headed by *rmapp that is valid and has HPTE_R_R set,
              * calls kvmppc_clear_ref_hpte() on it and records HPTE_R_R in the
              * matching rev[] entry's guest_rpte.
              *
              * @kvm:   VM whose revmap and HPT are walked.
              * @rmapp: rmap word for the page; locked with lock_rmap() while
              *         the chain is walked.
              * @gfn:   not used by this function.
              *
              * Returns 1 if any referenced state was found (and cleared),
              * 0 otherwise.
              */
 788         struct revmap_entry *rev = kvm->arch.revmap;
 789         unsigned long head, i, j;
 790         unsigned long *hptep;
 791         int ret = 0;
 792
             /*
              * ret is initialised above the label, so a result already found
              * is kept when the walk restarts from here.
              */
 793  retry:
 794         lock_rmap(rmapp);
 795         if (*rmapp & KVMPPC_RMAP_REFERENCED) {
 796                 *rmapp &= ~KVMPPC_RMAP_REFERENCED;
 797                 ret = 1;
 798         }
             /* No chain attached to this rmap entry: nothing more to scan */
 799         if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
 800                 unlock_rmap(rmapp);
 801                 return ret;
 802         }
 803
             /*
              * The chain is circular: start at the index stored in *rmapp and
              * follow rev[].forw until we are back at the head.
              */
 804         i = head = *rmapp & KVMPPC_RMAP_INDEX;
 805         do {
                     /* HPT entries are 16 bytes apart, hence i << 4 */
 806                 hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
                     /* fetch the successor first; the loop condition uses it */
 807                 j = rev[i].forw;
 808
 809                 /* If this HPTE isn't referenced, ignore it */
                     /*
                      * This is an unlocked peek at the R bit.  Note that
                      * 'continue' in a do/while jumps to the loop condition,
                      * so i still advances to j.
                      */
 810                 if (!(hptep[1] & HPTE_R_R))
 811                         continue;
 812
 813                 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
 814                         /* unlock rmap before spinning on the HPTE lock */
 815                         unlock_rmap(rmapp);
 816                         while (hptep[0] & HPTE_V_HVLOCK)
 817                                 cpu_relax();
                             /* the rmap lock was dropped, so restart the walk */
 818                         goto retry;
 819                 }
 820
 821                 /* Now check and modify the HPTE */
                     /* R is re-tested here because the first test was unlocked */
 822                 if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) {
 823                         kvmppc_clear_ref_hpte(kvm, hptep, i);
 824                         rev[i].guest_rpte |= HPTE_R_R;
 825                         ret = 1;
 826                 }
                     /* clear HPTE_V_HVLOCK again to release this HPTE */
 827                 hptep[0] &= ~HPTE_V_HVLOCK;
 828         } while ((i = j) != head);
 829
 830         unlock_rmap(rmapp);
 831         return ret;
 832 }
 833
 834 int kvm_age_hva(struct kvm *kvm, unsigned long hva)
 835 {
             /*
              * Run kvm_age_rmapp on @hva via kvm_handle_hva() and return its
              * result.  Returns 0 without doing anything when this VM does not
              * have arch.using_mmu_notifiers set.
              */
 836         if (!kvm->arch.using_mmu_notifiers)
 837                 return 0;
 838         return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
 839 }
 840
 841 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 842                               unsigned long gfn)
 843 {
             /*
              * Report whether one rmap entry is marked referenced, without
              * clearing anything.
              *
              * @kvm:   VM whose revmap and HPT are inspected.
              * @rmapp: rmap word for the page.
              * @gfn:   not used by this function.
              *
              * Returns 1 if KVMPPC_RMAP_REFERENCED is set in *rmapp or any HPTE
              * on the chain has HPTE_R_R set, 0 otherwise.  The HPTEs are only
              * read here; no HPTE lock is taken.
              */
 844         struct revmap_entry *rev = kvm->arch.revmap;
 845         unsigned long head, i, j;
 846         unsigned long *hp;
             /* default to "referenced" so every 'goto out' reports 1 */
 847         int ret = 1;
 848
             /* fast path: answer without taking the rmap lock */
 849         if (*rmapp & KVMPPC_RMAP_REFERENCED)
 850                 return 1;
 851
 852         lock_rmap(rmapp);
             /* re-check now that the rmap lock is held */
 853         if (*rmapp & KVMPPC_RMAP_REFERENCED)
 854                 goto out;
 855
 856         if (*rmapp & KVMPPC_RMAP_PRESENT) {
                     /* walk the circular chain once, starting at the head index */
 857                 i = head = *rmapp & KVMPPC_RMAP_INDEX;
 858                 do {
                             /* HPT entries are 16 bytes apart, hence i << 4 */
 859                         hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
 860                         j = rev[i].forw;
 861                         if (hp[1] & HPTE_R_R)
 862                                 goto out;
 863                 } while ((i = j) != head);
 864         }
             /* nothing on the chain (or no chain at all) was referenced */
 865         ret = 0;
 866
 867  out:
 868         unlock_rmap(rmapp);
 869         return ret;
 870 }
 871
 872 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
 873 {
             /*
              * Run kvm_test_age_rmapp on @hva via kvm_handle_hva() and return
              * its result.  Returns 0 without doing anything when this VM does
              * not have arch.using_mmu_notifiers set.
              */
 874         if (!kvm->arch.using_mmu_notifiers)
 875                 return 0;
 876         return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
 877 }
 878
 879 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 880 {
             /*
              * Handle a PTE change for @hva by applying kvm_unmap_rmapp to it
              * via kvm_handle_hva().  @pte is not used: the new PTE value is
              * never installed here, the existing mappings are only removed.
              * Does nothing when arch.using_mmu_notifiers is not set.
              */
 881         if (!kvm->arch.using_mmu_notifiers)
 882                 return;
 883         kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
 884 }
 885
 886 static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
 887 {
             /*
              * Test-and-clear the "changed" (dirty) state for one rmap entry.
              *
              * Clears KVMPPC_RMAP_CHANGED in *rmapp and, for every HPTE on the
              * chain that is valid and has HPTE_R_C set, clears C in the HPTE
              * and records HPTE_R_C in the matching rev[] entry's guest_rpte.
              *
              * @kvm:   VM whose revmap and HPT are walked.
              * @rmapp: rmap word for the page; locked with lock_rmap() while
              *         the chain is walked.
              *
              * Returns 1 if any changed state was found (and cleared),
              * 0 otherwise.
              */
 888         struct revmap_entry *rev = kvm->arch.revmap;
 889         unsigned long head, i, j;
 890         unsigned long *hptep;
 891         int ret = 0;
 892
             /*
              * ret is initialised above the label, so a result already found
              * is kept when the walk restarts from here.
              */
 893  retry:
 894         lock_rmap(rmapp);
 895         if (*rmapp & KVMPPC_RMAP_CHANGED) {
 896                 *rmapp &= ~KVMPPC_RMAP_CHANGED;
 897                 ret = 1;
 898         }
             /* No chain attached to this rmap entry: nothing more to scan */
 899         if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
 900                 unlock_rmap(rmapp);
 901                 return ret;
 902         }
 903
             /* walk the circular chain once, starting at the head index */
 904         i = head = *rmapp & KVMPPC_RMAP_INDEX;
 905         do {
                     /* HPT entries are 16 bytes apart, hence i << 4 */
 906                 hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
 907                 j = rev[i].forw;
 908
                     /*
                      * Unlocked peek at the C bit; 'continue' in a do/while
                      * jumps to the loop condition, so i still advances to j.
                      */
 909                 if (!(hptep[1] & HPTE_R_C))
 910                         continue;
 911
 912                 if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
 913                         /* unlock rmap before spinning on the HPTE lock */
 914                         unlock_rmap(rmapp);
 915                         while (hptep[0] & HPTE_V_HVLOCK)
 916                                 cpu_relax();
                             /* the rmap lock was dropped, so restart the walk */
 917                         goto retry;
 918                 }
 919
 920                 /* Now check and modify the HPTE */
                     /* C is re-tested here because the first test was unlocked */
 921                 if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
 922                         /* need to make it temporarily absent to clear C */
 923                         hptep[0] |= HPTE_V_ABSENT;
 924                         kvmppc_invalidate_hpte(kvm, hptep, i);
 925                         hptep[1] &= ~HPTE_R_C;
                             /*
                              * NOTE(review): the barrier sits between the
                              * hptep[1] update and the hptep[0] update;
                              * presumably it orders those two stores - confirm.
                              */
 926                         eieio();
                             /* drop ABSENT and mark the entry valid again */
 927                         hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
 928                         rev[i].guest_rpte |= HPTE_R_C;
 929                         ret = 1;
 930                 }
                     /* clear HPTE_V_HVLOCK again to release this HPTE */
 931                 hptep[0] &= ~HPTE_V_HVLOCK;
 932         } while ((i = j) != head);
 933
 934         unlock_rmap(rmapp);
 935         return ret;
 936 }
 937
 938 long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
 939 {
             /*
              * Harvest dirty state for every page of @memslot.
              *
              * For page i (0 <= i < memslot->npages) the i-th word of
              * memslot->rmap is passed to kvm_test_clear_dirty(); when that
              * returns non-zero, bit i is set in memslot->dirty_bitmap with
              * __set_bit_le().  Bits are only ever set here, never cleared
              * (presumably the caller supplies a zeroed bitmap - confirm).
              *
              * The whole scan runs with preemption disabled.
              * Always returns 0.
              */
 940         unsigned long i;
 941         unsigned long *rmapp, *map;
 942
 943         preempt_disable();
 944         rmapp = memslot->rmap;
 945         map = memslot->dirty_bitmap;
 946         for (i = 0; i < memslot->npages; ++i) {
 947                 if (kvm_test_clear_dirty(kvm, rmapp))
 948                         __set_bit_le(i, map);
                     /* one rmap word per page: step to the next page's entry */
 949                 ++rmapp;
 950         }
 951         preempt_enable();
 952         return 0;
 953 }
 954
 955 void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
 956                             unsigned long *nb_ret)
 957 {
             /*
              * Look up the host page backing guest physical address @gpa and
              * return a kernel virtual address for that byte.
              *
              * @kvm:    VM that owns the address.
              * @gpa:    guest physical address to resolve.
              * @nb_ret: optional; when non-NULL it receives the number of bytes
              *          from the returned address to the end of the page (the
              *          whole huge page when PageHuge() is true).
              *
              * Returns NULL when no usable memslot covers @gpa or the backing
              * page cannot be obtained.  A successful return is meant to be
              * undone with kvmppc_unpin_guest_page().
              */
 958         struct kvm_memory_slot *memslot;
 959         unsigned long gfn = gpa >> PAGE_SHIFT;
 960         struct page *page, *pages[1];
 961         int npages;
 962         unsigned long hva, psize, offset;
 963         unsigned long pa;
 964         unsigned long *physp;
 965
 966         memslot = gfn_to_memslot(kvm, gfn);
 967         if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
 968                 return NULL;
 969         if (!kvm->arch.using_mmu_notifiers) {
                     /*
                      * Without MMU notifiers the physical address comes from
                      * the per-slot slot_phys[] array, indexed by the page's
                      * offset within the memslot.
                      */
 970                 physp = kvm->arch.slot_phys[memslot->id];
 971                 if (!physp)
 972                         return NULL;
 973                 physp += gfn - memslot->base_gfn;
 974                 pa = *physp;
 975                 if (!pa) {
                             /*
                              * No entry yet: ask kvmppc_get_guest_page() for
                              * the page, then re-read the slot_phys[] entry.
                              */
 976                         if (kvmppc_get_guest_page(kvm, gfn, memslot,
 977                                                   PAGE_SIZE) < 0)
 978                                 return NULL;
 979                         pa = *physp;
 980                 }
 981                 page = pfn_to_page(pa >> PAGE_SHIFT);
 982         } else {
                     /*
                      * With MMU notifiers, resolve through the host virtual
                      * address and request exactly one page.
                      * NOTE(review): the third argument (1) is presumably the
                      * "write" flag of this kernel's get_user_pages_fast() -
                      * confirm against the API in use.
                      */
 983                 hva = gfn_to_hva_memslot(memslot, gfn);
 984                 npages = get_user_pages_fast(hva, 1, 1, pages);
 985                 if (npages < 1)
 986                         return NULL;
 987                 page = pages[0];
 988         }
 989         psize = PAGE_SIZE;
 990         if (PageHuge(page)) {
                     /* work from the head page and widen psize to the huge page */
 991                 page = compound_head(page);
 992                 psize <<= compound_order(page);
 993         }
             /*
              * Only the non-notifier path takes a reference here; the notifier
              * path presumably already holds one from get_user_pages_fast().
              * NOTE(review): for a huge page that earlier reference was taken
              * on pages[0], while kvmppc_unpin_guest_page() puts the compound
              * head - confirm the counts balance.
              */
 994         if (!kvm->arch.using_mmu_notifiers)
 995                 get_page(page);
 996         offset = gpa & (psize - 1);
 997         if (nb_ret)
 998                 *nb_ret = psize - offset;
 999         return page_address(page) + offset;
1000 }
1001
1002 void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
1003 {
             /*
              * Drop one page reference for the page containing kernel virtual
              * address @va: the address is converted back to its struct page,
              * moved to the compound head, and put_page() is called on that.
              * @kvm is not used.
              */
1004         struct page *page = virt_to_page(va);
1005
1006         page = compound_head(page);
1007         put_page(page);
1008 }
1009
1010 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
1011 {
             /*
              * Set up the per-vcpu MMU state for Book3S HV:
              *  - pick the SLB size from the CPU feature bits,
              *  - install the HV xlate and reset_msr callbacks in vcpu->arch.mmu,
              *  - set BOOK3S_HFLAG_SLB in vcpu->arch.hflags.
              */
1012         struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
1013
             /* 32 SLB entries when CPU_FTR_ARCH_206 is present, 64 otherwise */
1014         if (cpu_has_feature(CPU_FTR_ARCH_206))
1015                 vcpu->arch.slb_nr = 32;         /* POWER7 */
1016         else
1017                 vcpu->arch.slb_nr = 64;
1018
1019         mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
1020         mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
1021
1022         vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
1023 }