Pileus Git - ~andy/linux/blob - drivers/vfio/vfio_iommu_type1.c

   1 /*
   2  * VFIO: IOMMU DMA mapping support for Type1 IOMMU
   3  *
   4  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   5  *     Author: Alex Williamson <alex.williamson@redhat.com>
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License version 2 as
   9  * published by the Free Software Foundation.
  10  *
  11  * Derived from original vfio:
  12  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  13  * Author: Tom Lyon, pugs@cisco.com
  14  *
  15  * We arbitrarily define a Type1 IOMMU as one matching the below code.
  16  * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
  17  * VT-d, but that makes it harder to re-use as theoretically anyone
  18  * implementing a similar IOMMU could make use of this.  We expect the
  19  * IOMMU to support the IOMMU API and have few to no restrictions around
  20  * the IOVA range that can be mapped.  The Type1 IOMMU is currently
  21  * optimized for relatively static mappings of a userspace process with
   22  * userspace pages pinned into memory.  We also assume devices and IOMMU
  23  * domains are PCI based as the IOMMU API is still centered around a
  24  * device/bus interface rather than a group interface.
  25  */
  26
  27 #include <linux/compat.h>
  28 #include <linux/device.h>
  29 #include <linux/fs.h>
  30 #include <linux/iommu.h>
  31 #include <linux/module.h>
  32 #include <linux/mm.h>
  33 #include <linux/pci.h>          /* pci_bus_type */
  34 #include <linux/rbtree.h>
  35 #include <linux/sched.h>
  36 #include <linux/slab.h>
  37 #include <linux/uaccess.h>
  38 #include <linux/vfio.h>
  39 #include <linux/workqueue.h>
  40
/* Module metadata strings; passed to the MODULE_* macros at the end of the file. */
#define DRIVER_VERSION  "0.2"
#define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
  44
/*
 * Module parameter.  When set, vfio_iommu_type1_open() skips its
 * IOMMU_CAP_INTR_REMAP check, so a container can be opened on a domain
 * that does not report interrupt remapping.
 */
static bool allow_unsafe_interrupts;
module_param_named(allow_unsafe_interrupts,
                   allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(allow_unsafe_interrupts,
                 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
  50
/*
 * Module parameter.  When set, vfio_pin_pages() accounts and returns a
 * single page per call instead of gathering a run of physically
 * contiguous pages.
 */
static bool disable_hugepages;
module_param_named(disable_hugepages,
                   disable_hugepages, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(disable_hugepages,
                 "Disable VFIO IOMMU support for IOMMU hugepages.");
  56
/*
 * State for one Type1 IOMMU instance; allocated in
 * vfio_iommu_type1_open() and freed in vfio_iommu_type1_release().
 */
struct vfio_iommu {
        struct iommu_domain     *domain;        /* Domain every group is attached to */
        struct mutex            lock;           /* Held by map/unmap/attach/detach */
        struct rb_root          dma_list;       /* Tree of struct vfio_dma, keyed by iova */
        struct list_head        group_list;     /* List of attached struct vfio_group */
        bool                    cache;          /* If set, maps add IOMMU_CACHE to prot */
};
  64
/*
 * One tracked mapping: the IOVA range [iova, iova + size) backed by the
 * process virtual range starting at vaddr.  Linked into
 * vfio_iommu.dma_list through node.
 */
struct vfio_dma {
        struct rb_node          node;
        dma_addr_t              iova;           /* Device address */
        unsigned long           vaddr;          /* Process virtual addr */
        size_t                  size;           /* Map size (bytes) */
        int                     prot;           /* IOMMU_READ/WRITE */
};
  72
/* Record of one iommu_group attached to a vfio_iommu. */
struct vfio_group {
        struct iommu_group      *iommu_group;   /* The attached group */
        struct list_head        next;           /* Link in vfio_iommu.group_list */
};
  77
  78 /*
  79  * This code handles mapping and unmapping of user data buffers
  80  * into DMA'ble space using the IOMMU
  81  */
  82
  83 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
  84                                       dma_addr_t start, size_t size)
  85 {
  86         struct rb_node *node = iommu->dma_list.rb_node;
  87
  88         while (node) {
  89                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
  90
  91                 if (start + size <= dma->iova)
  92                         node = node->rb_left;
  93                 else if (start >= dma->iova + dma->size)
  94                         node = node->rb_right;
  95                 else
  96                         return dma;
  97         }
  98
  99         return NULL;
 100 }
 101
 102 static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 103 {
 104         struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
 105         struct vfio_dma *dma;
 106
 107         while (*link) {
 108                 parent = *link;
 109                 dma = rb_entry(parent, struct vfio_dma, node);
 110
 111                 if (new->iova + new->size <= dma->iova)
 112                         link = &(*link)->rb_left;
 113                 else
 114                         link = &(*link)->rb_right;
 115         }
 116
 117         rb_link_node(&new->node, parent, link);
 118         rb_insert_color(&new->node, &iommu->dma_list);
 119 }
 120
 121 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 122 {
 123         rb_erase(&old->node, &iommu->dma_list);
 124 }
 125
/*
 * Deferred locked_vm adjustment, queued by vfio_lock_acct() when mmap_sem
 * could not be taken immediately and applied by vfio_lock_acct_bg().
 */
struct vwork {
        struct mm_struct        *mm;    /* Obtained with get_task_mm(); put by the worker */
        long                    npage;  /* Signed page delta added to mm->locked_vm */
        struct work_struct      work;
};
 131
 132 /* delayed decrement/increment for locked_vm */
 133 static void vfio_lock_acct_bg(struct work_struct *work)
 134 {
 135         struct vwork *vwork = container_of(work, struct vwork, work);
 136         struct mm_struct *mm;
 137
 138         mm = vwork->mm;
 139         down_write(&mm->mmap_sem);
 140         mm->locked_vm += vwork->npage;
 141         up_write(&mm->mmap_sem);
 142         mmput(mm);
 143         kfree(vwork);
 144 }
 145
 146 static void vfio_lock_acct(long npage)
 147 {
 148         struct vwork *vwork;
 149         struct mm_struct *mm;
 150
 151         if (!current->mm || !npage)
 152                 return; /* process exited or nothing to do */
 153
 154         if (down_write_trylock(&current->mm->mmap_sem)) {
 155                 current->mm->locked_vm += npage;
 156                 up_write(&current->mm->mmap_sem);
 157                 return;
 158         }
 159
 160         /*
 161          * Couldn't get mmap_sem lock, so must setup to update
 162          * mm->locked_vm later. If locked_vm were atomic, we
 163          * wouldn't need this silliness
 164          */
 165         vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
 166         if (!vwork)
 167                 return;
 168         mm = get_task_mm(current);
 169         if (!mm) {
 170                 kfree(vwork);
 171                 return;
 172         }
 173         INIT_WORK(&vwork->work, vfio_lock_acct_bg);
 174         vwork->mm = mm;
 175         vwork->npage = npage;
 176         schedule_work(&vwork->work);
 177 }
 178
 179 /*
 180  * Some mappings aren't backed by a struct page, for example an mmap'd
 181  * MMIO range for our own or another device.  These use a different
 182  * pfn conversion and shouldn't be tracked as locked pages.
 183  */
 184 static bool is_invalid_reserved_pfn(unsigned long pfn)
 185 {
 186         if (pfn_valid(pfn)) {
 187                 bool reserved;
 188                 struct page *tail = pfn_to_page(pfn);
 189                 struct page *head = compound_trans_head(tail);
 190                 reserved = !!(PageReserved(head));
 191                 if (head != tail) {
 192                         /*
 193                          * "head" is not a dangling pointer
 194                          * (compound_trans_head takes care of that)
 195                          * but the hugepage may have been split
 196                          * from under us (and we may not hold a
 197                          * reference count on the head page so it can
 198                          * be reused before we run PageReferenced), so
 199                          * we've to check PageTail before returning
 200                          * what we just read.
 201                          */
 202                         smp_rmb();
 203                         if (PageTail(tail))
 204                                 return reserved;
 205                 }
 206                 return PageReserved(tail);
 207         }
 208
 209         return true;
 210 }
 211
 212 static int put_pfn(unsigned long pfn, int prot)
 213 {
 214         if (!is_invalid_reserved_pfn(pfn)) {
 215                 struct page *page = pfn_to_page(pfn);
 216                 if (prot & IOMMU_WRITE)
 217                         SetPageDirty(page);
 218                 put_page(page);
 219                 return 1;
 220         }
 221         return 0;
 222 }
 223
 224 static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
 225 {
 226         struct page *page[1];
 227         struct vm_area_struct *vma;
 228         int ret = -EFAULT;
 229
 230         if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) {
 231                 *pfn = page_to_pfn(page[0]);
 232                 return 0;
 233         }
 234
 235         down_read(&current->mm->mmap_sem);
 236
 237         vma = find_vma_intersection(current->mm, vaddr, vaddr + 1);
 238
 239         if (vma && vma->vm_flags & VM_PFNMAP) {
 240                 *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 241                 if (is_invalid_reserved_pfn(*pfn))
 242                         ret = 0;
 243         }
 244
 245         up_read(&current->mm->mmap_sem);
 246
 247         return ret;
 248 }
 249
 250 /*
 251  * Attempt to pin pages.  We really don't want to track all the pfns and
 252  * the iommu can only map chunks of consecutive pfns anyway, so get the
 253  * first page and all consecutive pages with the same locking.
 254  */
 255 static long vfio_pin_pages(unsigned long vaddr, long npage,
 256                            int prot, unsigned long *pfn_base)
 257 {
 258         unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 259         bool lock_cap = capable(CAP_IPC_LOCK);
 260         long ret, i;
 261
 262         if (!current->mm)
 263                 return -ENODEV;
 264
 265         ret = vaddr_get_pfn(vaddr, prot, pfn_base);
 266         if (ret)
 267                 return ret;
 268
 269         if (is_invalid_reserved_pfn(*pfn_base))
 270                 return 1;
 271
 272         if (!lock_cap && current->mm->locked_vm + 1 > limit) {
 273                 put_pfn(*pfn_base, prot);
 274                 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 275                         limit << PAGE_SHIFT);
 276                 return -ENOMEM;
 277         }
 278
 279         if (unlikely(disable_hugepages)) {
 280                 vfio_lock_acct(1);
 281                 return 1;
 282         }
 283
 284         /* Lock all the consecutive pages from pfn_base */
 285         for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
 286                 unsigned long pfn = 0;
 287
 288                 ret = vaddr_get_pfn(vaddr, prot, &pfn);
 289                 if (ret)
 290                         break;
 291
 292                 if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
 293                         put_pfn(pfn, prot);
 294                         break;
 295                 }
 296
 297                 if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
 298                         put_pfn(pfn, prot);
 299                         pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 300                                 __func__, limit << PAGE_SHIFT);
 301                         break;
 302                 }
 303         }
 304
 305         vfio_lock_acct(i);
 306
 307         return i;
 308 }
 309
 310 static long vfio_unpin_pages(unsigned long pfn, long npage,
 311                              int prot, bool do_accounting)
 312 {
 313         unsigned long unlocked = 0;
 314         long i;
 315
 316         for (i = 0; i < npage; i++)
 317                 unlocked += put_pfn(pfn++, prot);
 318
 319         if (do_accounting)
 320                 vfio_lock_acct(-unlocked);
 321
 322         return unlocked;
 323 }
 324
 325 static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 326                             dma_addr_t iova, size_t *size)
 327 {
 328         dma_addr_t start = iova, end = iova + *size;
 329         long unlocked = 0;
 330
 331         while (iova < end) {
 332                 size_t unmapped;
 333                 phys_addr_t phys;
 334
 335                 /*
 336                  * We use the IOMMU to track the physical address.  This
 337                  * saves us from having a lot more entries in our mapping
 338                  * tree.  The downside is that we don't track the size
 339                  * used to do the mapping.  We request unmap of a single
 340                  * page, but expect IOMMUs that support large pages to
 341                  * unmap a larger chunk.
 342                  */
 343                 phys = iommu_iova_to_phys(iommu->domain, iova);
 344                 if (WARN_ON(!phys)) {
 345                         iova += PAGE_SIZE;
 346                         continue;
 347                 }
 348
 349                 unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
 350                 if (!unmapped)
 351                         break;
 352
 353                 unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
 354                                              unmapped >> PAGE_SHIFT,
 355                                              dma->prot, false);
 356                 iova += unmapped;
 357         }
 358
 359         vfio_lock_acct(-unlocked);
 360
 361         *size = iova - start;
 362
 363         return 0;
 364 }
 365
/*
 * Remove the part of @dma that overlaps the IOVA range
 * [start, start + *size), unmapping and unpinning it, and fix up the
 * tracking tree accordingly.
 *
 * Four cases are handled in order: the request covers the whole entry,
 * it overlaps the entry's low end, it overlaps the entry's high end, or
 * it falls strictly inside the entry (which is then split in two).
 *
 * On return *size holds the number of bytes removed from this entry.
 * Returns 0 on success, -ENOMEM if the split allocation fails, or
 * whatever vfio_unmap_unpin() returned if that was non-zero.
 */
static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
                                   size_t *size, struct vfio_dma *dma)
{
        size_t offset, overlap, tmp;
        struct vfio_dma *split;
        int ret;

        if (!*size)
                return 0;

        /*
         * Existing dma region is completely covered, unmap all.  This is
         * the likely case since userspace tends to map and unmap buffers
         * in one shot rather than multiple mappings within a buffer.
         */
        if (likely(start <= dma->iova &&
                   start + *size >= dma->iova + dma->size)) {
                *size = dma->size;
                ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
                if (ret)
                        return ret;

                /*
                 * Did we remove more than we have?  Should never happen
                 * since a vfio_dma is contiguous in iova and vaddr.
                 */
                WARN_ON(*size != dma->size);

                vfio_remove_dma(iommu, dma);
                kfree(dma);
                return 0;
        }

        /* Overlap low address of existing range */
        if (start <= dma->iova) {
                overlap = start + *size - dma->iova;
                ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
                if (ret)
                        return ret;

                /* The entry's iova is about to change, so unlink it first */
                vfio_remove_dma(iommu, dma);

                /*
                 * Check, we may have removed the whole vfio_dma.  If not
                 * fixup and re-insert.
                 */
                if (overlap < dma->size) {
                        dma->iova += overlap;
                        dma->vaddr += overlap;
                        dma->size -= overlap;
                        vfio_insert_dma(iommu, dma);
                } else
                        kfree(dma);

                *size = overlap;
                return 0;
        }

        /* Overlap high address of existing range */
        if (start + *size >= dma->iova + dma->size) {
                offset = start - dma->iova;
                overlap = dma->size - offset;

                ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
                if (ret)
                        return ret;

                /* iova is unchanged, so the entry can shrink in place */
                dma->size -= overlap;
                *size = overlap;
                return 0;
        }

        /* Split existing */

        /*
         * Allocate our tracking structure early even though it may not
         * be used.  An Allocation failure later loses track of pages and
         * is more difficult to unwind.
         */
        split = kzalloc(sizeof(*split), GFP_KERNEL);
        if (!split)
                return -ENOMEM;

        offset = start - dma->iova;

        ret = vfio_unmap_unpin(iommu, dma, start, size);
        if (ret || !*size) {
                kfree(split);
                return ret;
        }

        /* Remember the original size; dma->size is overwritten next */
        tmp = dma->size;

        /* Resize the lower vfio_dma in place, before the below insert */
        dma->size = offset;

        /* Insert new for remainder, assuming it didn't all get unmapped */
        if (likely(offset + *size < tmp)) {
                split->size = tmp - offset - *size;
                split->iova = dma->iova + offset + *size;
                split->vaddr = dma->vaddr + offset + *size;
                split->prot = dma->prot;
                vfio_insert_dma(iommu, split);
        } else
                kfree(split);

        return 0;
}
 474
 475 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
 476                              struct vfio_iommu_type1_dma_unmap *unmap)
 477 {
 478         uint64_t mask;
 479         struct vfio_dma *dma;
 480         size_t unmapped = 0, size;
 481         int ret = 0;
 482
 483         mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
 484
 485         if (unmap->iova & mask)
 486                 return -EINVAL;
 487         if (!unmap->size || unmap->size & mask)
 488                 return -EINVAL;
 489
 490         WARN_ON(mask & PAGE_MASK);
 491
 492         mutex_lock(&iommu->lock);
 493
 494         while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
 495                 size = unmap->size;
 496                 ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
 497                 if (ret || !size)
 498                         break;
 499                 unmapped += size;
 500         }
 501
 502         mutex_unlock(&iommu->lock);
 503
 504         /*
 505          * We may unmap more than requested, update the unmap struct so
 506          * userspace can know.
 507          */
 508         unmap->size = unmapped;
 509
 510         return ret;
 511 }
 512
 513 /*
 514  * Turns out AMD IOMMU has a page table bug where it won't map large pages
 515  * to a region that previously mapped smaller pages.  This should be fixed
 516  * soon, so this is just a temporary workaround to break mappings down into
 517  * PAGE_SIZE.  Better to map smaller pages than nothing.
 518  */
 519 static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
 520                           unsigned long pfn, long npage, int prot)
 521 {
 522         long i;
 523         int ret;
 524
 525         for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
 526                 ret = iommu_map(iommu->domain, iova,
 527                                 (phys_addr_t)pfn << PAGE_SHIFT,
 528                                 PAGE_SIZE, prot);
 529                 if (ret)
 530                         break;
 531         }
 532
 533         for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
 534                 iommu_unmap(iommu->domain, iova, PAGE_SIZE);
 535
 536         return ret;
 537 }
 538
/*
 * Handle a map request: pin the user buffer described by @map and map it
 * into the IOMMU domain at map->iova.
 *
 * The request must carry at least one of the READ/WRITE flags, and vaddr,
 * iova and size must be aligned to the smallest page size the IOMMU
 * supports.  A range that overlaps an already tracked mapping is refused
 * with -EEXIST.  The range is then processed in runs of physically
 * contiguous pages; each run is mapped and merged into an abutting
 * vfio_dma entry when prot and vaddr line up, or tracked in a new entry.
 *
 * On failure everything tracked within [map->iova, map->iova + map->size)
 * is removed again before returning.
 *
 * Returns 0 on success or a negative errno.
 */
static int vfio_dma_do_map(struct vfio_iommu *iommu,
                           struct vfio_iommu_type1_dma_map *map)
{
        dma_addr_t end, iova;
        unsigned long vaddr = map->vaddr;
        size_t size = map->size;
        long npage;
        int ret = 0, prot = 0;
        uint64_t mask;
        struct vfio_dma *dma = NULL;
        unsigned long pfn;

        end = map->iova + map->size;

        /* Low bits that must be clear for the smallest supported page size */
        mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;

        /* READ/WRITE from device perspective */
        if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
                prot |= IOMMU_WRITE;
        if (map->flags & VFIO_DMA_MAP_FLAG_READ)
                prot |= IOMMU_READ;

        if (!prot)
                return -EINVAL; /* No READ/WRITE? */

        if (iommu->cache)
                prot |= IOMMU_CACHE;

        if (vaddr & mask)
                return -EINVAL;
        if (map->iova & mask)
                return -EINVAL;
        if (!map->size || map->size & mask)
                return -EINVAL;

        WARN_ON(mask & PAGE_MASK);

        /*
         * Don't allow IOVA wrap
         *
         * NOTE(review): end == 0 (a range ending exactly at the top of
         * the address space) passes this check, but the mapping loop
         * below tests iova < end and so would not run at all; such a
         * request appears to return 0 without mapping anything - confirm.
         */
        if (end && end < map->iova)
                return -EINVAL;

        /* Don't allow virtual address wrap */
        if (vaddr + map->size && vaddr + map->size < vaddr)
                return -EINVAL;

        mutex_lock(&iommu->lock);

        if (vfio_find_dma(iommu, map->iova, map->size)) {
                mutex_unlock(&iommu->lock);
                return -EEXIST;
        }

        /*
         * Each pass handles one run of contiguous pages.  size is set in
         * the body, so the increment advances past whatever was mapped
         * (or, after a merge, past the end of the merged entry).
         */
        for (iova = map->iova; iova < end; iova += size, vaddr += size) {
                long i;

                /* Pin a contiguous chunk of memory */
                npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
                                       prot, &pfn);
                if (npage <= 0) {
                        WARN_ON(!npage);
                        ret = (int)npage;
                        goto out;
                }

                /* Verify pages are not already mapped */
                for (i = 0; i < npage; i++) {
                        if (iommu_iova_to_phys(iommu->domain,
                                               iova + (i << PAGE_SHIFT))) {
                                ret = -EBUSY;
                                goto out_unpin;
                        }
                }

                ret = iommu_map(iommu->domain, iova,
                                (phys_addr_t)pfn << PAGE_SHIFT,
                                npage << PAGE_SHIFT, prot);
                if (ret) {
                        /* Only -EBUSY is retried, one page at a time */
                        if (ret != -EBUSY ||
                            map_try_harder(iommu, iova, pfn, npage, prot)) {
                                goto out_unpin;
                        }
                }

                size = npage << PAGE_SHIFT;

                /*
                 * Check if we abut a region below - nothing below 0.
                 * This is the most likely case when mapping chunks of
                 * physically contiguous regions within a virtual address
                 * range.  Update the abutting entry in place since iova
                 * doesn't change.
                 */
                if (likely(iova)) {
                        struct vfio_dma *tmp;
                        tmp = vfio_find_dma(iommu, iova - 1, 1);
                        if (tmp && tmp->prot == prot &&
                            tmp->vaddr + tmp->size == vaddr) {
                                tmp->size += size;
                                /* Continue with the merged entry's extent */
                                iova = tmp->iova;
                                size = tmp->size;
                                vaddr = tmp->vaddr;
                                dma = tmp;
                        }
                }

                /*
                 * Check if we abut a region above - nothing above ~0 + 1.
                 * If we abut above and below, remove and free.  If only
                 * abut above, remove, modify, reinsert.
                 */
                if (likely(iova + size)) {
                        struct vfio_dma *tmp;
                        tmp = vfio_find_dma(iommu, iova + size, 1);
                        if (tmp && tmp->prot == prot &&
                            tmp->vaddr == vaddr + size) {
                                vfio_remove_dma(iommu, tmp);
                                if (dma) {
                                        dma->size += tmp->size;
                                        kfree(tmp);
                                } else {
                                        size += tmp->size;
                                        tmp->size = size;
                                        tmp->iova = iova;
                                        tmp->vaddr = vaddr;
                                        vfio_insert_dma(iommu, tmp);
                                        dma = tmp;
                                }
                        }
                }

                /* No neighbour absorbed this run: track it in a new entry */
                if (!dma) {
                        dma = kzalloc(sizeof(*dma), GFP_KERNEL);
                        if (!dma) {
                                iommu_unmap(iommu->domain, iova, size);
                                ret = -ENOMEM;
                                goto out_unpin;
                        }

                        dma->size = size;
                        dma->iova = iova;
                        dma->vaddr = vaddr;
                        dma->prot = prot;
                        vfio_insert_dma(iommu, dma);
                }
        }

        WARN_ON(ret);
        mutex_unlock(&iommu->lock);
        return ret;

out_unpin:
        /* Drop the pins of the run that failed to get mapped or tracked */
        vfio_unpin_pages(pfn, npage, prot, true);

out:
        /* Undo whatever earlier passes tracked within the requested range */
        iova = map->iova;
        size = map->size;
        while ((dma = vfio_find_dma(iommu, iova, size))) {
                int r = vfio_remove_dma_overlap(iommu, iova,
                                                &size, dma);
                if (WARN_ON(r || !size))
                        break;
        }

        mutex_unlock(&iommu->lock);
        return ret;
}
 705
 706 static int vfio_iommu_type1_attach_group(void *iommu_data,
 707                                          struct iommu_group *iommu_group)
 708 {
 709         struct vfio_iommu *iommu = iommu_data;
 710         struct vfio_group *group, *tmp;
 711         int ret;
 712
 713         group = kzalloc(sizeof(*group), GFP_KERNEL);
 714         if (!group)
 715                 return -ENOMEM;
 716
 717         mutex_lock(&iommu->lock);
 718
 719         list_for_each_entry(tmp, &iommu->group_list, next) {
 720                 if (tmp->iommu_group == iommu_group) {
 721                         mutex_unlock(&iommu->lock);
 722                         kfree(group);
 723                         return -EINVAL;
 724                 }
 725         }
 726
 727         /*
 728          * TODO: Domain have capabilities that might change as we add
 729          * groups (see iommu->cache, currently never set).  Check for
 730          * them and potentially disallow groups to be attached when it
 731          * would change capabilities (ugh).
 732          */
 733         ret = iommu_attach_group(iommu->domain, iommu_group);
 734         if (ret) {
 735                 mutex_unlock(&iommu->lock);
 736                 kfree(group);
 737                 return ret;
 738         }
 739
 740         group->iommu_group = iommu_group;
 741         list_add(&group->next, &iommu->group_list);
 742
 743         mutex_unlock(&iommu->lock);
 744
 745         return 0;
 746 }
 747
 748 static void vfio_iommu_type1_detach_group(void *iommu_data,
 749                                           struct iommu_group *iommu_group)
 750 {
 751         struct vfio_iommu *iommu = iommu_data;
 752         struct vfio_group *group;
 753
 754         mutex_lock(&iommu->lock);
 755
 756         list_for_each_entry(group, &iommu->group_list, next) {
 757                 if (group->iommu_group == iommu_group) {
 758                         iommu_detach_group(iommu->domain, iommu_group);
 759                         list_del(&group->next);
 760                         kfree(group);
 761                         break;
 762                 }
 763         }
 764
 765         mutex_unlock(&iommu->lock);
 766 }
 767
 768 static void *vfio_iommu_type1_open(unsigned long arg)
 769 {
 770         struct vfio_iommu *iommu;
 771
 772         if (arg != VFIO_TYPE1_IOMMU)
 773                 return ERR_PTR(-EINVAL);
 774
 775         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
 776         if (!iommu)
 777                 return ERR_PTR(-ENOMEM);
 778
 779         INIT_LIST_HEAD(&iommu->group_list);
 780         iommu->dma_list = RB_ROOT;
 781         mutex_init(&iommu->lock);
 782
 783         /*
 784          * Wish we didn't have to know about bus_type here.
 785          */
 786         iommu->domain = iommu_domain_alloc(&pci_bus_type);
 787         if (!iommu->domain) {
 788                 kfree(iommu);
 789                 return ERR_PTR(-EIO);
 790         }
 791
 792         /*
 793          * Wish we could specify required capabilities rather than create
 794          * a domain, see what comes out and hope it doesn't change along
 795          * the way.  Fortunately we know interrupt remapping is global for
 796          * our iommus.
 797          */
 798         if (!allow_unsafe_interrupts &&
 799             !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) {
 800                 pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
 801                        __func__);
 802                 iommu_domain_free(iommu->domain);
 803                 kfree(iommu);
 804                 return ERR_PTR(-EPERM);
 805         }
 806
 807         return iommu;
 808 }
 809
 810 static void vfio_iommu_type1_release(void *iommu_data)
 811 {
 812         struct vfio_iommu *iommu = iommu_data;
 813         struct vfio_group *group, *group_tmp;
 814         struct rb_node *node;
 815
 816         list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
 817                 iommu_detach_group(iommu->domain, group->iommu_group);
 818                 list_del(&group->next);
 819                 kfree(group);
 820         }
 821
 822         while ((node = rb_first(&iommu->dma_list))) {
 823                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
 824                 size_t size = dma->size;
 825                 vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
 826                 if (WARN_ON(!size))
 827                         break;
 828         }
 829
 830         iommu_domain_free(iommu->domain);
 831         iommu->domain = NULL;
 832         kfree(iommu);
 833 }
 834
 835 static long vfio_iommu_type1_ioctl(void *iommu_data,
 836                                    unsigned int cmd, unsigned long arg)
 837 {
 838         struct vfio_iommu *iommu = iommu_data;
 839         unsigned long minsz;
 840
 841         if (cmd == VFIO_CHECK_EXTENSION) {
 842                 switch (arg) {
 843                 case VFIO_TYPE1_IOMMU:
 844                         return 1;
 845                 default:
 846                         return 0;
 847                 }
 848         } else if (cmd == VFIO_IOMMU_GET_INFO) {
 849                 struct vfio_iommu_type1_info info;
 850
 851                 minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
 852
 853                 if (copy_from_user(&info, (void __user *)arg, minsz))
 854                         return -EFAULT;
 855
 856                 if (info.argsz < minsz)
 857                         return -EINVAL;
 858
 859                 info.flags = 0;
 860
 861                 info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap;
 862
 863                 return copy_to_user((void __user *)arg, &info, minsz);
 864
 865         } else if (cmd == VFIO_IOMMU_MAP_DMA) {
 866                 struct vfio_iommu_type1_dma_map map;
 867                 uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
 868                                 VFIO_DMA_MAP_FLAG_WRITE;
 869
 870                 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 871
 872                 if (copy_from_user(&map, (void __user *)arg, minsz))
 873                         return -EFAULT;
 874
 875                 if (map.argsz < minsz || map.flags & ~mask)
 876                         return -EINVAL;
 877
 878                 return vfio_dma_do_map(iommu, &map);
 879
 880         } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
 881                 struct vfio_iommu_type1_dma_unmap unmap;
 882                 long ret;
 883
 884                 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
 885
 886                 if (copy_from_user(&unmap, (void __user *)arg, minsz))
 887                         return -EFAULT;
 888
 889                 if (unmap.argsz < minsz || unmap.flags)
 890                         return -EINVAL;
 891
 892                 ret = vfio_dma_do_unmap(iommu, &unmap);
 893                 if (ret)
 894                         return ret;
 895
 896                 return copy_to_user((void __user *)arg, &unmap, minsz);
 897         }
 898
 899         return -ENOTTY;
 900 }
 901
/*
 * Callback table for this backend; registered in vfio_iommu_type1_init()
 * and unregistered in vfio_iommu_type1_cleanup().
 */
static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
        .name           = "vfio-iommu-type1",
        .owner          = THIS_MODULE,
        .open           = vfio_iommu_type1_open,
        .release        = vfio_iommu_type1_release,
        .ioctl          = vfio_iommu_type1_ioctl,
        .attach_group   = vfio_iommu_type1_attach_group,
        .detach_group   = vfio_iommu_type1_detach_group,
};
 911
 912 static int __init vfio_iommu_type1_init(void)
 913 {
 914         if (!iommu_present(&pci_bus_type))
 915                 return -ENODEV;
 916
 917         return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
 918 }
 919
/* Module exit: unregister the ops table registered by vfio_iommu_type1_init(). */
static void __exit vfio_iommu_type1_cleanup(void)
{
        vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
}
 924
/* Module entry and exit points */
module_init(vfio_iommu_type1_init);
module_exit(vfio_iommu_type1_cleanup);

/* Module metadata, built from the DRIVER_* strings defined at the top of the file */
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);