drivers/iommu/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47
48 #define ROOT_SIZE               VTD_PAGE_SIZE
49 #define CONTEXT_SIZE            VTD_PAGE_SIZE
50
51 #define IS_BRIDGE_HOST_DEVICE(pdev) \
52                             ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
53 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
54 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
55 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56
57 #define IOAPIC_RANGE_START      (0xfee00000)
58 #define IOAPIC_RANGE_END        (0xfeefffff)
59 #define IOVA_START_ADDR         (0x1000)
60
61 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62
63 #define MAX_AGAW_WIDTH 64
64
65 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
67
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
71                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
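/*
 * Worked example (illustrative, assuming the usual VTD_PAGE_SHIFT of 12):
 * for gaw = 48, __DOMAIN_MAX_PFN(48) = 2^36 - 1. On a 64-bit kernel
 * DOMAIN_MAX_PFN(48) is that same value; on a 32-bit kernel it is clamped
 * to ULONG_MAX (2^32 - 1) so that PFNs always fit in an unsigned long.
 */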
73
74 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
75 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
76 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
77
78 /* page table handling */
79 #define LEVEL_STRIDE            (9)
80 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
81
82 static inline int agaw_to_level(int agaw)
83 {
84         return agaw + 2;
85 }
86
87 static inline int agaw_to_width(int agaw)
88 {
89         return 30 + agaw * LEVEL_STRIDE;
90 }
91
92 static inline int width_to_agaw(int width)
93 {
94         return (width - 30) / LEVEL_STRIDE;
95 }
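/*
 * Worked example (not in the original): each page-table level resolves
 * LEVEL_STRIDE = 9 address bits on top of the 12-bit page offset, so
 * agaw 0 corresponds to a 30-bit width and a 2-level table (30 = 12 + 2*9),
 * while agaw 2 corresponds to 48 bits and 4 levels (48 = 12 + 4*9),
 * matching agaw_to_width() and agaw_to_level() above.
 */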
96
97 static inline unsigned int level_to_offset_bits(int level)
98 {
99         return (level - 1) * LEVEL_STRIDE;
100 }
101
102 static inline int pfn_level_offset(unsigned long pfn, int level)
103 {
104         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
105 }
106
107 static inline unsigned long level_mask(int level)
108 {
109         return -1UL << level_to_offset_bits(level);
110 }
111
112 static inline unsigned long level_size(int level)
113 {
114         return 1UL << level_to_offset_bits(level);
115 }
116
117 static inline unsigned long align_to_level(unsigned long pfn, int level)
118 {
119         return (pfn + level_size(level) - 1) & level_mask(level);
120 }
121
122 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
123 {
124         return  1 << ((lvl - 1) * LEVEL_STRIDE);
125 }
126
127 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
128    are never going to work. */
129 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
130 {
131         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
132 }
133
134 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
135 {
136         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
137 }
138 static inline unsigned long page_to_dma_pfn(struct page *pg)
139 {
140         return mm_to_dma_pfn(page_to_pfn(pg));
141 }
142 static inline unsigned long virt_to_dma_pfn(void *p)
143 {
144         return page_to_dma_pfn(virt_to_page(p));
145 }
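/*
 * Illustrative note: with 4KiB MM pages, PAGE_SHIFT == VTD_PAGE_SHIFT == 12
 * and the conversions above are identities. If an architecture used, say,
 * 16KiB MM pages (PAGE_SHIFT == 14), each MM pfn would cover four VT-d pfns
 * and mm_to_dma_pfn() would shift left by 2; hence the rule that VT-d pages
 * must never be larger than MM pages.
 */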
146
147 /* global iommu list, set NULL for ignored DMAR units */
148 static struct intel_iommu **g_iommus;
149
150 static void __init check_tylersburg_isoch(void);
151 static int rwbf_quirk;
152
153 /*
154  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
155  * (used when the kernel is launched with TXT)
156  */
157 static int force_on = 0;
158
159 /*
160  * 0: Present
161  * 1-11: Reserved
162  * 12-63: Context Ptr (12 - (haw-1))
163  * 64-127: Reserved
164  */
165 struct root_entry {
166         u64     val;
167         u64     rsvd1;
168 };
169 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
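/*
 * Worked example: each root entry is 16 bytes (two u64 fields), so
 * ROOT_ENTRY_NR = 4096 / 16 = 256 entries per root table, i.e. one root
 * entry for each possible PCI bus number.
 */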
170 static inline bool root_present(struct root_entry *root)
171 {
172         return (root->val & 1);
173 }
174 static inline void set_root_present(struct root_entry *root)
175 {
176         root->val |= 1;
177 }
178 static inline void set_root_value(struct root_entry *root, unsigned long value)
179 {
180         root->val |= value & VTD_PAGE_MASK;
181 }
182
183 static inline struct context_entry *
184 get_context_addr_from_root(struct root_entry *root)
185 {
186         return (struct context_entry *)
187                 (root_present(root)?phys_to_virt(
188                 root->val & VTD_PAGE_MASK) :
189                 NULL);
190 }
191
192 /*
193  * low 64 bits:
194  * 0: present
195  * 1: fault processing disable
196  * 2-3: translation type
197  * 12-63: address space root
198  * high 64 bits:
199  * 0-2: address width
200  * 3-6: avail
201  * 8-23: domain id
202  */
203 struct context_entry {
204         u64 lo;
205         u64 hi;
206 };
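/*
 * Illustrative example (values chosen for illustration, composed per the
 * field layout described above): a context entry for domain id 5 with a
 * 4-level (48-bit, address width 2) page table rooted at physical address
 * 0x1000 ends up with lo = 0x1000 | 1 (address space root + present) and
 * hi = (5 << 8) | 2 (domain id + address width), which is exactly what the
 * context_set_*() helpers below compose.
 */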
207
208 static inline bool context_present(struct context_entry *context)
209 {
210         return (context->lo & 1);
211 }
212 static inline void context_set_present(struct context_entry *context)
213 {
214         context->lo |= 1;
215 }
216
217 static inline void context_set_fault_enable(struct context_entry *context)
218 {
219         context->lo &= (((u64)-1) << 2) | 1;
220 }
221
222 static inline void context_set_translation_type(struct context_entry *context,
223                                                 unsigned long value)
224 {
225         context->lo &= (((u64)-1) << 4) | 3;
226         context->lo |= (value & 3) << 2;
227 }
228
229 static inline void context_set_address_root(struct context_entry *context,
230                                             unsigned long value)
231 {
232         context->lo |= value & VTD_PAGE_MASK;
233 }
234
235 static inline void context_set_address_width(struct context_entry *context,
236                                              unsigned long value)
237 {
238         context->hi |= value & 7;
239 }
240
241 static inline void context_set_domain_id(struct context_entry *context,
242                                          unsigned long value)
243 {
244         context->hi |= (value & ((1 << 16) - 1)) << 8;
245 }
246
247 static inline void context_clear_entry(struct context_entry *context)
248 {
249         context->lo = 0;
250         context->hi = 0;
251 }
252
253 /*
254  * 0: readable
255  * 1: writable
256  * 2-6: reserved
257  * 7: super page
258  * 8-10: available
259  * 11: snoop behavior
260  * 12-63: Host physical address
261  */
262 struct dma_pte {
263         u64 val;
264 };
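/*
 * Illustrative example (values chosen for illustration, based on the bit
 * layout above): a present, readable and writable 4KiB PTE pointing at host
 * physical page 0x12345000 holds val = 0x12345000 | DMA_PTE_READ |
 * DMA_PTE_WRITE, i.e. 0x12345003 given the bit positions listed above;
 * dma_pte_present() then reports true because the low two permission bits
 * are non-zero.
 */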
265
266 static inline void dma_clear_pte(struct dma_pte *pte)
267 {
268         pte->val = 0;
269 }
270
271 static inline void dma_set_pte_readable(struct dma_pte *pte)
272 {
273         pte->val |= DMA_PTE_READ;
274 }
275
276 static inline void dma_set_pte_writable(struct dma_pte *pte)
277 {
278         pte->val |= DMA_PTE_WRITE;
279 }
280
281 static inline void dma_set_pte_snp(struct dma_pte *pte)
282 {
283         pte->val |= DMA_PTE_SNP;
284 }
285
286 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
287 {
288         pte->val = (pte->val & ~3) | (prot & 3);
289 }
290
291 static inline u64 dma_pte_addr(struct dma_pte *pte)
292 {
293 #ifdef CONFIG_64BIT
294         return pte->val & VTD_PAGE_MASK;
295 #else
296         /* Must have a full atomic 64-bit read */
297         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
298 #endif
299 }
300
301 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
302 {
303         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
304 }
305
306 static inline bool dma_pte_present(struct dma_pte *pte)
307 {
308         return (pte->val & 3) != 0;
309 }
310
311 static inline bool dma_pte_superpage(struct dma_pte *pte)
312 {
313         return (pte->val & (1 << 7));
314 }
315
316 static inline int first_pte_in_page(struct dma_pte *pte)
317 {
318         return !((unsigned long)pte & ~VTD_PAGE_MASK);
319 }
320
321 /*
322  * This domain is a statically identity mapping domain.
323  *      1. This domain creates a static 1:1 mapping to all usable memory.
324  *      2. It maps to each iommu if successful.
325  *      3. Each iommu maps to this domain if successful.
326  */
327 static struct dmar_domain *si_domain;
328 static int hw_pass_through = 1;
329
330 /* devices under the same p2p bridge are owned in one domain */
331 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
332
333 /* domain represents a virtual machine; more than one device
334  * across iommus may be owned by one domain, e.g. a kvm guest.
335  */
336 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
337
338 /* si_domain contains multiple devices */
339 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
340
341 struct dmar_domain {
342         int     id;                     /* domain id */
343         int     nid;                    /* node id */
344         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
345
346         struct list_head devices;       /* all devices' list */
347         struct iova_domain iovad;       /* iova's that belong to this domain */
348
349         struct dma_pte  *pgd;           /* virtual address */
350         int             gaw;            /* max guest address width */
351
352         /* adjusted guest address width, 0 is level 2 30-bit */
353         int             agaw;
354
355         int             flags;          /* flags to find out type of domain */
356
357         int             iommu_coherency;/* indicate coherency of iommu access */
358         int             iommu_snooping; /* indicate snooping control feature*/
359         int             iommu_count;    /* reference count of iommu */
360         int             iommu_superpage;/* Level of superpages supported:
361                                            0 == 4KiB (no superpages), 1 == 2MiB,
362                                            2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
363         spinlock_t      iommu_lock;     /* protect iommu set in domain */
364         u64             max_addr;       /* maximum mapped address */
365 };
366
367 /* PCI domain-device relationship */
368 struct device_domain_info {
369         struct list_head link;  /* link to domain siblings */
370         struct list_head global; /* link to global list */
371         int segment;            /* PCI domain */
372         u8 bus;                 /* PCI bus number */
373         u8 devfn;               /* PCI devfn number */
374         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
375         struct intel_iommu *iommu; /* IOMMU used by this device */
376         struct dmar_domain *domain; /* pointer to domain */
377 };
378
379 static void flush_unmaps_timeout(unsigned long data);
380
381 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
382
383 #define HIGH_WATER_MARK 250
384 struct deferred_flush_tables {
385         int next;
386         struct iova *iova[HIGH_WATER_MARK];
387         struct dmar_domain *domain[HIGH_WATER_MARK];
388 };
389
390 static struct deferred_flush_tables *deferred_flush;
391
392 /* number of IOMMUs; bounds the per-domain iommu bitmaps */
393 static int g_num_of_iommus;
394
395 static DEFINE_SPINLOCK(async_umap_flush_lock);
396 static LIST_HEAD(unmaps_to_do);
397
398 static int timer_on;
399 static long list_size;
400
401 static void domain_remove_dev_info(struct dmar_domain *domain);
402
403 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
404 int dmar_disabled = 0;
405 #else
406 int dmar_disabled = 1;
407 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
408
409 int intel_iommu_enabled = 0;
410 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
411
412 static int dmar_map_gfx = 1;
413 static int dmar_forcedac;
414 static int intel_iommu_strict;
415 static int intel_iommu_superpage = 1;
416
417 int intel_iommu_gfx_mapped;
418 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
419
420 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
421 static DEFINE_SPINLOCK(device_domain_lock);
422 static LIST_HEAD(device_domain_list);
423
424 static struct iommu_ops intel_iommu_ops;
425
426 static int __init intel_iommu_setup(char *str)
427 {
428         if (!str)
429                 return -EINVAL;
430         while (*str) {
431                 if (!strncmp(str, "on", 2)) {
432                         dmar_disabled = 0;
433                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
434                 } else if (!strncmp(str, "off", 3)) {
435                         dmar_disabled = 1;
436                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
437                 } else if (!strncmp(str, "igfx_off", 8)) {
438                         dmar_map_gfx = 0;
439                         printk(KERN_INFO
440                                 "Intel-IOMMU: disable GFX device mapping\n");
441                 } else if (!strncmp(str, "forcedac", 8)) {
442                         printk(KERN_INFO
443                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
444                         dmar_forcedac = 1;
445                 } else if (!strncmp(str, "strict", 6)) {
446                         printk(KERN_INFO
447                                 "Intel-IOMMU: disable batched IOTLB flush\n");
448                         intel_iommu_strict = 1;
449                 } else if (!strncmp(str, "sp_off", 6)) {
450                         printk(KERN_INFO
451                                 "Intel-IOMMU: disable supported super page\n");
452                         intel_iommu_superpage = 0;
453                 }
454
455                 str += strcspn(str, ",");
456                 while (*str == ',')
457                         str++;
458         }
459         return 0;
460 }
461 __setup("intel_iommu=", intel_iommu_setup);
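/*
 * Usage example (hypothetical command line): booting with
 * "intel_iommu=on,strict,igfx_off" is parsed by the loop above one
 * comma-separated token at a time, enabling the IOMMU, disabling batched
 * IOTLB flushing and skipping the graphics device mapping.
 */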
462
463 static struct kmem_cache *iommu_domain_cache;
464 static struct kmem_cache *iommu_devinfo_cache;
465 static struct kmem_cache *iommu_iova_cache;
466
467 static inline void *alloc_pgtable_page(int node)
468 {
469         struct page *page;
470         void *vaddr = NULL;
471
472         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
473         if (page)
474                 vaddr = page_address(page);
475         return vaddr;
476 }
477
478 static inline void free_pgtable_page(void *vaddr)
479 {
480         free_page((unsigned long)vaddr);
481 }
482
483 static inline void *alloc_domain_mem(void)
484 {
485         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
486 }
487
488 static void free_domain_mem(void *vaddr)
489 {
490         kmem_cache_free(iommu_domain_cache, vaddr);
491 }
492
493 static inline void * alloc_devinfo_mem(void)
494 {
495         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
496 }
497
498 static inline void free_devinfo_mem(void *vaddr)
499 {
500         kmem_cache_free(iommu_devinfo_cache, vaddr);
501 }
502
503 struct iova *alloc_iova_mem(void)
504 {
505         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
506 }
507
508 void free_iova_mem(struct iova *iova)
509 {
510         kmem_cache_free(iommu_iova_cache, iova);
511 }
512
513
514 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
515 {
516         unsigned long sagaw;
517         int agaw = -1;
518
519         sagaw = cap_sagaw(iommu->cap);
520         for (agaw = width_to_agaw(max_gaw);
521              agaw >= 0; agaw--) {
522                 if (test_bit(agaw, &sagaw))
523                         break;
524         }
525
526         return agaw;
527 }
528
529 /*
530  * Calculate max SAGAW for each iommu.
531  */
532 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
533 {
534         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
535 }
536
537 /*
538  * Calculate agaw for each iommu.
539  * "SAGAW" may differ across iommus, so use a default agaw and fall back
540  * to a smaller supported agaw for iommus that don't support the default.
541  */
542 int iommu_calculate_agaw(struct intel_iommu *iommu)
543 {
544         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
545 }
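/*
 * Worked example (illustrative): if cap_sagaw() reports only bit 2 set
 * (i.e. only 4-level, 48-bit tables are supported), then
 * __iommu_calculate_agaw(iommu, 48) starts at width_to_agaw(48) = 2,
 * finds bit 2 set and returns agaw = 2.
 */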
546
547 /* This function only returns a single iommu in a domain */
548 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
549 {
550         int iommu_id;
551
552         /* si_domain and vm domain should not get here. */
553         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
554         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
555
556         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
557         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
558                 return NULL;
559
560         return g_iommus[iommu_id];
561 }
562
563 static void domain_update_iommu_coherency(struct dmar_domain *domain)
564 {
565         int i;
566
567         domain->iommu_coherency = 1;
568
569         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
570                 if (!ecap_coherent(g_iommus[i]->ecap)) {
571                         domain->iommu_coherency = 0;
572                         break;
573                 }
574         }
575 }
576
577 static void domain_update_iommu_snooping(struct dmar_domain *domain)
578 {
579         int i;
580
581         domain->iommu_snooping = 1;
582
583         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
584                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
585                         domain->iommu_snooping = 0;
586                         break;
587                 }
588         }
589 }
590
591 static void domain_update_iommu_superpage(struct dmar_domain *domain)
592 {
593         struct dmar_drhd_unit *drhd;
594         struct intel_iommu *iommu = NULL;
595         int mask = 0xf;
596
597         if (!intel_iommu_superpage) {
598                 domain->iommu_superpage = 0;
599                 return;
600         }
601
602         /* set iommu_superpage to the smallest common denominator */
603         for_each_active_iommu(iommu, drhd) {
604                 mask &= cap_super_page_val(iommu->cap);
605                 if (!mask) {
606                         break;
607                 }
608         }
609         domain->iommu_superpage = fls(mask);
610 }
611
612 /* Some capabilities may be different across iommus */
613 static void domain_update_iommu_cap(struct dmar_domain *domain)
614 {
615         domain_update_iommu_coherency(domain);
616         domain_update_iommu_snooping(domain);
617         domain_update_iommu_superpage(domain);
618 }
619
620 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
621 {
622         struct dmar_drhd_unit *drhd = NULL;
623         int i;
624
625         for_each_drhd_unit(drhd) {
626                 if (drhd->ignored)
627                         continue;
628                 if (segment != drhd->segment)
629                         continue;
630
631                 for (i = 0; i < drhd->devices_cnt; i++) {
632                         if (drhd->devices[i] &&
633                             drhd->devices[i]->bus->number == bus &&
634                             drhd->devices[i]->devfn == devfn)
635                                 return drhd->iommu;
636                         if (drhd->devices[i] &&
637                             drhd->devices[i]->subordinate &&
638                             drhd->devices[i]->subordinate->number <= bus &&
639                             drhd->devices[i]->subordinate->subordinate >= bus)
640                                 return drhd->iommu;
641                 }
642
643                 if (drhd->include_all)
644                         return drhd->iommu;
645         }
646
647         return NULL;
648 }
649
650 static void domain_flush_cache(struct dmar_domain *domain,
651                                void *addr, int size)
652 {
653         if (!domain->iommu_coherency)
654                 clflush_cache_range(addr, size);
655 }
656
657 /* Gets context entry for a given bus and devfn */
658 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
659                 u8 bus, u8 devfn)
660 {
661         struct root_entry *root;
662         struct context_entry *context;
663         unsigned long phy_addr;
664         unsigned long flags;
665
666         spin_lock_irqsave(&iommu->lock, flags);
667         root = &iommu->root_entry[bus];
668         context = get_context_addr_from_root(root);
669         if (!context) {
670                 context = (struct context_entry *)
671                                 alloc_pgtable_page(iommu->node);
672                 if (!context) {
673                         spin_unlock_irqrestore(&iommu->lock, flags);
674                         return NULL;
675                 }
676                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
677                 phy_addr = virt_to_phys((void *)context);
678                 set_root_value(root, phy_addr);
679                 set_root_present(root);
680                 __iommu_flush_cache(iommu, root, sizeof(*root));
681         }
682         spin_unlock_irqrestore(&iommu->lock, flags);
683         return &context[devfn];
684 }
685
686 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
687 {
688         struct root_entry *root;
689         struct context_entry *context;
690         int ret;
691         unsigned long flags;
692
693         spin_lock_irqsave(&iommu->lock, flags);
694         root = &iommu->root_entry[bus];
695         context = get_context_addr_from_root(root);
696         if (!context) {
697                 ret = 0;
698                 goto out;
699         }
700         ret = context_present(&context[devfn]);
701 out:
702         spin_unlock_irqrestore(&iommu->lock, flags);
703         return ret;
704 }
705
706 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
707 {
708         struct root_entry *root;
709         struct context_entry *context;
710         unsigned long flags;
711
712         spin_lock_irqsave(&iommu->lock, flags);
713         root = &iommu->root_entry[bus];
714         context = get_context_addr_from_root(root);
715         if (context) {
716                 context_clear_entry(&context[devfn]);
717                 __iommu_flush_cache(iommu, &context[devfn], \
718                         sizeof(*context));
719         }
720         spin_unlock_irqrestore(&iommu->lock, flags);
721 }
722
723 static void free_context_table(struct intel_iommu *iommu)
724 {
725         struct root_entry *root;
726         int i;
727         unsigned long flags;
728         struct context_entry *context;
729
730         spin_lock_irqsave(&iommu->lock, flags);
731         if (!iommu->root_entry) {
732                 goto out;
733         }
734         for (i = 0; i < ROOT_ENTRY_NR; i++) {
735                 root = &iommu->root_entry[i];
736                 context = get_context_addr_from_root(root);
737                 if (context)
738                         free_pgtable_page(context);
739         }
740         free_pgtable_page(iommu->root_entry);
741         iommu->root_entry = NULL;
742 out:
743         spin_unlock_irqrestore(&iommu->lock, flags);
744 }
745
746 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
747                                       unsigned long pfn, int target_level)
748 {
749         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
750         struct dma_pte *parent, *pte = NULL;
751         int level = agaw_to_level(domain->agaw);
752         int offset;
753
754         BUG_ON(!domain->pgd);
755         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
756         parent = domain->pgd;
757
758         while (level > 0) {
759                 void *tmp_page;
760
761                 offset = pfn_level_offset(pfn, level);
762                 pte = &parent[offset];
763                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
764                         break;
765                 if (level == target_level)
766                         break;
767
768                 if (!dma_pte_present(pte)) {
769                         uint64_t pteval;
770
771                         tmp_page = alloc_pgtable_page(domain->nid);
772
773                         if (!tmp_page)
774                                 return NULL;
775
776                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
777                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
778                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
779                                 /* Someone else set it while we were thinking; use theirs. */
780                                 free_pgtable_page(tmp_page);
781                         } else {
782                                 dma_pte_addr(pte);
783                                 domain_flush_cache(domain, pte, sizeof(*pte));
784                         }
785                 }
786                 parent = phys_to_virt(dma_pte_addr(pte));
787                 level--;
788         }
789
790         return pte;
791 }
792
793
794 /* return address's pte at specific level */
795 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
796                                          unsigned long pfn,
797                                          int level, int *large_page)
798 {
799         struct dma_pte *parent, *pte = NULL;
800         int total = agaw_to_level(domain->agaw);
801         int offset;
802
803         parent = domain->pgd;
804         while (level <= total) {
805                 offset = pfn_level_offset(pfn, total);
806                 pte = &parent[offset];
807                 if (level == total)
808                         return pte;
809
810                 if (!dma_pte_present(pte)) {
811                         *large_page = total;
812                         break;
813                 }
814
815                 if (pte->val & DMA_PTE_LARGE_PAGE) {
816                         *large_page = total;
817                         return pte;
818                 }
819
820                 parent = phys_to_virt(dma_pte_addr(pte));
821                 total--;
822         }
823         return NULL;
824 }
825
826 /* clear last level pte; a tlb flush should follow */
827 static int dma_pte_clear_range(struct dmar_domain *domain,
828                                 unsigned long start_pfn,
829                                 unsigned long last_pfn)
830 {
831         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
832         unsigned int large_page = 1;
833         struct dma_pte *first_pte, *pte;
834         int order;
835
836         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
837         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
838         BUG_ON(start_pfn > last_pfn);
839
840         /* we don't need lock here; nobody else touches the iova range */
841         do {
842                 large_page = 1;
843                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
844                 if (!pte) {
845                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
846                         continue;
847                 }
848                 do {
849                         dma_clear_pte(pte);
850                         start_pfn += lvl_to_nr_pages(large_page);
851                         pte++;
852                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
853
854                 domain_flush_cache(domain, first_pte,
855                                    (void *)pte - (void *)first_pte);
856
857         } while (start_pfn && start_pfn <= last_pfn);
858
859         order = (large_page - 1) * 9;
860         return order;
861 }
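/*
 * Illustrative note: the returned order reflects the size of the last PTEs
 * cleared; e.g. if the final iteration walked 2MiB superpage PTEs
 * (large_page == 2), the function returns (2 - 1) * 9 = 9, i.e. a
 * 2MiB-granular range for the caller's IOTLB flush.
 */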
862
863 /* free page table pages. last level pte should already be cleared */
864 static void dma_pte_free_pagetable(struct dmar_domain *domain,
865                                    unsigned long start_pfn,
866                                    unsigned long last_pfn)
867 {
868         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
869         struct dma_pte *first_pte, *pte;
870         int total = agaw_to_level(domain->agaw);
871         int level;
872         unsigned long tmp;
873         int large_page = 2;
874
875         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
876         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
877         BUG_ON(start_pfn > last_pfn);
878
879         /* We don't need lock here; nobody else touches the iova range */
880         level = 2;
881         while (level <= total) {
882                 tmp = align_to_level(start_pfn, level);
883
884                 /* If we can't even clear one PTE at this level, we're done */
885                 if (tmp + level_size(level) - 1 > last_pfn)
886                         return;
887
888                 do {
889                         large_page = level;
890                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
891                         if (large_page > level)
892                                 level = large_page + 1;
893                         if (!pte) {
894                                 tmp = align_to_level(tmp + 1, level + 1);
895                                 continue;
896                         }
897                         do {
898                                 if (dma_pte_present(pte)) {
899                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
900                                         dma_clear_pte(pte);
901                                 }
902                                 pte++;
903                                 tmp += level_size(level);
904                         } while (!first_pte_in_page(pte) &&
905                                  tmp + level_size(level) - 1 <= last_pfn);
906
907                         domain_flush_cache(domain, first_pte,
908                                            (void *)pte - (void *)first_pte);
909
910                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
911                 level++;
912         }
913         /* free pgd */
914         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
915                 free_pgtable_page(domain->pgd);
916                 domain->pgd = NULL;
917         }
918 }
919
920 /* iommu handling */
921 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
922 {
923         struct root_entry *root;
924         unsigned long flags;
925
926         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
927         if (!root)
928                 return -ENOMEM;
929
930         __iommu_flush_cache(iommu, root, ROOT_SIZE);
931
932         spin_lock_irqsave(&iommu->lock, flags);
933         iommu->root_entry = root;
934         spin_unlock_irqrestore(&iommu->lock, flags);
935
936         return 0;
937 }
938
939 static void iommu_set_root_entry(struct intel_iommu *iommu)
940 {
941         void *addr;
942         u32 sts;
943         unsigned long flag;
944
945         addr = iommu->root_entry;
946
947         raw_spin_lock_irqsave(&iommu->register_lock, flag);
948         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
949
950         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
951
952         /* Make sure hardware completes it */
953         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
954                       readl, (sts & DMA_GSTS_RTPS), sts);
955
956         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
957 }
958
959 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
960 {
961         u32 val;
962         unsigned long flag;
963
964         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
965                 return;
966
967         raw_spin_lock_irqsave(&iommu->register_lock, flag);
968         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
969
970         /* Make sure hardware completes it */
971         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
972                       readl, (!(val & DMA_GSTS_WBFS)), val);
973
974         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
975 }
976
977 /* return value determines if we need a write buffer flush */
978 static void __iommu_flush_context(struct intel_iommu *iommu,
979                                   u16 did, u16 source_id, u8 function_mask,
980                                   u64 type)
981 {
982         u64 val = 0;
983         unsigned long flag;
984
985         switch (type) {
986         case DMA_CCMD_GLOBAL_INVL:
987                 val = DMA_CCMD_GLOBAL_INVL;
988                 break;
989         case DMA_CCMD_DOMAIN_INVL:
990                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
991                 break;
992         case DMA_CCMD_DEVICE_INVL:
993                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
994                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
995                 break;
996         default:
997                 BUG();
998         }
999         val |= DMA_CCMD_ICC;
1000
1001         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1002         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1003
1004         /* Make sure hardware completes it */
1005         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1006                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1007
1008         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1009 }
1010
1011 /* return value determines if we need a write buffer flush */
1012 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1013                                 u64 addr, unsigned int size_order, u64 type)
1014 {
1015         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1016         u64 val = 0, val_iva = 0;
1017         unsigned long flag;
1018
1019         switch (type) {
1020         case DMA_TLB_GLOBAL_FLUSH:
1021                 /* global flush doesn't need to set IVA_REG */
1022                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1023                 break;
1024         case DMA_TLB_DSI_FLUSH:
1025                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1026                 break;
1027         case DMA_TLB_PSI_FLUSH:
1028                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1029                 /* Note: always flush non-leaf currently */
1030                 val_iva = size_order | addr;
1031                 break;
1032         default:
1033                 BUG();
1034         }
1035         /* Note: set drain read/write */
1036 #if 0
1037         /*
1038          * This is probably just being extra cautious; it looks like we can
1039          * ignore it without any impact.
1040          */
1041         if (cap_read_drain(iommu->cap))
1042                 val |= DMA_TLB_READ_DRAIN;
1043 #endif
1044         if (cap_write_drain(iommu->cap))
1045                 val |= DMA_TLB_WRITE_DRAIN;
1046
1047         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1048         /* Note: Only uses first TLB reg currently */
1049         if (val_iva)
1050                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1051         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1052
1053         /* Make sure hardware completes it */
1054         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1055                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1056
1057         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1058
1059         /* check IOTLB invalidation granularity */
1060         if (DMA_TLB_IAIG(val) == 0)
1061                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1062         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1063                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1064                         (unsigned long long)DMA_TLB_IIRG(type),
1065                         (unsigned long long)DMA_TLB_IAIG(val));
1066 }
1067
1068 static struct device_domain_info *iommu_support_dev_iotlb(
1069         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1070 {
1071         int found = 0;
1072         unsigned long flags;
1073         struct device_domain_info *info;
1074         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1075
1076         if (!ecap_dev_iotlb_support(iommu->ecap))
1077                 return NULL;
1078
1079         if (!iommu->qi)
1080                 return NULL;
1081
1082         spin_lock_irqsave(&device_domain_lock, flags);
1083         list_for_each_entry(info, &domain->devices, link)
1084                 if (info->bus == bus && info->devfn == devfn) {
1085                         found = 1;
1086                         break;
1087                 }
1088         spin_unlock_irqrestore(&device_domain_lock, flags);
1089
1090         if (!found || !info->dev)
1091                 return NULL;
1092
1093         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1094                 return NULL;
1095
1096         if (!dmar_find_matched_atsr_unit(info->dev))
1097                 return NULL;
1098
1099         info->iommu = iommu;
1100
1101         return info;
1102 }
1103
1104 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1105 {
1106         if (!info)
1107                 return;
1108
1109         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1110 }
1111
1112 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1113 {
1114         if (!info->dev || !pci_ats_enabled(info->dev))
1115                 return;
1116
1117         pci_disable_ats(info->dev);
1118 }
1119
1120 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1121                                   u64 addr, unsigned mask)
1122 {
1123         u16 sid, qdep;
1124         unsigned long flags;
1125         struct device_domain_info *info;
1126
1127         spin_lock_irqsave(&device_domain_lock, flags);
1128         list_for_each_entry(info, &domain->devices, link) {
1129                 if (!info->dev || !pci_ats_enabled(info->dev))
1130                         continue;
1131
1132                 sid = info->bus << 8 | info->devfn;
1133                 qdep = pci_ats_queue_depth(info->dev);
1134                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1135         }
1136         spin_unlock_irqrestore(&device_domain_lock, flags);
1137 }
1138
1139 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1140                                   unsigned long pfn, unsigned int pages, int map)
1141 {
1142         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1143         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1144
1145         BUG_ON(pages == 0);
1146
1147         /*
1148          * Fallback to domain selective flush if no PSI support or the size is
1149          * too big.
1150          * PSI requires the page size to be a power of two (2 ^ x), and the base
1151          * address to be naturally aligned to that size.
1152          */
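        /*
         * Worked example (illustrative): for pages = 3,
         * __roundup_pow_of_two(3) = 4 and ilog2(4) = 2, so mask = 2 and the
         * PSI flush covers a naturally aligned 4-page (16KiB) region.
         */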
1153         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1154                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1155                                                 DMA_TLB_DSI_FLUSH);
1156         else
1157                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1158                                                 DMA_TLB_PSI_FLUSH);
1159
1160         /*
1161          * In caching mode, changing pages from non-present to present requires a
1162          * flush. However, the device IOTLB doesn't need to be flushed in this case.
1163          */
1164         if (!cap_caching_mode(iommu->cap) || !map)
1165                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1166 }
1167
1168 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1169 {
1170         u32 pmen;
1171         unsigned long flags;
1172
1173         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1174         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1175         pmen &= ~DMA_PMEN_EPM;
1176         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1177
1178         /* wait for the protected region status bit to clear */
1179         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1180                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1181
1182         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1183 }
1184
1185 static int iommu_enable_translation(struct intel_iommu *iommu)
1186 {
1187         u32 sts;
1188         unsigned long flags;
1189
1190         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1191         iommu->gcmd |= DMA_GCMD_TE;
1192         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1193
1194         /* Make sure hardware completes it */
1195         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1196                       readl, (sts & DMA_GSTS_TES), sts);
1197
1198         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1199         return 0;
1200 }
1201
1202 static int iommu_disable_translation(struct intel_iommu *iommu)
1203 {
1204         u32 sts;
1205         unsigned long flag;
1206
1207         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1208         iommu->gcmd &= ~DMA_GCMD_TE;
1209         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1210
1211         /* Make sure hardware completes it */
1212         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1213                       readl, (!(sts & DMA_GSTS_TES)), sts);
1214
1215         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1216         return 0;
1217 }
1218
1219
1220 static int iommu_init_domains(struct intel_iommu *iommu)
1221 {
1222         unsigned long ndomains;
1223         unsigned long nlongs;
1224
1225         ndomains = cap_ndoms(iommu->cap);
1226         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1227                         ndomains);
1228         nlongs = BITS_TO_LONGS(ndomains);
1229
1230         spin_lock_init(&iommu->lock);
1231
1232         /* TBD: there might be 64K domains,
1233          * consider another allocation scheme for future chips
1234          */
1235         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1236         if (!iommu->domain_ids) {
1237                 printk(KERN_ERR "Allocating domain id array failed\n");
1238                 return -ENOMEM;
1239         }
1240         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1241                         GFP_KERNEL);
1242         if (!iommu->domains) {
1243                 printk(KERN_ERR "Allocating domain array failed\n");
1244                 return -ENOMEM;
1245         }
1246
1247         /*
1248          * If Caching mode is set, then invalid translations are tagged
1249          * with domain id 0, hence we need to pre-allocate it.
1250          */
1251         if (cap_caching_mode(iommu->cap))
1252                 set_bit(0, iommu->domain_ids);
1253         return 0;
1254 }
1255
1256
1257 static void domain_exit(struct dmar_domain *domain);
1258 static void vm_domain_exit(struct dmar_domain *domain);
1259
1260 void free_dmar_iommu(struct intel_iommu *iommu)
1261 {
1262         struct dmar_domain *domain;
1263         int i;
1264         unsigned long flags;
1265
1266         if ((iommu->domains) && (iommu->domain_ids)) {
1267                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1268                         domain = iommu->domains[i];
1269                         clear_bit(i, iommu->domain_ids);
1270
1271                         spin_lock_irqsave(&domain->iommu_lock, flags);
1272                         if (--domain->iommu_count == 0) {
1273                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1274                                         vm_domain_exit(domain);
1275                                 else
1276                                         domain_exit(domain);
1277                         }
1278                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1279                 }
1280         }
1281
1282         if (iommu->gcmd & DMA_GCMD_TE)
1283                 iommu_disable_translation(iommu);
1284
1285         if (iommu->irq) {
1286                 irq_set_handler_data(iommu->irq, NULL);
1287                 /* This will mask the irq */
1288                 free_irq(iommu->irq, iommu);
1289                 destroy_irq(iommu->irq);
1290         }
1291
1292         kfree(iommu->domains);
1293         kfree(iommu->domain_ids);
1294
1295         g_iommus[iommu->seq_id] = NULL;
1296
1297         /* if all iommus are freed, free g_iommus */
1298         for (i = 0; i < g_num_of_iommus; i++) {
1299                 if (g_iommus[i])
1300                         break;
1301         }
1302
1303         if (i == g_num_of_iommus)
1304                 kfree(g_iommus);
1305
1306         /* free context mapping */
1307         free_context_table(iommu);
1308 }
1309
1310 static struct dmar_domain *alloc_domain(void)
1311 {
1312         struct dmar_domain *domain;
1313
1314         domain = alloc_domain_mem();
1315         if (!domain)
1316                 return NULL;
1317
1318         domain->nid = -1;
1319         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1320         domain->flags = 0;
1321
1322         return domain;
1323 }
1324
1325 static int iommu_attach_domain(struct dmar_domain *domain,
1326                                struct intel_iommu *iommu)
1327 {
1328         int num;
1329         unsigned long ndomains;
1330         unsigned long flags;
1331
1332         ndomains = cap_ndoms(iommu->cap);
1333
1334         spin_lock_irqsave(&iommu->lock, flags);
1335
1336         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1337         if (num >= ndomains) {
1338                 spin_unlock_irqrestore(&iommu->lock, flags);
1339                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1340                 return -ENOMEM;
1341         }
1342
1343         domain->id = num;
1344         set_bit(num, iommu->domain_ids);
1345         set_bit(iommu->seq_id, &domain->iommu_bmp);
1346         iommu->domains[num] = domain;
1347         spin_unlock_irqrestore(&iommu->lock, flags);
1348
1349         return 0;
1350 }
1351
1352 static void iommu_detach_domain(struct dmar_domain *domain,
1353                                 struct intel_iommu *iommu)
1354 {
1355         unsigned long flags;
1356         int num, ndomains;
1357         int found = 0;
1358
1359         spin_lock_irqsave(&iommu->lock, flags);
1360         ndomains = cap_ndoms(iommu->cap);
1361         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1362                 if (iommu->domains[num] == domain) {
1363                         found = 1;
1364                         break;
1365                 }
1366         }
1367
1368         if (found) {
1369                 clear_bit(num, iommu->domain_ids);
1370                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1371                 iommu->domains[num] = NULL;
1372         }
1373         spin_unlock_irqrestore(&iommu->lock, flags);
1374 }
1375
1376 static struct iova_domain reserved_iova_list;
1377 static struct lock_class_key reserved_rbtree_key;
1378
1379 static int dmar_init_reserved_ranges(void)
1380 {
1381         struct pci_dev *pdev = NULL;
1382         struct iova *iova;
1383         int i;
1384
1385         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1386
1387         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1388                 &reserved_rbtree_key);
1389
1390         /* IOAPIC ranges shouldn't be accessed by DMA */
1391         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1392                 IOVA_PFN(IOAPIC_RANGE_END));
1393         if (!iova) {
1394                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1395                 return -ENODEV;
1396         }
1397
1398         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1399         for_each_pci_dev(pdev) {
1400                 struct resource *r;
1401
1402                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1403                         r = &pdev->resource[i];
1404                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1405                                 continue;
1406                         iova = reserve_iova(&reserved_iova_list,
1407                                             IOVA_PFN(r->start),
1408                                             IOVA_PFN(r->end));
1409                         if (!iova) {
1410                                 printk(KERN_ERR "Reserve iova failed\n");
1411                                 return -ENODEV;
1412                         }
1413                 }
1414         }
1415         return 0;
1416 }
1417
1418 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1419 {
1420         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1421 }
1422
1423 static inline int guestwidth_to_adjustwidth(int gaw)
1424 {
1425         int agaw;
1426         int r = (gaw - 12) % 9;
1427
1428         if (r == 0)
1429                 agaw = gaw;
1430         else
1431                 agaw = gaw + 9 - r;
1432         if (agaw > 64)
1433                 agaw = 64;
1434         return agaw;
1435 }
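/*
 * Worked example (illustrative): guestwidth_to_adjustwidth() rounds the
 * guest width up to the next value the page-table geometry can express:
 * gaw = 48 gives r = (48 - 12) % 9 = 0 and agaw = 48, while gaw = 40
 * gives r = 1 and agaw = 40 + 9 - 1 = 48.
 */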
1436
1437 static int domain_init(struct dmar_domain *domain, int guest_width)
1438 {
1439         struct intel_iommu *iommu;
1440         int adjust_width, agaw;
1441         unsigned long sagaw;
1442
1443         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1444         spin_lock_init(&domain->iommu_lock);
1445
1446         domain_reserve_special_ranges(domain);
1447
1448         /* calculate AGAW */
1449         iommu = domain_get_iommu(domain);
1450         if (guest_width > cap_mgaw(iommu->cap))
1451                 guest_width = cap_mgaw(iommu->cap);
1452         domain->gaw = guest_width;
1453         adjust_width = guestwidth_to_adjustwidth(guest_width);
1454         agaw = width_to_agaw(adjust_width);
1455         sagaw = cap_sagaw(iommu->cap);
1456         if (!test_bit(agaw, &sagaw)) {
1457                 /* hardware doesn't support it, choose a bigger one */
1458                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1459                 agaw = find_next_bit(&sagaw, 5, agaw);
1460                 if (agaw >= 5)
1461                         return -ENODEV;
1462         }
1463         domain->agaw = agaw;
1464         INIT_LIST_HEAD(&domain->devices);
1465
1466         if (ecap_coherent(iommu->ecap))
1467                 domain->iommu_coherency = 1;
1468         else
1469                 domain->iommu_coherency = 0;
1470
1471         if (ecap_sc_support(iommu->ecap))
1472                 domain->iommu_snooping = 1;
1473         else
1474                 domain->iommu_snooping = 0;
1475
1476         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1477         domain->iommu_count = 1;
1478         domain->nid = iommu->node;
1479
1480         /* always allocate the top pgd */
1481         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1482         if (!domain->pgd)
1483                 return -ENOMEM;
1484         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1485         return 0;
1486 }
1487
1488 static void domain_exit(struct dmar_domain *domain)
1489 {
1490         struct dmar_drhd_unit *drhd;
1491         struct intel_iommu *iommu;
1492
1493         /* Domain 0 is reserved, so don't process it */
1494         if (!domain)
1495                 return;
1496
1497         /* Flush any lazy unmaps that may reference this domain */
1498         if (!intel_iommu_strict)
1499                 flush_unmaps_timeout(0);
1500
1501         domain_remove_dev_info(domain);
1502         /* destroy iovas */
1503         put_iova_domain(&domain->iovad);
1504
1505         /* clear ptes */
1506         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1507
1508         /* free page tables */
1509         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1510
1511         for_each_active_iommu(iommu, drhd)
1512                 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1513                         iommu_detach_domain(domain, iommu);
1514
1515         free_domain_mem(domain);
1516 }
1517
1518 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1519                                  u8 bus, u8 devfn, int translation)
1520 {
1521         struct context_entry *context;
1522         unsigned long flags;
1523         struct intel_iommu *iommu;
1524         struct dma_pte *pgd;
1525         unsigned long num;
1526         unsigned long ndomains;
1527         int id;
1528         int agaw;
1529         struct device_domain_info *info = NULL;
1530
1531         pr_debug("Set context mapping for %02x:%02x.%d\n",
1532                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1533
1534         BUG_ON(!domain->pgd);
1535         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1536                translation != CONTEXT_TT_MULTI_LEVEL);
1537
1538         iommu = device_to_iommu(segment, bus, devfn);
1539         if (!iommu)
1540                 return -ENODEV;
1541
1542         context = device_to_context_entry(iommu, bus, devfn);
1543         if (!context)
1544                 return -ENOMEM;
1545         spin_lock_irqsave(&iommu->lock, flags);
1546         if (context_present(context)) {
1547                 spin_unlock_irqrestore(&iommu->lock, flags);
1548                 return 0;
1549         }
1550
1551         id = domain->id;
1552         pgd = domain->pgd;
1553
1554         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1555             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1556                 int found = 0;
1557
1558                 /* find an available domain id for this device in iommu */
1559                 ndomains = cap_ndoms(iommu->cap);
1560                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1561                         if (iommu->domains[num] == domain) {
1562                                 id = num;
1563                                 found = 1;
1564                                 break;
1565                         }
1566                 }
1567
1568                 if (found == 0) {
1569                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1570                         if (num >= ndomains) {
1571                                 spin_unlock_irqrestore(&iommu->lock, flags);
1572                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1573                                 return -EFAULT;
1574                         }
1575
1576                         set_bit(num, iommu->domain_ids);
1577                         iommu->domains[num] = domain;
1578                         id = num;
1579                 }
1580
1581                 /* Skip top levels of page tables for
1582                  * an iommu whose agaw is smaller than the default.
1583                  * Unnecessary for PT mode.
1584                  */
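                /* For illustration (agaw values assumed): with
                 * agaw_to_level(agaw) == agaw + 2, a domain built with
                 * agaw 2 uses a 4-level (48-bit) table while an iommu
                 * with agaw 1 walks only 3 levels (39-bit); the loop
                 * below then descends one level so the context entry
                 * points at a table of the depth this iommu can walk.
                 */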
1585                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1586                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1587                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1588                                 if (!dma_pte_present(pgd)) {
1589                                         spin_unlock_irqrestore(&iommu->lock, flags);
1590                                         return -ENOMEM;
1591                                 }
1592                         }
1593                 }
1594         }
1595
1596         context_set_domain_id(context, id);
1597
1598         if (translation != CONTEXT_TT_PASS_THROUGH) {
1599                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1600                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1601                                      CONTEXT_TT_MULTI_LEVEL;
1602         }
1603         /*
1604          * In pass-through mode, AW must be programmed to indicate the largest
1605          * AGAW value supported by hardware, and ASR is ignored by hardware.
1606          */
1607         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1608                 context_set_address_width(context, iommu->msagaw);
1609         else {
1610                 context_set_address_root(context, virt_to_phys(pgd));
1611                 context_set_address_width(context, iommu->agaw);
1612         }
1613
1614         context_set_translation_type(context, translation);
1615         context_set_fault_enable(context);
1616         context_set_present(context);
1617         domain_flush_cache(domain, context, sizeof(*context));
1618
1619         /*
1620          * It's a non-present to present mapping. If hardware doesn't cache
1621          * non-present entries, we only need to flush the write-buffer. If it
1622          * _does_ cache non-present entries, then it does so in the special
1623          * domain #0, which we have to flush:
1624          */
1625         if (cap_caching_mode(iommu->cap)) {
1626                 iommu->flush.flush_context(iommu, 0,
1627                                            (((u16)bus) << 8) | devfn,
1628                                            DMA_CCMD_MASK_NOBIT,
1629                                            DMA_CCMD_DEVICE_INVL);
1630                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1631         } else {
1632                 iommu_flush_write_buffer(iommu);
1633         }
1634         iommu_enable_dev_iotlb(info);
1635         spin_unlock_irqrestore(&iommu->lock, flags);
1636
1637         spin_lock_irqsave(&domain->iommu_lock, flags);
1638         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1639                 domain->iommu_count++;
1640                 if (domain->iommu_count == 1)
1641                         domain->nid = iommu->node;
1642                 domain_update_iommu_cap(domain);
1643         }
1644         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1645         return 0;
1646 }
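/*
 * A worked example of the caching-mode flush above, with an assumed
 * device address: for bus 0x03, devfn 00.0, the source-id passed to
 * flush_context() is (0x03 << 8) | 0x00 == 0x0300.  On a caching-mode
 * (typically emulated) IOMMU the non-present-to-present transition must
 * be invalidated explicitly; on real hardware only the write buffer
 * needs flushing.
 */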
1647
1648 static int
1649 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1650                         int translation)
1651 {
1652         int ret;
1653         struct pci_dev *tmp, *parent;
1654
1655         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1656                                          pdev->bus->number, pdev->devfn,
1657                                          translation);
1658         if (ret)
1659                 return ret;
1660
1661         /* dependent device mapping */
1662         tmp = pci_find_upstream_pcie_bridge(pdev);
1663         if (!tmp)
1664                 return 0;
1665         /* Secondary interface's bus number and devfn 0 */
1666         parent = pdev->bus->self;
1667         while (parent != tmp) {
1668                 ret = domain_context_mapping_one(domain,
1669                                                  pci_domain_nr(parent->bus),
1670                                                  parent->bus->number,
1671                                                  parent->devfn, translation);
1672                 if (ret)
1673                         return ret;
1674                 parent = parent->bus->self;
1675         }
1676         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1677                 return domain_context_mapping_one(domain,
1678                                         pci_domain_nr(tmp->subordinate),
1679                                         tmp->subordinate->number, 0,
1680                                         translation);
1681         else /* this is a legacy PCI bridge */
1682                 return domain_context_mapping_one(domain,
1683                                                   pci_domain_nr(tmp->bus),
1684                                                   tmp->bus->number,
1685                                                   tmp->devfn,
1686                                                   translation);
1687 }
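/*
 * A sketch of the dependent-device mapping above, for an assumed
 * (hypothetical) topology: a conventional PCI device 0000:05:01.0
 * behind a PCIe-to-PCI bridge at 0000:00:1e.0 gets context entries
 * programmed for the device itself, for any intermediate PCI-to-PCI
 * bridges on the path, and finally for (secondary bus 0x05, devfn 0),
 * since requests originating behind the bridge carry that source-id.
 */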
1688
1689 static int domain_context_mapped(struct pci_dev *pdev)
1690 {
1691         int ret;
1692         struct pci_dev *tmp, *parent;
1693         struct intel_iommu *iommu;
1694
1695         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1696                                 pdev->devfn);
1697         if (!iommu)
1698                 return -ENODEV;
1699
1700         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1701         if (!ret)
1702                 return ret;
1703         /* dependent device mapping */
1704         tmp = pci_find_upstream_pcie_bridge(pdev);
1705         if (!tmp)
1706                 return ret;
1707         /* Secondary interface's bus number and devfn 0 */
1708         parent = pdev->bus->self;
1709         while (parent != tmp) {
1710                 ret = device_context_mapped(iommu, parent->bus->number,
1711                                             parent->devfn);
1712                 if (!ret)
1713                         return ret;
1714                 parent = parent->bus->self;
1715         }
1716         if (pci_is_pcie(tmp))
1717                 return device_context_mapped(iommu, tmp->subordinate->number,
1718                                              0);
1719         else
1720                 return device_context_mapped(iommu, tmp->bus->number,
1721                                              tmp->devfn);
1722 }
1723
1724 /* Return a number of VT-d pages, with the range rounded up to the MM page size */
1725 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1726                                             size_t size)
1727 {
1728         host_addr &= ~PAGE_MASK;
1729         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1730 }
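/*
 * Worked example, assuming 4KiB MM and VT-d pages: a buffer starting
 * 0x100 bytes into a page with size 0x2000 needs
 * PAGE_ALIGN(0x100 + 0x2000) >> VTD_PAGE_SHIFT == 0x3000 >> 12 == 3
 * VT-d pages, covering the partially used first and last pages.
 */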
1731
1732 /* Return largest possible superpage level for a given mapping */
1733 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1734                                           unsigned long iov_pfn,
1735                                           unsigned long phy_pfn,
1736                                           unsigned long pages)
1737 {
1738         int support, level = 1;
1739         unsigned long pfnmerge;
1740
1741         support = domain->iommu_superpage;
1742
1743         /* To use a large page, the virtual *and* physical addresses
1744            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1745            of them will mean we have to use smaller pages. So just
1746            merge them and check both at once. */
1747         pfnmerge = iov_pfn | phy_pfn;
1748
1749         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1750                 pages >>= VTD_STRIDE_SHIFT;
1751                 if (!pages)
1752                         break;
1753                 pfnmerge >>= VTD_STRIDE_SHIFT;
1754                 level++;
1755                 support--;
1756         }
1757         return level;
1758 }
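/*
 * Worked example with assumed values: for iov_pfn = 0x200,
 * phy_pfn = 0x400 and pages = 1024, pfnmerge = 0x600 has its low
 * VTD_STRIDE_SHIFT (9) bits clear, so one pass of the loop above runs
 * and level 2 is returned: both addresses are 2MiB aligned and the
 * range is large enough for at least one 2MiB superpage, provided
 * domain->iommu_superpage >= 1.
 */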
1759
1760 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1761                             struct scatterlist *sg, unsigned long phys_pfn,
1762                             unsigned long nr_pages, int prot)
1763 {
1764         struct dma_pte *first_pte = NULL, *pte = NULL;
1765         phys_addr_t uninitialized_var(pteval);
1766         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1767         unsigned long sg_res;
1768         unsigned int largepage_lvl = 0;
1769         unsigned long lvl_pages = 0;
1770
1771         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1772
1773         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1774                 return -EINVAL;
1775
1776         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1777
1778         if (sg)
1779                 sg_res = 0;
1780         else {
1781                 sg_res = nr_pages + 1;
1782                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1783         }
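        /*
         * When called via domain_pfn_mapping() (sg == NULL), sg_res is
         * primed to nr_pages + 1 so it can never reach zero inside the
         * loop below; the !sg_res branch, which dereferences sg, is
         * therefore only taken on the scatterlist path.
         */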
1784
1785         while (nr_pages > 0) {
1786                 uint64_t tmp;
1787
1788                 if (!sg_res) {
1789                         sg_res = aligned_nrpages(sg->offset, sg->length);
1790                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1791                         sg->dma_length = sg->length;
1792                         pteval = page_to_phys(sg_page(sg)) | prot;
1793                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1794                 }
1795
1796                 if (!pte) {
1797                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1798
1799                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1800                         if (!pte)
1801                                 return -ENOMEM;
1802                         /* It is a large page */
1803                         if (largepage_lvl > 1)
1804                                 pteval |= DMA_PTE_LARGE_PAGE;
1805                         else
1806                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1807
1808                 }
1809                 /* We don't need a lock here; nobody else
1810                  * touches this iova range.
1811                  */
1812                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1813                 if (tmp) {
1814                         static int dumps = 5;
1815                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1816                                iov_pfn, tmp, (unsigned long long)pteval);
1817                         if (dumps) {
1818                                 dumps--;
1819                                 debug_dma_dump_mappings(NULL);
1820                         }
1821                         WARN_ON(1);
1822                 }
1823
1824                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1825
1826                 BUG_ON(nr_pages < lvl_pages);
1827                 BUG_ON(sg_res < lvl_pages);
1828
1829                 nr_pages -= lvl_pages;
1830                 iov_pfn += lvl_pages;
1831                 phys_pfn += lvl_pages;
1832                 pteval += lvl_pages * VTD_PAGE_SIZE;
1833                 sg_res -= lvl_pages;
1834
1835                 /* If the next PTE would be the first in a new page, then we
1836                    need to flush the cache on the entries we've just written.
1837                    And then we'll need to recalculate 'pte', so clear it and
1838                    let it get set again in the if (!pte) block above.
1839
1840                    If we're done (!nr_pages) we need to flush the cache too.
1841
1842                    Also if we've been setting superpages, we may need to
1843                    recalculate 'pte' and switch back to smaller pages for the
1844                    end of the mapping, if the trailing size is not enough to
1845                    use another superpage (i.e. sg_res < lvl_pages). */
1846                 pte++;
1847                 if (!nr_pages || first_pte_in_page(pte) ||
1848                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1849                         domain_flush_cache(domain, first_pte,
1850                                            (void *)pte - (void *)first_pte);
1851                         pte = NULL;
1852                 }
1853
1854                 if (!sg_res && nr_pages)
1855                         sg = sg_next(sg);
1856         }
1857         return 0;
1858 }
1859
1860 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1861                                     struct scatterlist *sg, unsigned long nr_pages,
1862                                     int prot)
1863 {
1864         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1865 }
1866
1867 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1868                                      unsigned long phys_pfn, unsigned long nr_pages,
1869                                      int prot)
1870 {
1871         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1872 }
1873
1874 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1875 {
1876         if (!iommu)
1877                 return;
1878
1879         clear_context_table(iommu, bus, devfn);
1880         iommu->flush.flush_context(iommu, 0, 0, 0,
1881                                            DMA_CCMD_GLOBAL_INVL);
1882         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1883 }
1884
1885 static void domain_remove_dev_info(struct dmar_domain *domain)
1886 {
1887         struct device_domain_info *info;
1888         unsigned long flags;
1889         struct intel_iommu *iommu;
1890
1891         spin_lock_irqsave(&device_domain_lock, flags);
1892         while (!list_empty(&domain->devices)) {
1893                 info = list_entry(domain->devices.next,
1894                         struct device_domain_info, link);
1895                 list_del(&info->link);
1896                 list_del(&info->global);
1897                 if (info->dev)
1898                         info->dev->dev.archdata.iommu = NULL;
1899                 spin_unlock_irqrestore(&device_domain_lock, flags);
1900
1901                 iommu_disable_dev_iotlb(info);
1902                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1903                 iommu_detach_dev(iommu, info->bus, info->devfn);
1904                 free_devinfo_mem(info);
1905
1906                 spin_lock_irqsave(&device_domain_lock, flags);
1907         }
1908         spin_unlock_irqrestore(&device_domain_lock, flags);
1909 }
1910
1911 /*
1912  * find_domain
1913  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1914  */
1915 static struct dmar_domain *
1916 find_domain(struct pci_dev *pdev)
1917 {
1918         struct device_domain_info *info;
1919
1920         /* No lock here, assumes no domain exit in normal case */
1921         info = pdev->dev.archdata.iommu;
1922         if (info)
1923                 return info->domain;
1924         return NULL;
1925 }
1926
1927 /* domain is initialized */
1928 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1929 {
1930         struct dmar_domain *domain, *found = NULL;
1931         struct intel_iommu *iommu;
1932         struct dmar_drhd_unit *drhd;
1933         struct device_domain_info *info, *tmp;
1934         struct pci_dev *dev_tmp;
1935         unsigned long flags;
1936         int bus = 0, devfn = 0;
1937         int segment;
1938         int ret;
1939
1940         domain = find_domain(pdev);
1941         if (domain)
1942                 return domain;
1943
1944         segment = pci_domain_nr(pdev->bus);
1945
1946         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1947         if (dev_tmp) {
1948                 if (pci_is_pcie(dev_tmp)) {
1949                         bus = dev_tmp->subordinate->number;
1950                         devfn = 0;
1951                 } else {
1952                         bus = dev_tmp->bus->number;
1953                         devfn = dev_tmp->devfn;
1954                 }
1955                 spin_lock_irqsave(&device_domain_lock, flags);
1956                 list_for_each_entry(info, &device_domain_list, global) {
1957                         if (info->segment == segment &&
1958                             info->bus == bus && info->devfn == devfn) {
1959                                 found = info->domain;
1960                                 break;
1961                         }
1962                 }
1963                 spin_unlock_irqrestore(&device_domain_lock, flags);
1964                 /* pcie-to-pci bridge already has a domain, use it */
1965                 if (found) {
1966                         domain = found;
1967                         goto found_domain;
1968                 }
1969         }
1970
1971         domain = alloc_domain();
1972         if (!domain)
1973                 goto error;
1974
1975         /* Allocate new domain for the device */
1976         drhd = dmar_find_matched_drhd_unit(pdev);
1977         if (!drhd) {
1978                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1979                         pci_name(pdev));
1980                 return NULL;
1981         }
1982         iommu = drhd->iommu;
1983
1984         ret = iommu_attach_domain(domain, iommu);
1985         if (ret) {
1986                 free_domain_mem(domain);
1987                 goto error;
1988         }
1989
1990         if (domain_init(domain, gaw)) {
1991                 domain_exit(domain);
1992                 goto error;
1993         }
1994
1995         /* register pcie-to-pci device */
1996         if (dev_tmp) {
1997                 info = alloc_devinfo_mem();
1998                 if (!info) {
1999                         domain_exit(domain);
2000                         goto error;
2001                 }
2002                 info->segment = segment;
2003                 info->bus = bus;
2004                 info->devfn = devfn;
2005                 info->dev = NULL;
2006                 info->domain = domain;
2007                 /* This domain is shared by devices under p2p bridge */
2008                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2009
2010                 /* pcie-to-pci bridge already has a domain, use it */
2011                 found = NULL;
2012                 spin_lock_irqsave(&device_domain_lock, flags);
2013                 list_for_each_entry(tmp, &device_domain_list, global) {
2014                         if (tmp->segment == segment &&
2015                             tmp->bus == bus && tmp->devfn == devfn) {
2016                                 found = tmp->domain;
2017                                 break;
2018                         }
2019                 }
2020                 if (found) {
2021                         spin_unlock_irqrestore(&device_domain_lock, flags);
2022                         free_devinfo_mem(info);
2023                         domain_exit(domain);
2024                         domain = found;
2025                 } else {
2026                         list_add(&info->link, &domain->devices);
2027                         list_add(&info->global, &device_domain_list);
2028                         spin_unlock_irqrestore(&device_domain_lock, flags);
2029                 }
2030         }
2031
2032 found_domain:
2033         info = alloc_devinfo_mem();
2034         if (!info)
2035                 goto error;
2036         info->segment = segment;
2037         info->bus = pdev->bus->number;
2038         info->devfn = pdev->devfn;
2039         info->dev = pdev;
2040         info->domain = domain;
2041         spin_lock_irqsave(&device_domain_lock, flags);
2042         /* somebody else beat us to it */
2043         found = find_domain(pdev);
2044         if (found != NULL) {
2045                 spin_unlock_irqrestore(&device_domain_lock, flags);
2046                 if (found != domain) {
2047                         domain_exit(domain);
2048                         domain = found;
2049                 }
2050                 free_devinfo_mem(info);
2051                 return domain;
2052         }
2053         list_add(&info->link, &domain->devices);
2054         list_add(&info->global, &device_domain_list);
2055         pdev->dev.archdata.iommu = info;
2056         spin_unlock_irqrestore(&device_domain_lock, flags);
2057         return domain;
2058 error:
2059         /* recheck it here; somebody else may have set it meanwhile */
2060         return find_domain(pdev);
2061 }
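/*
 * Summary of the lookup order above: (1) a domain already cached in
 * pdev->dev.archdata.iommu wins; (2) otherwise, if the device sits
 * behind a PCIe-to-PCI bridge whose (segment, bus, devfn) already has a
 * registered domain, that shared domain is reused; (3) only then is a
 * fresh domain allocated, attached to the matching DRHD's iommu and
 * initialised.
 */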
2062
2063 static int iommu_identity_mapping;
2064 #define IDENTMAP_ALL            1
2065 #define IDENTMAP_GFX            2
2066 #define IDENTMAP_AZALIA         4
2067
2068 static int iommu_domain_identity_map(struct dmar_domain *domain,
2069                                      unsigned long long start,
2070                                      unsigned long long end)
2071 {
2072         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2073         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2074
2075         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2076                           dma_to_mm_pfn(last_vpfn))) {
2077                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2078                 return -ENOMEM;
2079         }
2080
2081         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2082                  start, end, domain->id);
2083         /*
2084          * RMRR range might have overlap with physical memory range,
2085          * clear it first
2086          */
2087         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2088
2089         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2090                                   last_vpfn - first_vpfn + 1,
2091                                   DMA_PTE_READ|DMA_PTE_WRITE);
2092 }
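/*
 * Worked example, assuming 4KiB VT-d pages: for the ISA/LPC unity map
 * set up further below, start = 0 and end = 16MiB - 1 give
 * first_vpfn = 0 and last_vpfn = 0xfff, so 4096 iova pages are reserved
 * and mapped 1:1 (vpfn == pfn) with read/write permission.
 */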
2093
2094 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2095                                       unsigned long long start,
2096                                       unsigned long long end)
2097 {
2098         struct dmar_domain *domain;
2099         int ret;
2100
2101         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2102         if (!domain)
2103                 return -ENOMEM;
2104
2105         /* For _hardware_ passthrough, don't bother. But for software
2106            passthrough, we do it anyway -- it may indicate a memory
2107            range which is reserved in E820 and so didn't get set
2108            up to start with in si_domain */
2109         if (domain == si_domain && hw_pass_through) {
2110                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2111                        pci_name(pdev), start, end);
2112                 return 0;
2113         }
2114
2115         printk(KERN_INFO
2116                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2117                pci_name(pdev), start, end);
2118
2119         if (end < start) {
2120                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2121                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2122                         dmi_get_system_info(DMI_BIOS_VENDOR),
2123                         dmi_get_system_info(DMI_BIOS_VERSION),
2124                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2125                 ret = -EIO;
2126                 goto error;
2127         }
2128
2129         if (end >> agaw_to_width(domain->agaw)) {
2130                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2131                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2132                      agaw_to_width(domain->agaw),
2133                      dmi_get_system_info(DMI_BIOS_VENDOR),
2134                      dmi_get_system_info(DMI_BIOS_VERSION),
2135                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2136                 ret = -EIO;
2137                 goto error;
2138         }
2139
2140         ret = iommu_domain_identity_map(domain, start, end);
2141         if (ret)
2142                 goto error;
2143
2144         /* context entry init */
2145         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2146         if (ret)
2147                 goto error;
2148
2149         return 0;
2150
2151  error:
2152         domain_exit(domain);
2153         return ret;
2154 }
2155
2156 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2157         struct pci_dev *pdev)
2158 {
2159         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2160                 return 0;
2161         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2162                 rmrr->end_address);
2163 }
2164
2165 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2166 static inline void iommu_prepare_isa(void)
2167 {
2168         struct pci_dev *pdev;
2169         int ret;
2170
2171         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2172         if (!pdev)
2173                 return;
2174
2175         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2176         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2177
2178         if (ret)
2179                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2180                        "floppy might not work\n");
2181
2182 }
2183 #else
2184 static inline void iommu_prepare_isa(void)
2185 {
2186         return;
2187 }
2188 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2189
2190 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2191
2192 static int __init si_domain_init(int hw)
2193 {
2194         struct dmar_drhd_unit *drhd;
2195         struct intel_iommu *iommu;
2196         int nid, ret = 0;
2197
2198         si_domain = alloc_domain();
2199         if (!si_domain)
2200                 return -EFAULT;
2201
2202         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2203
2204         for_each_active_iommu(iommu, drhd) {
2205                 ret = iommu_attach_domain(si_domain, iommu);
2206                 if (ret) {
2207                         domain_exit(si_domain);
2208                         return -EFAULT;
2209                 }
2210         }
2211
2212         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2213                 domain_exit(si_domain);
2214                 return -EFAULT;
2215         }
2216
2217         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2218
2219         if (hw)
2220                 return 0;
2221
2222         for_each_online_node(nid) {
2223                 unsigned long start_pfn, end_pfn;
2224                 int i;
2225
2226                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2227                         ret = iommu_domain_identity_map(si_domain,
2228                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2229                         if (ret)
2230                                 return ret;
2231                 }
2232         }
2233
2234         return 0;
2235 }
2236
2237 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2238                                           struct pci_dev *pdev);
2239 static int identity_mapping(struct pci_dev *pdev)
2240 {
2241         struct device_domain_info *info;
2242
2243         if (likely(!iommu_identity_mapping))
2244                 return 0;
2245
2246         info = pdev->dev.archdata.iommu;
2247         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2248                 return (info->domain == si_domain);
2249
2250         return 0;
2251 }
2252
2253 static int domain_add_dev_info(struct dmar_domain *domain,
2254                                struct pci_dev *pdev,
2255                                int translation)
2256 {
2257         struct device_domain_info *info;
2258         unsigned long flags;
2259         int ret;
2260
2261         info = alloc_devinfo_mem();
2262         if (!info)
2263                 return -ENOMEM;
2264
2265         ret = domain_context_mapping(domain, pdev, translation);
2266         if (ret) {
2267                 free_devinfo_mem(info);
2268                 return ret;
2269         }
2270
2271         info->segment = pci_domain_nr(pdev->bus);
2272         info->bus = pdev->bus->number;
2273         info->devfn = pdev->devfn;
2274         info->dev = pdev;
2275         info->domain = domain;
2276
2277         spin_lock_irqsave(&device_domain_lock, flags);
2278         list_add(&info->link, &domain->devices);
2279         list_add(&info->global, &device_domain_list);
2280         pdev->dev.archdata.iommu = info;
2281         spin_unlock_irqrestore(&device_domain_lock, flags);
2282
2283         return 0;
2284 }
2285
2286 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2287 {
2288         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2289                 return 1;
2290
2291         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2292                 return 1;
2293
2294         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2295                 return 0;
2296
2297         /*
2298          * We want to start off with all devices in the 1:1 domain, and
2299          * take them out later if we find they can't access all of memory.
2300          *
2301          * However, we can't do this for PCI devices behind bridges,
2302          * because all PCI devices behind the same bridge will end up
2303          * with the same source-id on their transactions.
2304          *
2305          * Practically speaking, we can't change things around for these
2306          * devices at run-time, because we can't be sure there'll be no
2307          * DMA transactions in flight for any of their siblings.
2308          * 
2309          * So PCI devices (unless they're on the root bus) as well as
2310          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2311          * the 1:1 domain, just in _case_ one of their siblings turns out
2312          * not to be able to map all of memory.
2313          */
2314         if (!pci_is_pcie(pdev)) {
2315                 if (!pci_is_root_bus(pdev->bus))
2316                         return 0;
2317                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2318                         return 0;
2319         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2320                 return 0;
2321
2322         /* 
2323          * At boot time, we don't yet know if devices will be 64-bit capable.
2324          * Assume that they will -- if they turn out not to be, then we can 
2325          * take them out of the 1:1 domain later.
2326          */
2327         if (!startup) {
2328                 /*
2329                  * If the device's dma_mask is less than the system's memory
2330                  * size then this is not a candidate for identity mapping.
2331                  */
2332                 u64 dma_mask = pdev->dma_mask;
2333
2334                 if (pdev->dev.coherent_dma_mask &&
2335                     pdev->dev.coherent_dma_mask < dma_mask)
2336                         dma_mask = pdev->dev.coherent_dma_mask;
2337
2338                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2339         }
2340
2341         return 1;
2342 }
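/*
 * An illustrative (assumed) scenario for the check above: a device that
 * only sets a 32-bit dma_mask on a machine with more than 4GiB of RAM
 * passes the startup check but fails the dma_get_required_mask()
 * comparison at run time, so iommu_no_mapping() will later pull it out
 * of the 1:1 si_domain and give it a normal remapped domain.
 */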
2343
2344 static int __init iommu_prepare_static_identity_mapping(int hw)
2345 {
2346         struct pci_dev *pdev = NULL;
2347         int ret;
2348
2349         ret = si_domain_init(hw);
2350         if (ret)
2351                 return -EFAULT;
2352
2353         for_each_pci_dev(pdev) {
2354                 /* Skip Host/PCI Bridge devices */
2355                 if (IS_BRIDGE_HOST_DEVICE(pdev))
2356                         continue;
2357                 if (iommu_should_identity_map(pdev, 1)) {
2358                         printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2359                                hw ? "hardware" : "software", pci_name(pdev));
2360
2361                         ret = domain_add_dev_info(si_domain, pdev,
2362                                                      hw ? CONTEXT_TT_PASS_THROUGH :
2363                                                      CONTEXT_TT_MULTI_LEVEL);
2364                         if (ret)
2365                                 return ret;
2366                 }
2367         }
2368
2369         return 0;
2370 }
2371
2372 static int __init init_dmars(void)
2373 {
2374         struct dmar_drhd_unit *drhd;
2375         struct dmar_rmrr_unit *rmrr;
2376         struct pci_dev *pdev;
2377         struct intel_iommu *iommu;
2378         int i, ret;
2379
2380         /*
2381          * for each drhd
2382          *    allocate root
2383          *    initialize and program root entry to not present
2384          * endfor
2385          */
2386         for_each_drhd_unit(drhd) {
2387                 g_num_of_iommus++;
2388                 /*
2389                  * lock not needed as this is only incremented in the
2390                  * single-threaded kernel __init code path; all other
2391                  * accesses are read-only
2392                  */
2393         }
2394
2395         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2396                         GFP_KERNEL);
2397         if (!g_iommus) {
2398                 printk(KERN_ERR "Allocating global iommu array failed\n");
2399                 ret = -ENOMEM;
2400                 goto error;
2401         }
2402
2403         deferred_flush = kzalloc(g_num_of_iommus *
2404                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2405         if (!deferred_flush) {
2406                 ret = -ENOMEM;
2407                 goto error;
2408         }
2409
2410         for_each_drhd_unit(drhd) {
2411                 if (drhd->ignored)
2412                         continue;
2413
2414                 iommu = drhd->iommu;
2415                 g_iommus[iommu->seq_id] = iommu;
2416
2417                 ret = iommu_init_domains(iommu);
2418                 if (ret)
2419                         goto error;
2420
2421                 /*
2422                  * TBD:
2423                  * we could share the same root & context tables
2424                  * among all IOMMUs; need to split it later.
2425                  */
2426                 ret = iommu_alloc_root_entry(iommu);
2427                 if (ret) {
2428                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2429                         goto error;
2430                 }
2431                 if (!ecap_pass_through(iommu->ecap))
2432                         hw_pass_through = 0;
2433         }
2434
2435         /*
2436          * Start from a sane iommu hardware state.
2437          */
2438         for_each_drhd_unit(drhd) {
2439                 if (drhd->ignored)
2440                         continue;
2441
2442                 iommu = drhd->iommu;
2443
2444                 /*
2445                  * If the queued invalidation is already initialized by us
2446                  * (for example, while enabling interrupt-remapping) then
2447                  * we already have things rolling from a sane state.
2448                  */
2449                 if (iommu->qi)
2450                         continue;
2451
2452                 /*
2453                  * Clear any previous faults.
2454                  */
2455                 dmar_fault(-1, iommu);
2456                 /*
2457                  * Disable queued invalidation if supported and already enabled
2458                  * before OS handover.
2459                  */
2460                 dmar_disable_qi(iommu);
2461         }
2462
2463         for_each_drhd_unit(drhd) {
2464                 if (drhd->ignored)
2465                         continue;
2466
2467                 iommu = drhd->iommu;
2468
2469                 if (dmar_enable_qi(iommu)) {
2470                         /*
2471                          * Queued Invalidate not enabled, use Register Based
2472                          * Invalidate
2473                          */
2474                         iommu->flush.flush_context = __iommu_flush_context;
2475                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2476                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2477                                "invalidation\n",
2478                                 iommu->seq_id,
2479                                (unsigned long long)drhd->reg_base_addr);
2480                 } else {
2481                         iommu->flush.flush_context = qi_flush_context;
2482                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2483                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2484                                "invalidation\n",
2485                                 iommu->seq_id,
2486                                (unsigned long long)drhd->reg_base_addr);
2487                 }
2488         }
2489
2490         if (iommu_pass_through)
2491                 iommu_identity_mapping |= IDENTMAP_ALL;
2492
2493 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2494         iommu_identity_mapping |= IDENTMAP_GFX;
2495 #endif
2496
2497         check_tylersburg_isoch();
2498
2499         /*
2500          * If any identity mapping was requested (one of the IDENTMAP_* flags
2501          * is set in iommu_identity_mapping), set up the static identity (si)
2502          * domain and map the qualifying devices into it, using hardware
2503          */
2504         if (iommu_identity_mapping) {
2505                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2506                 if (ret) {
2507                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2508                         goto error;
2509                 }
2510         }
2511         /*
2512          * For each rmrr
2513          *   for each dev attached to rmrr
2514          *   do
2515          *     locate drhd for dev, alloc domain for dev
2516          *     allocate free domain
2517          *     allocate page table entries for rmrr
2518          *     if context not allocated for bus
2519          *           allocate and init context
2520          *           set present in root table for this bus
2521          *     init context with domain, translation etc
2522          *    endfor
2523          * endfor
2524          */
2525         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2526         for_each_rmrr_units(rmrr) {
2527                 for (i = 0; i < rmrr->devices_cnt; i++) {
2528                         pdev = rmrr->devices[i];
2529                         /*
2530                          * some BIOSes list non-existent devices in the
2531                          * DMAR table.
2532                          */
2533                         if (!pdev)
2534                                 continue;
2535                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2536                         if (ret)
2537                                 printk(KERN_ERR
2538                                        "IOMMU: mapping reserved region failed\n");
2539                 }
2540         }
2541
2542         iommu_prepare_isa();
2543
2544         /*
2545          * for each drhd
2546          *   enable fault log
2547          *   global invalidate context cache
2548          *   global invalidate iotlb
2549          *   enable translation
2550          */
2551         for_each_drhd_unit(drhd) {
2552                 if (drhd->ignored) {
2553                         /*
2554                          * we always have to disable PMRs or DMA may fail on
2555                          * this device
2556                          */
2557                         if (force_on)
2558                                 iommu_disable_protect_mem_regions(drhd->iommu);
2559                         continue;
2560                 }
2561                 iommu = drhd->iommu;
2562
2563                 iommu_flush_write_buffer(iommu);
2564
2565                 ret = dmar_set_interrupt(iommu);
2566                 if (ret)
2567                         goto error;
2568
2569                 iommu_set_root_entry(iommu);
2570
2571                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2572                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2573
2574                 ret = iommu_enable_translation(iommu);
2575                 if (ret)
2576                         goto error;
2577
2578                 iommu_disable_protect_mem_regions(iommu);
2579         }
2580
2581         return 0;
2582 error:
2583         for_each_drhd_unit(drhd) {
2584                 if (drhd->ignored)
2585                         continue;
2586                 iommu = drhd->iommu;
2587                 free_iommu(iommu);
2588         }
2589         kfree(g_iommus);
2590         return ret;
2591 }
2592
2593 /* This takes a number of _MM_ pages, not VTD pages */
2594 static struct iova *intel_alloc_iova(struct device *dev,
2595                                      struct dmar_domain *domain,
2596                                      unsigned long nrpages, uint64_t dma_mask)
2597 {
2598         struct pci_dev *pdev = to_pci_dev(dev);
2599         struct iova *iova = NULL;
2600
2601         /* Restrict dma_mask to the width that the iommu can handle */
2602         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2603
2604         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2605                 /*
2606                  * First try to allocate an IO virtual address below 4GiB
2607                  * (DMA_BIT_MASK(32)); if that fails, try allocating from
2608                  * the higher range.
2609                  */
2610                 iova = alloc_iova(&domain->iovad, nrpages,
2611                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2612                 if (iova)
2613                         return iova;
2614         }
2615         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2616         if (unlikely(!iova)) {
2617                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2618                        nrpages, pci_name(pdev));
2619                 return NULL;
2620         }
2621
2622         return iova;
2623 }
2624
2625 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2626 {
2627         struct dmar_domain *domain;
2628         int ret;
2629
2630         domain = get_domain_for_dev(pdev,
2631                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2632         if (!domain) {
2633                 printk(KERN_ERR
2634                         "Allocating domain for %s failed\n", pci_name(pdev));
2635                 return NULL;
2636         }
2637
2638         /* make sure context mapping is ok */
2639         if (unlikely(!domain_context_mapped(pdev))) {
2640                 ret = domain_context_mapping(domain, pdev,
2641                                              CONTEXT_TT_MULTI_LEVEL);
2642                 if (ret) {
2643                         printk(KERN_ERR
2644                                 "Domain context map for %s failed\n",
2645                                 pci_name(pdev));
2646                         return NULL;
2647                 }
2648         }
2649
2650         return domain;
2651 }
2652
2653 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2654 {
2655         struct device_domain_info *info;
2656
2657         /* No lock here, assumes no domain exit in normal case */
2658         info = dev->dev.archdata.iommu;
2659         if (likely(info))
2660                 return info->domain;
2661
2662         return __get_valid_domain_for_dev(dev);
2663 }
2664
2665 static int iommu_dummy(struct pci_dev *pdev)
2666 {
2667         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2668 }
2669
2670 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2671 static int iommu_no_mapping(struct device *dev)
2672 {
2673         struct pci_dev *pdev;
2674         int found;
2675
2676         if (unlikely(dev->bus != &pci_bus_type))
2677                 return 1;
2678
2679         pdev = to_pci_dev(dev);
2680         if (iommu_dummy(pdev))
2681                 return 1;
2682
2683         if (!iommu_identity_mapping)
2684                 return 0;
2685
2686         found = identity_mapping(pdev);
2687         if (found) {
2688                 if (iommu_should_identity_map(pdev, 0))
2689                         return 1;
2690                 else {
2691                         /*
2692                          * A 32-bit DMA device is removed from si_domain and
2693                          * falls back to non-identity mapping.
2694                          */
2695                         domain_remove_one_dev_info(si_domain, pdev);
2696                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2697                                pci_name(pdev));
2698                         return 0;
2699                 }
2700         } else {
2701                 /*
2702                  * In case a 64-bit DMA device was detached from a VM, the
2703                  * device is put back into si_domain for identity mapping.
2704                  */
2705                 if (iommu_should_identity_map(pdev, 0)) {
2706                         int ret;
2707                         ret = domain_add_dev_info(si_domain, pdev,
2708                                                   hw_pass_through ?
2709                                                   CONTEXT_TT_PASS_THROUGH :
2710                                                   CONTEXT_TT_MULTI_LEVEL);
2711                         if (!ret) {
2712                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2713                                        pci_name(pdev));
2714                                 return 1;
2715                         }
2716                 }
2717         }
2718
2719         return 0;
2720 }
2721
2722 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2723                                      size_t size, int dir, u64 dma_mask)
2724 {
2725         struct pci_dev *pdev = to_pci_dev(hwdev);
2726         struct dmar_domain *domain;
2727         phys_addr_t start_paddr;
2728         struct iova *iova;
2729         int prot = 0;
2730         int ret;
2731         struct intel_iommu *iommu;
2732         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2733
2734         BUG_ON(dir == DMA_NONE);
2735
2736         if (iommu_no_mapping(hwdev))
2737                 return paddr;
2738
2739         domain = get_valid_domain_for_dev(pdev);
2740         if (!domain)
2741                 return 0;
2742
2743         iommu = domain_get_iommu(domain);
2744         size = aligned_nrpages(paddr, size);
2745
2746         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2747         if (!iova)
2748                 goto error;
2749
2750         /*
2751          * Check if DMAR supports zero-length reads on write-only
2752          * mappings.
2753          */
2754         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2755                         !cap_zlr(iommu->cap))
2756                 prot |= DMA_PTE_READ;
2757         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2758                 prot |= DMA_PTE_WRITE;
2759         /*
2760          * The range paddr ... paddr + size might cover a partial page; we
2761          * should map the whole page.  Note: if two parts of one page are
2762          * mapped separately, we might have two guest addresses mapping to
2763          * the same host paddr, but this is not a big problem.
2764          */
2765         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2766                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2767         if (ret)
2768                 goto error;
2769
2770         /* It's a non-present to present mapping; only flush if in caching mode */
2771         if (cap_caching_mode(iommu->cap))
2772                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2773         else
2774                 iommu_flush_write_buffer(iommu);
2775
2776         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2777         start_paddr += paddr & ~PAGE_MASK;
2778         return start_paddr;
2779
2780 error:
2781         if (iova)
2782                 __free_iova(&domain->iovad, iova);
2783         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2784                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2785         return 0;
2786 }
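/*
 * Worked example with assumed values (4KiB pages): if the allocator
 * returns an iova with pfn_lo == 0xffffe and the caller passed
 * paddr == 0x12345678, the returned handle is
 * (0xffffe << PAGE_SHIFT) + 0x678 == 0xffffe678, i.e. the page offset
 * of the original buffer is preserved inside the remapped region.
 */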
2787
2788 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2789                                  unsigned long offset, size_t size,
2790                                  enum dma_data_direction dir,
2791                                  struct dma_attrs *attrs)
2792 {
2793         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2794                                   dir, to_pci_dev(dev)->dma_mask);
2795 }
2796
2797 static void flush_unmaps(void)
2798 {
2799         int i, j;
2800
2801         timer_on = 0;
2802
2803         /* just flush them all */
2804         for (i = 0; i < g_num_of_iommus; i++) {
2805                 struct intel_iommu *iommu = g_iommus[i];
2806                 if (!iommu)
2807                         continue;
2808
2809                 if (!deferred_flush[i].next)
2810                         continue;
2811
2812                 /* In caching mode, global flushes make emulation expensive */
2813                 if (!cap_caching_mode(iommu->cap))
2814                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2815                                          DMA_TLB_GLOBAL_FLUSH);
2816                 for (j = 0; j < deferred_flush[i].next; j++) {
2817                         unsigned long mask;
2818                         struct iova *iova = deferred_flush[i].iova[j];
2819                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2820
2821                         /* On real hardware multiple invalidations are expensive */
2822                         if (cap_caching_mode(iommu->cap))
2823                                 iommu_flush_iotlb_psi(iommu, domain->id,
2824                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2825                         else {
2826                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2827                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2828                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2829                         }
2830                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2831                 }
2832                 deferred_flush[i].next = 0;
2833         }
2834
2835         list_size = 0;
2836 }
2837
2838 static void flush_unmaps_timeout(unsigned long data)
2839 {
2840         unsigned long flags;
2841
2842         spin_lock_irqsave(&async_umap_flush_lock, flags);
2843         flush_unmaps();
2844         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2845 }
2846
2847 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2848 {
2849         unsigned long flags;
2850         int next, iommu_id;
2851         struct intel_iommu *iommu;
2852
2853         spin_lock_irqsave(&async_umap_flush_lock, flags);
2854         if (list_size == HIGH_WATER_MARK)
2855                 flush_unmaps();
2856
2857         iommu = domain_get_iommu(dom);
2858         iommu_id = iommu->seq_id;
2859
2860         next = deferred_flush[iommu_id].next;
2861         deferred_flush[iommu_id].domain[next] = dom;
2862         deferred_flush[iommu_id].iova[next] = iova;
2863         deferred_flush[iommu_id].next++;
2864
2865         if (!timer_on) {
2866                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2867                 timer_on = 1;
2868         }
2869         list_size++;
2870         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2871 }
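/*
 * Note on the deferred-unmap path above: iova frees are batched
 * per-IOMMU and flushed either once HIGH_WATER_MARK entries have
 * accumulated or roughly 10ms after the first deferred unmap, whichever
 * comes first; intel_iommu_strict bypasses this batching entirely.
 */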
2872
2873 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2874                              size_t size, enum dma_data_direction dir,
2875                              struct dma_attrs *attrs)
2876 {
2877         struct pci_dev *pdev = to_pci_dev(dev);
2878         struct dmar_domain *domain;
2879         unsigned long start_pfn, last_pfn;
2880         struct iova *iova;
2881         struct intel_iommu *iommu;
2882
2883         if (iommu_no_mapping(dev))
2884                 return;
2885
2886         domain = find_domain(pdev);
2887         BUG_ON(!domain);
2888
2889         iommu = domain_get_iommu(domain);
2890
2891         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2892         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2893                       (unsigned long long)dev_addr))
2894                 return;
2895
2896         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2897         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2898
2899         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2900                  pci_name(pdev), start_pfn, last_pfn);
2901
2902         /* clear the whole mapped range */
2903         dma_pte_clear_range(domain, start_pfn, last_pfn);
2904
2905         /* free page tables */
2906         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2907
2908         if (intel_iommu_strict) {
2909                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2910                                       last_pfn - start_pfn + 1, 0);
2911                 /* free iova */
2912                 __free_iova(&domain->iovad, iova);
2913         } else {
2914                 add_unmap(domain, iova);
2915                 /*
2916                  * queue up the release of the unmap to save the 1/6th of the
2917                  * cpu used up by the iotlb flush operation...
2918                  */
2919         }
2920 }
2921
2922 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2923                                   dma_addr_t *dma_handle, gfp_t flags)
2924 {
2925         void *vaddr;
2926         int order;
2927
2928         size = PAGE_ALIGN(size);
2929         order = get_order(size);
2930
2931         if (!iommu_no_mapping(hwdev))
2932                 flags &= ~(GFP_DMA | GFP_DMA32);
2933         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2934                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2935                         flags |= GFP_DMA;
2936                 else
2937                         flags |= GFP_DMA32;
2938         }
2939
2940         vaddr = (void *)__get_free_pages(flags, order);
2941         if (!vaddr)
2942                 return NULL;
2943         memset(vaddr, 0, size);
2944
2945         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2946                                          DMA_BIDIRECTIONAL,
2947                                          hwdev->coherent_dma_mask);
2948         if (*dma_handle)
2949                 return vaddr;
2950         free_pages((unsigned long)vaddr, order);
2951         return NULL;
2952 }
2953
2954 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2955                                 dma_addr_t dma_handle)
2956 {
2957         int order;
2958
2959         size = PAGE_ALIGN(size);
2960         order = get_order(size);
2961
2962         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2963         free_pages((unsigned long)vaddr, order);
2964 }
2965
2966 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2967                            int nelems, enum dma_data_direction dir,
2968                            struct dma_attrs *attrs)
2969 {
2970         struct pci_dev *pdev = to_pci_dev(hwdev);
2971         struct dmar_domain *domain;
2972         unsigned long start_pfn, last_pfn;
2973         struct iova *iova;
2974         struct intel_iommu *iommu;
2975
2976         if (iommu_no_mapping(hwdev))
2977                 return;
2978
2979         domain = find_domain(pdev);
2980         BUG_ON(!domain);
2981
2982         iommu = domain_get_iommu(domain);
2983
2984         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2985         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2986                       (unsigned long long)sglist[0].dma_address))
2987                 return;
2988
2989         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2990         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2991
2992         /* clear the PTEs for this range */
2993         dma_pte_clear_range(domain, start_pfn, last_pfn);
2994
2995         /* free page tables */
2996         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2997
2998         if (intel_iommu_strict) {
2999                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3000                                       last_pfn - start_pfn + 1, 0);
3001                 /* free iova */
3002                 __free_iova(&domain->iovad, iova);
3003         } else {
3004                 add_unmap(domain, iova);
3005                 /*
3006                  * queue up the release of the unmap to save the roughly 1/6th
3007                  * of the CPU time otherwise spent on the iotlb flush operation...
3008                  */
3009         }
3010 }
3011
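/*
 * Passthrough scatterlist "mapping" for devices that bypass the IOMMU:
 * each segment's DMA address is simply its physical address.
 */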
3012 static int intel_nontranslate_map_sg(struct device *hwdev,
3013         struct scatterlist *sglist, int nelems, int dir)
3014 {
3015         int i;
3016         struct scatterlist *sg;
3017
3018         for_each_sg(sglist, sg, nelems, i) {
3019                 BUG_ON(!sg_page(sg));
3020                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3021                 sg->dma_length = sg->length;
3022         }
3023         return nelems;
3024 }
3025
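/*
 * DMA API .map_sg: allocate a single IOVA range large enough for the whole
 * scatterlist, map every segment into it with domain_sg_mapping(), then
 * flush the IOTLB (caching mode) or the write buffer as required.
 */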
3026 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3027                         enum dma_data_direction dir, struct dma_attrs *attrs)
3028 {
3029         int i;
3030         struct pci_dev *pdev = to_pci_dev(hwdev);
3031         struct dmar_domain *domain;
3032         size_t size = 0;
3033         int prot = 0;
3034         struct iova *iova = NULL;
3035         int ret;
3036         struct scatterlist *sg;
3037         unsigned long start_vpfn;
3038         struct intel_iommu *iommu;
3039
3040         BUG_ON(dir == DMA_NONE);
3041         if (iommu_no_mapping(hwdev))
3042                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3043
3044         domain = get_valid_domain_for_dev(pdev);
3045         if (!domain)
3046                 return 0;
3047
3048         iommu = domain_get_iommu(domain);
3049
3050         for_each_sg(sglist, sg, nelems, i)
3051                 size += aligned_nrpages(sg->offset, sg->length);
3052
3053         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3054                                 pdev->dma_mask);
3055         if (!iova) {
3056                 sglist->dma_length = 0;
3057                 return 0;
3058         }
3059
3060         /*
3061          * Check if DMAR supports zero-length reads on write-only
3062          * mappings.
3063          */
3064         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3065                         !cap_zlr(iommu->cap))
3066                 prot |= DMA_PTE_READ;
3067         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3068                 prot |= DMA_PTE_WRITE;
3069
3070         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3071
3072         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3073         if (unlikely(ret)) {
3074                 /* clear the PTEs */
3075                 dma_pte_clear_range(domain, start_vpfn,
3076                                     start_vpfn + size - 1);
3077                 /* free page tables */
3078                 dma_pte_free_pagetable(domain, start_vpfn,
3079                                        start_vpfn + size - 1);
3080                 /* free iova */
3081                 __free_iova(&domain->iovad, iova);
3082                 return 0;
3083         }
3084
3085         /* it's a non-present to present mapping. Only flush if caching mode */
3086         if (cap_caching_mode(iommu->cap))
3087                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3088         else
3089                 iommu_flush_write_buffer(iommu);
3090
3091         return nelems;
3092 }
3093
3094 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3095 {
3096         return !dma_addr;
3097 }
3098
3099 struct dma_map_ops intel_dma_ops = {
3100         .alloc_coherent = intel_alloc_coherent,
3101         .free_coherent = intel_free_coherent,
3102         .map_sg = intel_map_sg,
3103         .unmap_sg = intel_unmap_sg,
3104         .map_page = intel_map_page,
3105         .unmap_page = intel_unmap_page,
3106         .mapping_error = intel_mapping_error,
3107 };
3108
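/*
 * Slab caches for the driver's frequently allocated objects:
 * dmar_domain, device_domain_info and iova descriptors.
 */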
3109 static inline int iommu_domain_cache_init(void)
3110 {
3111         int ret = 0;
3112
3113         iommu_domain_cache = kmem_cache_create("iommu_domain",
3114                                          sizeof(struct dmar_domain),
3115                                          0,
3116                                          SLAB_HWCACHE_ALIGN,
3117                                          NULL);
3118
3119         if (!iommu_domain_cache) {
3120                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3121                 ret = -ENOMEM;
3122         }
3123
3124         return ret;
3125 }
3126
3127 static inline int iommu_devinfo_cache_init(void)
3128 {
3129         int ret = 0;
3130
3131         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3132                                          sizeof(struct device_domain_info),
3133                                          0,
3134                                          SLAB_HWCACHE_ALIGN,
3135                                          NULL);
3136         if (!iommu_devinfo_cache) {
3137                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3138                 ret = -ENOMEM;
3139         }
3140
3141         return ret;
3142 }
3143
3144 static inline int iommu_iova_cache_init(void)
3145 {
3146         int ret = 0;
3147
3148         iommu_iova_cache = kmem_cache_create("iommu_iova",
3149                                          sizeof(struct iova),
3150                                          0,
3151                                          SLAB_HWCACHE_ALIGN,
3152                                          NULL);
3153         if (!iommu_iova_cache) {
3154                 printk(KERN_ERR "Couldn't create iova cache\n");
3155                 ret = -ENOMEM;
3156         }
3157
3158         return ret;
3159 }
3160
3161 static int __init iommu_init_mempool(void)
3162 {
3163         int ret;
3164         ret = iommu_iova_cache_init();
3165         if (ret)
3166                 return ret;
3167
3168         ret = iommu_domain_cache_init();
3169         if (ret)
3170                 goto domain_error;
3171
3172         ret = iommu_devinfo_cache_init();
3173         if (!ret)
3174                 return ret;
3175
3176         kmem_cache_destroy(iommu_domain_cache);
3177 domain_error:
3178         kmem_cache_destroy(iommu_iova_cache);
3179
3180         return -ENOMEM;
3181 }
3182
3183 static void __init iommu_exit_mempool(void)
3184 {
3185         kmem_cache_destroy(iommu_devinfo_cache);
3186         kmem_cache_destroy(iommu_domain_cache);
3187         kmem_cache_destroy(iommu_iova_cache);
3188
3189 }
3190
3191 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3192 {
3193         struct dmar_drhd_unit *drhd;
3194         u32 vtbar;
3195         int rc;
3196
3197         /* We know that this device on this chipset has its own IOMMU.
3198          * If we find it under a different IOMMU, then the BIOS is lying
3199          * to us. Hope that the IOMMU for this device is actually
3200          * disabled, and it needs no translation...
3201          */
3202         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3203         if (rc) {
3204                 /* "can't" happen */
3205                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3206                 return;
3207         }
3208         vtbar &= 0xffff0000;
3209
3210         /* we know that this iommu should be at offset 0xa000 from vtbar */
3211         drhd = dmar_find_matched_drhd_unit(pdev);
3212         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3213                             TAINT_FIRMWARE_WORKAROUND,
3214                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3215                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3216 }
3217 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3218
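/*
 * Mark DRHD units that need no translation: units with no PCI devices in
 * their scope are ignored outright, and units covering only graphics
 * devices are either bypassed or flagged via intel_iommu_gfx_mapped,
 * depending on dmar_map_gfx.
 */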
3219 static void __init init_no_remapping_devices(void)
3220 {
3221         struct dmar_drhd_unit *drhd;
3222
3223         for_each_drhd_unit(drhd) {
3224                 if (!drhd->include_all) {
3225                         int i;
3226                         for (i = 0; i < drhd->devices_cnt; i++)
3227                                 if (drhd->devices[i] != NULL)
3228                                         break;
3229                         /* ignore DMAR unit if no pci devices exist */
3230                         if (i == drhd->devices_cnt)
3231                                 drhd->ignored = 1;
3232                 }
3233         }
3234
3235         for_each_drhd_unit(drhd) {
3236                 int i;
3237                 if (drhd->ignored || drhd->include_all)
3238                         continue;
3239
3240                 for (i = 0; i < drhd->devices_cnt; i++)
3241                         if (drhd->devices[i] &&
3242                             !IS_GFX_DEVICE(drhd->devices[i]))
3243                                 break;
3244
3245                 if (i < drhd->devices_cnt)
3246                         continue;
3247
3248                 /* This IOMMU has *only* gfx devices. Either bypass it or
3249                    set the gfx_mapped flag, as appropriate */
3250                 if (dmar_map_gfx) {
3251                         intel_iommu_gfx_mapped = 1;
3252                 } else {
3253                         drhd->ignored = 1;
3254                         for (i = 0; i < drhd->devices_cnt; i++) {
3255                                 if (!drhd->devices[i])
3256                                         continue;
3257                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3258                         }
3259                 }
3260         }
3261 }
3262
3263 #ifdef CONFIG_SUSPEND
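/*
 * Suspend/resume support: re-enable queued invalidation, reprogram the
 * root entry, flush caches and re-enable translation on every usable
 * IOMMU after resume.
 */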
3264 static int init_iommu_hw(void)
3265 {
3266         struct dmar_drhd_unit *drhd;
3267         struct intel_iommu *iommu = NULL;
3268
3269         for_each_active_iommu(iommu, drhd)
3270                 if (iommu->qi)
3271                         dmar_reenable_qi(iommu);
3272
3273         for_each_iommu(iommu, drhd) {
3274                 if (drhd->ignored) {
3275                         /*
3276                          * we always have to disable PMRs or DMA may fail on
3277                          * this device
3278                          */
3279                         if (force_on)
3280                                 iommu_disable_protect_mem_regions(iommu);
3281                         continue;
3282                 }
3283
3284                 iommu_flush_write_buffer(iommu);
3285
3286                 iommu_set_root_entry(iommu);
3287
3288                 iommu->flush.flush_context(iommu, 0, 0, 0,
3289                                            DMA_CCMD_GLOBAL_INVL);
3290                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3291                                          DMA_TLB_GLOBAL_FLUSH);
3292                 if (iommu_enable_translation(iommu))
3293                         return 1;
3294                 iommu_disable_protect_mem_regions(iommu);
3295         }
3296
3297         return 0;
3298 }
3299
3300 static void iommu_flush_all(void)
3301 {
3302         struct dmar_drhd_unit *drhd;
3303         struct intel_iommu *iommu;
3304
3305         for_each_active_iommu(iommu, drhd) {
3306                 iommu->flush.flush_context(iommu, 0, 0, 0,
3307                                            DMA_CCMD_GLOBAL_INVL);
3308                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3309                                          DMA_TLB_GLOBAL_FLUSH);
3310         }
3311 }
3312
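/*
 * Flush all caches, disable translation and save the fault-event
 * registers of every active IOMMU before entering suspend.
 */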
3313 static int iommu_suspend(void)
3314 {
3315         struct dmar_drhd_unit *drhd;
3316         struct intel_iommu *iommu = NULL;
3317         unsigned long flag;
3318
3319         for_each_active_iommu(iommu, drhd) {
3320                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3321                                                  GFP_ATOMIC);
3322                 if (!iommu->iommu_state)
3323                         goto nomem;
3324         }
3325
3326         iommu_flush_all();
3327
3328         for_each_active_iommu(iommu, drhd) {
3329                 iommu_disable_translation(iommu);
3330
3331                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3332
3333                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3334                         readl(iommu->reg + DMAR_FECTL_REG);
3335                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3336                         readl(iommu->reg + DMAR_FEDATA_REG);
3337                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3338                         readl(iommu->reg + DMAR_FEADDR_REG);
3339                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3340                         readl(iommu->reg + DMAR_FEUADDR_REG);
3341
3342                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3343         }
3344         return 0;
3345
3346 nomem:
3347         for_each_active_iommu(iommu, drhd)
3348                 kfree(iommu->iommu_state);
3349
3350         return -ENOMEM;
3351 }
3352
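/*
 * Re-initialise the hardware via init_iommu_hw(), then restore the
 * fault-event registers saved by iommu_suspend().
 */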
3353 static void iommu_resume(void)
3354 {
3355         struct dmar_drhd_unit *drhd;
3356         struct intel_iommu *iommu = NULL;
3357         unsigned long flag;
3358
3359         if (init_iommu_hw()) {
3360                 if (force_on)
3361                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3362                 else
3363                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3364                 return;
3365         }
3366
3367         for_each_active_iommu(iommu, drhd) {
3368
3369                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3370
3371                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3372                         iommu->reg + DMAR_FECTL_REG);
3373                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3374                         iommu->reg + DMAR_FEDATA_REG);
3375                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3376                         iommu->reg + DMAR_FEADDR_REG);
3377                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3378                         iommu->reg + DMAR_FEUADDR_REG);
3379
3380                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3381         }
3382
3383         for_each_active_iommu(iommu, drhd)
3384                 kfree(iommu->iommu_state);
3385 }
3386
3387 static struct syscore_ops iommu_syscore_ops = {
3388         .resume         = iommu_resume,
3389         .suspend        = iommu_suspend,
3390 };
3391
3392 static void __init init_iommu_pm_ops(void)
3393 {
3394         register_syscore_ops(&iommu_syscore_ops);
3395 }
3396
3397 #else
3398 static inline void init_iommu_pm_ops(void) {}
3399 #endif  /* CONFIG_SUSPEND */
3400
3401 LIST_HEAD(dmar_rmrr_units);
3402
3403 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3404 {
3405         list_add(&rmrr->list, &dmar_rmrr_units);
3406 }
3407
3408
3409 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3410 {
3411         struct acpi_dmar_reserved_memory *rmrr;
3412         struct dmar_rmrr_unit *rmrru;
3413
3414         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3415         if (!rmrru)
3416                 return -ENOMEM;
3417
3418         rmrru->hdr = header;
3419         rmrr = (struct acpi_dmar_reserved_memory *)header;
3420         rmrru->base_address = rmrr->base_address;
3421         rmrru->end_address = rmrr->end_address;
3422
3423         dmar_register_rmrr_unit(rmrru);
3424         return 0;
3425 }
3426
3427 static int __init
3428 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3429 {
3430         struct acpi_dmar_reserved_memory *rmrr;
3431         int ret;
3432
3433         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3434         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3435                 ((void *)rmrr) + rmrr->header.length,
3436                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3437
3438         if (ret || (rmrru->devices_cnt == 0)) {
3439                 list_del(&rmrru->list);
3440                 kfree(rmrru);
3441         }
3442         return ret;
3443 }
3444
3445 static LIST_HEAD(dmar_atsr_units);
3446
3447 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3448 {
3449         struct acpi_dmar_atsr *atsr;
3450         struct dmar_atsr_unit *atsru;
3451
3452         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3453         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3454         if (!atsru)
3455                 return -ENOMEM;
3456
3457         atsru->hdr = hdr;
3458         atsru->include_all = atsr->flags & 0x1;
3459
3460         list_add(&atsru->list, &dmar_atsr_units);
3461
3462         return 0;
3463 }
3464
3465 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3466 {
3467         int rc;
3468         struct acpi_dmar_atsr *atsr;
3469
3470         if (atsru->include_all)
3471                 return 0;
3472
3473         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3474         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3475                                 (void *)atsr + atsr->header.length,
3476                                 &atsru->devices_cnt, &atsru->devices,
3477                                 atsr->segment);
3478         if (rc || !atsru->devices_cnt) {
3479                 list_del(&atsru->list);
3480                 kfree(atsru);
3481         }
3482
3483         return rc;
3484 }
3485
3486 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3487 {
3488         int i;
3489         struct pci_bus *bus;
3490         struct acpi_dmar_atsr *atsr;
3491         struct dmar_atsr_unit *atsru;
3492
3493         dev = pci_physfn(dev);
3494
3495         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3496                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3497                 if (atsr->segment == pci_domain_nr(dev->bus))
3498                         goto found;
3499         }
3500
3501         return 0;
3502
3503 found:
3504         for (bus = dev->bus; bus; bus = bus->parent) {
3505                 struct pci_dev *bridge = bus->self;
3506
3507                 if (!bridge || !pci_is_pcie(bridge) ||
3508                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3509                         return 0;
3510
3511                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3512                         for (i = 0; i < atsru->devices_cnt; i++)
3513                                 if (atsru->devices[i] == bridge)
3514                                         return 1;
3515                         break;
3516                 }
3517         }
3518
3519         if (atsru->include_all)
3520                 return 1;
3521
3522         return 0;
3523 }
3524
3525 int __init dmar_parse_rmrr_atsr_dev(void)
3526 {
3527         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3528         struct dmar_atsr_unit *atsr, *atsr_n;
3529         int ret = 0;
3530
3531         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3532                 ret = rmrr_parse_dev(rmrr);
3533                 if (ret)
3534                         return ret;
3535         }
3536
3537         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3538                 ret = atsr_parse_dev(atsr);
3539                 if (ret)
3540                         return ret;
3541         }
3542
3543         return ret;
3544 }
3545
3546 /*
3547  * Here we only respond to a device being unbound from its driver.
3548  *
3549  * A newly added device is not attached to its DMAR domain here yet; that
3550  * happens when the device is first mapped to an iova.
3551  */
3552 static int device_notifier(struct notifier_block *nb,
3553                                   unsigned long action, void *data)
3554 {
3555         struct device *dev = data;
3556         struct pci_dev *pdev = to_pci_dev(dev);
3557         struct dmar_domain *domain;
3558
3559         if (iommu_no_mapping(dev))
3560                 return 0;
3561
3562         domain = find_domain(pdev);
3563         if (!domain)
3564                 return 0;
3565
3566         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3567                 domain_remove_one_dev_info(domain, pdev);
3568
3569                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3570                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3571                     list_empty(&domain->devices))
3572                         domain_exit(domain);
3573         }
3574
3575         return 0;
3576 }
3577
3578 static struct notifier_block device_nb = {
3579         .notifier_call = device_notifier,
3580 };
3581
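/*
 * Main initialisation entry point: parse the DMAR tables and device
 * scopes, set up mempools and reserved ranges, initialise the DMAR units,
 * then install intel_dma_ops, the IOMMU API ops and the bus notifier.
 */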
3582 int __init intel_iommu_init(void)
3583 {
3584         int ret = 0;
3585
3586         /* VT-d is required for a TXT/tboot launch, so enforce that */
3587         force_on = tboot_force_iommu();
3588
3589         if (dmar_table_init()) {
3590                 if (force_on)
3591                         panic("tboot: Failed to initialize DMAR table\n");
3592                 return  -ENODEV;
3593         }
3594
3595         if (dmar_dev_scope_init() < 0) {
3596                 if (force_on)
3597                         panic("tboot: Failed to initialize DMAR device scope\n");
3598                 return  -ENODEV;
3599         }
3600
3601         if (no_iommu || dmar_disabled)
3602                 return -ENODEV;
3603
3604         if (iommu_init_mempool()) {
3605                 if (force_on)
3606                         panic("tboot: Failed to initialize iommu memory\n");
3607                 return  -ENODEV;
3608         }
3609
3610         if (list_empty(&dmar_rmrr_units))
3611                 printk(KERN_INFO "DMAR: No RMRR found\n");
3612
3613         if (list_empty(&dmar_atsr_units))
3614                 printk(KERN_INFO "DMAR: No ATSR found\n");
3615
3616         if (dmar_init_reserved_ranges()) {
3617                 if (force_on)
3618                         panic("tboot: Failed to reserve iommu ranges\n");
3619                 return  -ENODEV;
3620         }
3621
3622         init_no_remapping_devices();
3623
3624         ret = init_dmars();
3625         if (ret) {
3626                 if (force_on)
3627                         panic("tboot: Failed to initialize DMARs\n");
3628                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3629                 put_iova_domain(&reserved_iova_list);
3630                 iommu_exit_mempool();
3631                 return ret;
3632         }
3633         printk(KERN_INFO
3634         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3635
3636         init_timer(&unmap_timer);
3637 #ifdef CONFIG_SWIOTLB
3638         swiotlb = 0;
3639 #endif
3640         dma_ops = &intel_dma_ops;
3641
3642         init_iommu_pm_ops();
3643
3644         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3645
3646         bus_register_notifier(&pci_bus_type, &device_nb);
3647
3648         intel_iommu_enabled = 1;
3649
3650         return 0;
3651 }
3652
3653 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3654                                            struct pci_dev *pdev)
3655 {
3656         struct pci_dev *tmp, *parent;
3657
3658         if (!iommu || !pdev)
3659                 return;
3660
3661         /* dependent device detach */
3662         tmp = pci_find_upstream_pcie_bridge(pdev);
3663         /* Secondary interface's bus number and devfn 0 */
3664         if (tmp) {
3665                 parent = pdev->bus->self;
3666                 while (parent != tmp) {
3667                         iommu_detach_dev(iommu, parent->bus->number,
3668                                          parent->devfn);
3669                         parent = parent->bus->self;
3670                 }
3671                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3672                         iommu_detach_dev(iommu,
3673                                 tmp->subordinate->number, 0);
3674                 else /* this is a legacy PCI bridge */
3675                         iommu_detach_dev(iommu, tmp->bus->number,
3676                                          tmp->devfn);
3677         }
3678 }
3679
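/*
 * Detach one PCI device (and the bridges it sits behind) from @domain.
 * If no other device in the domain uses the same IOMMU, drop that IOMMU
 * from the domain's bitmap and, unless this is a VM or static-identity
 * domain, release the domain id on that IOMMU.
 */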
3680 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3681                                           struct pci_dev *pdev)
3682 {
3683         struct device_domain_info *info;
3684         struct intel_iommu *iommu;
3685         unsigned long flags;
3686         int found = 0;
3687         struct list_head *entry, *tmp;
3688
3689         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3690                                 pdev->devfn);
3691         if (!iommu)
3692                 return;
3693
3694         spin_lock_irqsave(&device_domain_lock, flags);
3695         list_for_each_safe(entry, tmp, &domain->devices) {
3696                 info = list_entry(entry, struct device_domain_info, link);
3697                 if (info->segment == pci_domain_nr(pdev->bus) &&
3698                     info->bus == pdev->bus->number &&
3699                     info->devfn == pdev->devfn) {
3700                         list_del(&info->link);
3701                         list_del(&info->global);
3702                         if (info->dev)
3703                                 info->dev->dev.archdata.iommu = NULL;
3704                         spin_unlock_irqrestore(&device_domain_lock, flags);
3705
3706                         iommu_disable_dev_iotlb(info);
3707                         iommu_detach_dev(iommu, info->bus, info->devfn);
3708                         iommu_detach_dependent_devices(iommu, pdev);
3709                         free_devinfo_mem(info);
3710
3711                         spin_lock_irqsave(&device_domain_lock, flags);
3712
3713                         if (found)
3714                                 break;
3715                         else
3716                                 continue;
3717                 }
3718
3719                 /* if there are no other devices under the same iommu
3720                  * owned by this domain, clear this iommu in iommu_bmp,
3721                  * update iommu count and coherency
3722                  */
3723                 if (iommu == device_to_iommu(info->segment, info->bus,
3724                                             info->devfn))
3725                         found = 1;
3726         }
3727
3728         spin_unlock_irqrestore(&device_domain_lock, flags);
3729
3730         if (found == 0) {
3731                 unsigned long tmp_flags;
3732                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3733                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3734                 domain->iommu_count--;
3735                 domain_update_iommu_cap(domain);
3736                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3737
3738                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3739                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3740                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3741                         clear_bit(domain->id, iommu->domain_ids);
3742                         iommu->domains[domain->id] = NULL;
3743                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3744                 }
3745         }
3746 }
3747
3748 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3749 {
3750         struct device_domain_info *info;
3751         struct intel_iommu *iommu;
3752         unsigned long flags1, flags2;
3753
3754         spin_lock_irqsave(&device_domain_lock, flags1);
3755         while (!list_empty(&domain->devices)) {
3756                 info = list_entry(domain->devices.next,
3757                         struct device_domain_info, link);
3758                 list_del(&info->link);
3759                 list_del(&info->global);
3760                 if (info->dev)
3761                         info->dev->dev.archdata.iommu = NULL;
3762
3763                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3764
3765                 iommu_disable_dev_iotlb(info);
3766                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3767                 iommu_detach_dev(iommu, info->bus, info->devfn);
3768                 iommu_detach_dependent_devices(iommu, info->dev);
3769
3770                 /* clear this iommu in iommu_bmp, update iommu count
3771                  * and capabilities
3772                  */
3773                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3774                 if (test_and_clear_bit(iommu->seq_id,
3775                                        &domain->iommu_bmp)) {
3776                         domain->iommu_count--;
3777                         domain_update_iommu_cap(domain);
3778                 }
3779                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3780
3781                 free_devinfo_mem(info);
3782                 spin_lock_irqsave(&device_domain_lock, flags1);
3783         }
3784         spin_unlock_irqrestore(&device_domain_lock, flags1);
3785 }
3786
3787 /* domain id for a virtual machine; it won't be set in a context entry */
3788 static unsigned long vm_domid;
3789
3790 static struct dmar_domain *iommu_alloc_vm_domain(void)
3791 {
3792         struct dmar_domain *domain;
3793
3794         domain = alloc_domain_mem();
3795         if (!domain)
3796                 return NULL;
3797
3798         domain->id = vm_domid++;
3799         domain->nid = -1;
3800         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3801         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3802
3803         return domain;
3804 }
3805
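/*
 * Initialise a domain created through the IOMMU API: set up its iova
 * allocator, reserve the special address ranges, compute the adjusted
 * address width and allocate the top-level page directory.
 */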
3806 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3807 {
3808         int adjust_width;
3809
3810         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3811         spin_lock_init(&domain->iommu_lock);
3812
3813         domain_reserve_special_ranges(domain);
3814
3815         /* calculate AGAW */
3816         domain->gaw = guest_width;
3817         adjust_width = guestwidth_to_adjustwidth(guest_width);
3818         domain->agaw = width_to_agaw(adjust_width);
3819
3820         INIT_LIST_HEAD(&domain->devices);
3821
3822         domain->iommu_count = 0;
3823         domain->iommu_coherency = 0;
3824         domain->iommu_snooping = 0;
3825         domain->iommu_superpage = 0;
3826         domain->max_addr = 0;
3827         domain->nid = -1;
3828
3829         /* always allocate the top pgd */
3830         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3831         if (!domain->pgd)
3832                 return -ENOMEM;
3833         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3834         return 0;
3835 }
3836
3837 static void iommu_free_vm_domain(struct dmar_domain *domain)
3838 {
3839         unsigned long flags;
3840         struct dmar_drhd_unit *drhd;
3841         struct intel_iommu *iommu;
3842         unsigned long i;
3843         unsigned long ndomains;
3844
3845         for_each_drhd_unit(drhd) {
3846                 if (drhd->ignored)
3847                         continue;
3848                 iommu = drhd->iommu;
3849
3850                 ndomains = cap_ndoms(iommu->cap);
3851                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3852                         if (iommu->domains[i] == domain) {
3853                                 spin_lock_irqsave(&iommu->lock, flags);
3854                                 clear_bit(i, iommu->domain_ids);
3855                                 iommu->domains[i] = NULL;
3856                                 spin_unlock_irqrestore(&iommu->lock, flags);
3857                                 break;
3858                         }
3859                 }
3860         }
3861 }
3862
3863 static void vm_domain_exit(struct dmar_domain *domain)
3864 {
3865         /* Domain 0 is reserved, so don't process it */
3866         if (!domain)
3867                 return;
3868
3869         vm_domain_remove_all_dev_info(domain);
3870         /* destroy iovas */
3871         put_iova_domain(&domain->iovad);
3872
3873         /* clear ptes */
3874         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3875
3876         /* free page tables */
3877         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3878
3879         iommu_free_vm_domain(domain);
3880         free_domain_mem(domain);
3881 }
3882
3883 static int intel_iommu_domain_init(struct iommu_domain *domain)
3884 {
3885         struct dmar_domain *dmar_domain;
3886
3887         dmar_domain = iommu_alloc_vm_domain();
3888         if (!dmar_domain) {
3889                 printk(KERN_ERR
3890                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3891                 return -ENOMEM;
3892         }
3893         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3894                 printk(KERN_ERR
3895                         "intel_iommu_domain_init() failed\n");
3896                 vm_domain_exit(dmar_domain);
3897                 return -ENOMEM;
3898         }
3899         domain_update_iommu_cap(dmar_domain);
3900         domain->priv = dmar_domain;
3901
3902         return 0;
3903 }
3904
3905 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3906 {
3907         struct dmar_domain *dmar_domain = domain->priv;
3908
3909         domain->priv = NULL;
3910         vm_domain_exit(dmar_domain);
3911 }
3912
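/*
 * IOMMU API attach_dev: drop any previous domain the device was in, check
 * that this IOMMU can address the domain's max_addr, trim extra page-table
 * levels down to the IOMMU's agaw, then add the device to the domain.
 */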
3913 static int intel_iommu_attach_device(struct iommu_domain *domain,
3914                                      struct device *dev)
3915 {
3916         struct dmar_domain *dmar_domain = domain->priv;
3917         struct pci_dev *pdev = to_pci_dev(dev);
3918         struct intel_iommu *iommu;
3919         int addr_width;
3920
3921         /* normally pdev is not mapped */
3922         if (unlikely(domain_context_mapped(pdev))) {
3923                 struct dmar_domain *old_domain;
3924
3925                 old_domain = find_domain(pdev);
3926                 if (old_domain) {
3927                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3928                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3929                                 domain_remove_one_dev_info(old_domain, pdev);
3930                         else
3931                                 domain_remove_dev_info(old_domain);
3932                 }
3933         }
3934
3935         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3936                                 pdev->devfn);
3937         if (!iommu)
3938                 return -ENODEV;
3939
3940         /* check if this iommu agaw is sufficient for max mapped address */
3941         addr_width = agaw_to_width(iommu->agaw);
3942         if (addr_width > cap_mgaw(iommu->cap))
3943                 addr_width = cap_mgaw(iommu->cap);
3944
3945         if (dmar_domain->max_addr > (1LL << addr_width)) {
3946                 printk(KERN_ERR "%s: iommu width (%d) is not "
3947                        "sufficient for the mapped address (%llx)\n",
3948                        __func__, addr_width, dmar_domain->max_addr);
3949                 return -EFAULT;
3950         }
3951         dmar_domain->gaw = addr_width;
3952
3953         /*
3954          * Knock out extra levels of page tables if necessary
3955          */
3956         while (iommu->agaw < dmar_domain->agaw) {
3957                 struct dma_pte *pte;
3958
3959                 pte = dmar_domain->pgd;
3960                 if (dma_pte_present(pte)) {
3961                         dmar_domain->pgd = (struct dma_pte *)
3962                                 phys_to_virt(dma_pte_addr(pte));
3963                         free_pgtable_page(pte);
3964                 }
3965                 dmar_domain->agaw--;
3966         }
3967
3968         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3969 }
3970
3971 static void intel_iommu_detach_device(struct iommu_domain *domain,
3972                                       struct device *dev)
3973 {
3974         struct dmar_domain *dmar_domain = domain->priv;
3975         struct pci_dev *pdev = to_pci_dev(dev);
3976
3977         domain_remove_one_dev_info(dmar_domain, pdev);
3978 }
3979
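/*
 * IOMMU API map: convert IOMMU_* protection flags into DMA PTE bits, grow
 * the domain's max_addr if needed (bounded by the domain's gaw) and
 * install the page-table entries for the iova -> hpa range.
 */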
3980 static int intel_iommu_map(struct iommu_domain *domain,
3981                            unsigned long iova, phys_addr_t hpa,
3982                            int gfp_order, int iommu_prot)
3983 {
3984         struct dmar_domain *dmar_domain = domain->priv;
3985         u64 max_addr;
3986         int prot = 0;
3987         size_t size;
3988         int ret;
3989
3990         if (iommu_prot & IOMMU_READ)
3991                 prot |= DMA_PTE_READ;
3992         if (iommu_prot & IOMMU_WRITE)
3993                 prot |= DMA_PTE_WRITE;
3994         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3995                 prot |= DMA_PTE_SNP;
3996
3997         size     = PAGE_SIZE << gfp_order;
3998         max_addr = iova + size;
3999         if (dmar_domain->max_addr < max_addr) {
4000                 u64 end;
4001
4002                 /* check if minimum agaw is sufficient for mapped address */
4003                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4004                 if (end < max_addr) {
4005                         printk(KERN_ERR "%s: iommu width (%d) is not "
4006                                "sufficient for the mapped address (%llx)\n",
4007                                __func__, dmar_domain->gaw, max_addr);
4008                         return -EFAULT;
4009                 }
4010                 dmar_domain->max_addr = max_addr;
4011         }
4012         /* Round up size to next multiple of PAGE_SIZE, if it and
4013            the low bits of hpa would take us onto the next page */
4014         size = aligned_nrpages(hpa, size);
4015         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4016                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4017         return ret;
4018 }
4019
4020 static int intel_iommu_unmap(struct iommu_domain *domain,
4021                              unsigned long iova, int gfp_order)
4022 {
4023         struct dmar_domain *dmar_domain = domain->priv;
4024         size_t size = PAGE_SIZE << gfp_order;
4025         int order;
4026
4027         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4028                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4029
4030         if (dmar_domain->max_addr == iova + size)
4031                 dmar_domain->max_addr = iova;
4032
4033         return order;
4034 }
4035
4036 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4037                                             unsigned long iova)
4038 {
4039         struct dmar_domain *dmar_domain = domain->priv;
4040         struct dma_pte *pte;
4041         u64 phys = 0;
4042
4043         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4044         if (pte)
4045                 phys = dma_pte_addr(pte);
4046
4047         return phys;
4048 }
4049
4050 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4051                                       unsigned long cap)
4052 {
4053         struct dmar_domain *dmar_domain = domain->priv;
4054
4055         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4056                 return dmar_domain->iommu_snooping;
4057         if (cap == IOMMU_CAP_INTR_REMAP)
4058                 return intr_remapping_enabled;
4059
4060         return 0;
4061 }
4062
4063 static struct iommu_ops intel_iommu_ops = {
4064         .domain_init    = intel_iommu_domain_init,
4065         .domain_destroy = intel_iommu_domain_destroy,
4066         .attach_dev     = intel_iommu_attach_device,
4067         .detach_dev     = intel_iommu_detach_device,
4068         .map            = intel_iommu_map,
4069         .unmap          = intel_iommu_unmap,
4070         .iova_to_phys   = intel_iommu_iova_to_phys,
4071         .domain_has_cap = intel_iommu_domain_has_cap,
4072 };
4073
4074 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4075 {
4076         /*
4077          * Mobile 4 Series Chipset neglects to set RWBF capability,
4078          * but needs it:
4079          */
4080         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4081         rwbf_quirk = 1;
4082
4083         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4084         if (dev->revision == 0x07) {
4085                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4086                 dmar_map_gfx = 0;
4087         }
4088 }
4089
4090 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4091
4092 #define GGC 0x52
4093 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4094 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4095 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4096 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4097 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4098 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4099 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4100 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4101
4102 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4103 {
4104         unsigned short ggc;
4105
4106         if (pci_read_config_word(dev, GGC, &ggc))
4107                 return;
4108
4109         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4110                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4111                 dmar_map_gfx = 0;
4112         } else if (dmar_map_gfx) {
4113                 /* we have to ensure the gfx device is idle before we flush */
4114                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4115                 intel_iommu_strict = 1;
4116         }
4117 }
4118 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4119 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4120 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4121 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4122
4123 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4124    ISOCH DMAR unit for the Azalia sound device, but not give it any
4125    TLB entries, which causes it to deadlock. Check for that.  We do
4126    this in a function called from init_dmars(), instead of in a PCI
4127    quirk, because we don't want to print the obnoxious "BIOS broken"
4128    message if VT-d is actually disabled.
4129 */
4130 static void __init check_tylersburg_isoch(void)
4131 {
4132         struct pci_dev *pdev;
4133         uint32_t vtisochctrl;
4134
4135         /* If there's no Azalia in the system anyway, forget it. */
4136         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4137         if (!pdev)
4138                 return;
4139         pci_dev_put(pdev);
4140
4141         /* System Management Registers. Might be hidden, in which case
4142            we can't do the sanity check. But that's OK, because the
4143            known-broken BIOSes _don't_ actually hide it, so far. */
4144         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4145         if (!pdev)
4146                 return;
4147
4148         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4149                 pci_dev_put(pdev);
4150                 return;
4151         }
4152
4153         pci_dev_put(pdev);
4154
4155         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4156         if (vtisochctrl & 1)
4157                 return;
4158
4159         /* Drop all bits other than the number of TLB entries */
4160         vtisochctrl &= 0x1c;
4161
4162         /* If we have the recommended number of TLB entries (16), fine. */
4163         if (vtisochctrl == 0x10)
4164                 return;
4165
4166         /* Zero TLB entries? You get to ride the short bus to school. */
4167         if (!vtisochctrl) {
4168                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4169                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4170                      dmi_get_system_info(DMI_BIOS_VENDOR),
4171                      dmi_get_system_info(DMI_BIOS_VERSION),
4172                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4173                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4174                 return;
4175         }
4176
4177         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4178                vtisochctrl);
4179 }