1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #include "irq_remapping.h"
50 #include "pci.h"
51
52 #define ROOT_SIZE               VTD_PAGE_SIZE
53 #define CONTEXT_SIZE            VTD_PAGE_SIZE
54
55 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
58
59 #define IOAPIC_RANGE_START      (0xfee00000)
60 #define IOAPIC_RANGE_END        (0xfeefffff)
61 #define IOVA_START_ADDR         (0x1000)
62
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
64
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
67
68 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
70
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
74                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
76
77 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
78 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
79 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
80
81 /* page table handling */
82 #define LEVEL_STRIDE            (9)
83 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
84
85 /*
86  * This bitmap is used to advertise the page sizes our hardware supports
87  * to the IOMMU core, which will then use this information to split
88  * physically contiguous memory regions it is mapping into page sizes
89  * that we support.
90  *
91  * Traditionally the IOMMU core just handed us the mappings directly,
92  * after making sure the size is a power-of-two multiple of 4KiB and that the
93  * mapping has natural alignment.
94  *
95  * To retain this behavior, we currently advertise that we support
96  * all page sizes that are a power-of-two multiple of 4KiB.
97  *
98  * If at some point we'd like to utilize the IOMMU core's new behavior,
99  * we could change this to advertise the real page sizes we support.
100  */
101 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
102
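/*
 * Adjusted guest address width (agaw) helpers: agaw n corresponds to an
 * (n + 2)-level page table covering 30 + 9*n address bits, e.g. agaw 1 is
 * a 3-level table for 39-bit addresses and agaw 2 a 4-level table for 48.
 */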
103 static inline int agaw_to_level(int agaw)
104 {
105         return agaw + 2;
106 }
107
108 static inline int agaw_to_width(int agaw)
109 {
110         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
111 }
112
113 static inline int width_to_agaw(int width)
114 {
115         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
116 }
117
118 static inline unsigned int level_to_offset_bits(int level)
119 {
120         return (level - 1) * LEVEL_STRIDE;
121 }
122
123 static inline int pfn_level_offset(unsigned long pfn, int level)
124 {
125         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
126 }
127
128 static inline unsigned long level_mask(int level)
129 {
130         return -1UL << level_to_offset_bits(level);
131 }
132
133 static inline unsigned long level_size(int level)
134 {
135         return 1UL << level_to_offset_bits(level);
136 }
137
138 static inline unsigned long align_to_level(unsigned long pfn, int level)
139 {
140         return (pfn + level_size(level) - 1) & level_mask(level);
141 }
142
143 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
144 {
145         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
146 }
147
148 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
149    are never going to work. */
150 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
151 {
152         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
153 }
154
155 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
156 {
157         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 }
159 static inline unsigned long page_to_dma_pfn(struct page *pg)
160 {
161         return mm_to_dma_pfn(page_to_pfn(pg));
162 }
163 static inline unsigned long virt_to_dma_pfn(void *p)
164 {
165         return page_to_dma_pfn(virt_to_page(p));
166 }
167
168 /* global iommu list, set NULL for ignored DMAR units */
169 static struct intel_iommu **g_iommus;
170
171 static void __init check_tylersburg_isoch(void);
172 static int rwbf_quirk;
173
174 /*
175  * set to 1 to panic the kernel if VT-d cannot be successfully enabled
176  * (used when kernel is launched w/ TXT)
177  */
178 static int force_on = 0;
179
180 /*
181  * 0: Present
182  * 1-11: Reserved
183  * 12-63: Context Ptr (12 - (haw-1))
184  * 64-127: Reserved
185  */
186 struct root_entry {
187         u64     val;
188         u64     rsvd1;
189 };
190 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
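/* 4KiB root table / 16-byte entries = 256 root entries, one per PCI bus */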
191 static inline bool root_present(struct root_entry *root)
192 {
193         return (root->val & 1);
194 }
195 static inline void set_root_present(struct root_entry *root)
196 {
197         root->val |= 1;
198 }
199 static inline void set_root_value(struct root_entry *root, unsigned long value)
200 {
201         root->val |= value & VTD_PAGE_MASK;
202 }
203
204 static inline struct context_entry *
205 get_context_addr_from_root(struct root_entry *root)
206 {
207         return (struct context_entry *)
208                 (root_present(root)?phys_to_virt(
209                 root->val & VTD_PAGE_MASK) :
210                 NULL);
211 }
212
213 /*
214  * low 64 bits:
215  * 0: present
216  * 1: fault processing disable
217  * 2-3: translation type
218  * 12-63: address space root
219  * high 64 bits:
220  * 0-2: address width
221  * 3-6: aval
222  * 8-23: domain id
223  */
224 struct context_entry {
225         u64 lo;
226         u64 hi;
227 };
228
229 static inline bool context_present(struct context_entry *context)
230 {
231         return (context->lo & 1);
232 }
233 static inline void context_set_present(struct context_entry *context)
234 {
235         context->lo |= 1;
236 }
237
238 static inline void context_set_fault_enable(struct context_entry *context)
239 {
240         context->lo &= (((u64)-1) << 2) | 1;
241 }
242
243 static inline void context_set_translation_type(struct context_entry *context,
244                                                 unsigned long value)
245 {
246         context->lo &= (((u64)-1) << 4) | 3;
247         context->lo |= (value & 3) << 2;
248 }
249
250 static inline void context_set_address_root(struct context_entry *context,
251                                             unsigned long value)
252 {
253         context->lo |= value & VTD_PAGE_MASK;
254 }
255
256 static inline void context_set_address_width(struct context_entry *context,
257                                              unsigned long value)
258 {
259         context->hi |= value & 7;
260 }
261
262 static inline void context_set_domain_id(struct context_entry *context,
263                                          unsigned long value)
264 {
265         context->hi |= (value & ((1 << 16) - 1)) << 8;
266 }
267
268 static inline void context_clear_entry(struct context_entry *context)
269 {
270         context->lo = 0;
271         context->hi = 0;
272 }
273
274 /*
275  * 0: readable
276  * 1: writable
277  * 2-6: reserved
278  * 7: super page
279  * 8-10: available
280  * 11: snoop behavior
281  * 12-63: Host physical address
282  */
283 struct dma_pte {
284         u64 val;
285 };
286
287 static inline void dma_clear_pte(struct dma_pte *pte)
288 {
289         pte->val = 0;
290 }
291
292 static inline void dma_set_pte_readable(struct dma_pte *pte)
293 {
294         pte->val |= DMA_PTE_READ;
295 }
296
297 static inline void dma_set_pte_writable(struct dma_pte *pte)
298 {
299         pte->val |= DMA_PTE_WRITE;
300 }
301
302 static inline void dma_set_pte_snp(struct dma_pte *pte)
303 {
304         pte->val |= DMA_PTE_SNP;
305 }
306
307 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
308 {
309         pte->val = (pte->val & ~3) | (prot & 3);
310 }
311
312 static inline u64 dma_pte_addr(struct dma_pte *pte)
313 {
314 #ifdef CONFIG_64BIT
315         return pte->val & VTD_PAGE_MASK;
316 #else
317         /* Must have a full atomic 64-bit read */
318         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
319 #endif
320 }
321
322 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
323 {
324         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
325 }
326
327 static inline bool dma_pte_present(struct dma_pte *pte)
328 {
329         return (pte->val & 3) != 0;
330 }
331
332 static inline bool dma_pte_superpage(struct dma_pte *pte)
333 {
334         return (pte->val & (1 << 7));
335 }
336
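/* true when @pte is the first entry of its 4KiB page-table page */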
337 static inline int first_pte_in_page(struct dma_pte *pte)
338 {
339         return !((unsigned long)pte & ~VTD_PAGE_MASK);
340 }
341
342 /*
343  * This domain is a static identity mapping domain.
344  *      1. This domain creates a static 1:1 mapping to all usable memory.
345  *      2. It maps to each iommu if successful.
346  *      3. Each iommu maps to this domain if successful.
347  */
348 static struct dmar_domain *si_domain;
349 static int hw_pass_through = 1;
350
351 /* devices under the same p2p bridge are owned in one domain */
352 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
353
354 /* domain represents a virtual machine; more than one device
355  * across iommus may be owned in one domain, e.g. kvm guest.
356  */
357 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
358
359 /* si_domain contains multiple devices */
360 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
361
362 /* define the limit of IOMMUs supported in each domain */
363 #ifdef  CONFIG_X86
364 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
365 #else
366 # define        IOMMU_UNITS_SUPPORTED   64
367 #endif
368
369 struct dmar_domain {
370         int     id;                     /* domain id */
371         int     nid;                    /* node id */
372         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
373                                         /* bitmap of iommus this domain uses*/
374
375         struct list_head devices;       /* all devices' list */
376         struct iova_domain iovad;       /* iova's that belong to this domain */
377
378         struct dma_pte  *pgd;           /* virtual address */
379         int             gaw;            /* max guest address width */
380
381         /* adjusted guest address width, 0 is level 2 30-bit */
382         int             agaw;
383
384         int             flags;          /* flags to find out type of domain */
385
386         int             iommu_coherency;/* indicate coherency of iommu access */
387         int             iommu_snooping; /* indicate snooping control feature*/
388         int             iommu_count;    /* reference count of iommu */
389         int             iommu_superpage;/* Level of superpages supported:
390                                            0 == 4KiB (no superpages), 1 == 2MiB,
391                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
392         spinlock_t      iommu_lock;     /* protect iommu set in domain */
393         u64             max_addr;       /* maximum mapped address */
394 };
395
396 /* PCI domain-device relationship */
397 struct device_domain_info {
398         struct list_head link;  /* link to domain siblings */
399         struct list_head global; /* link to global list */
400         int segment;            /* PCI domain */
401         u8 bus;                 /* PCI bus number */
402         u8 devfn;               /* PCI devfn number */
403         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
404         struct intel_iommu *iommu; /* IOMMU used by this device */
405         struct dmar_domain *domain; /* pointer to domain */
406 };
407
408 static void flush_unmaps_timeout(unsigned long data);
409
410 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
411
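/*
 * Bookkeeping for deferred ("lazy") unmapping: freed IOVAs are queued in
 * per-IOMMU deferred_flush tables and released in batches from
 * flush_unmaps_timeout() instead of one at a time (intel_iommu=strict
 * disables this batching).
 */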
412 #define HIGH_WATER_MARK 250
413 struct deferred_flush_tables {
414         int next;
415         struct iova *iova[HIGH_WATER_MARK];
416         struct dmar_domain *domain[HIGH_WATER_MARK];
417 };
418
419 static struct deferred_flush_tables *deferred_flush;
420
421 /* number of IOMMUs in the system; used to size g_iommus */
422 static int g_num_of_iommus;
423
424 static DEFINE_SPINLOCK(async_umap_flush_lock);
425 static LIST_HEAD(unmaps_to_do);
426
427 static int timer_on;
428 static long list_size;
429
430 static void domain_remove_dev_info(struct dmar_domain *domain);
431
432 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
433 int dmar_disabled = 0;
434 #else
435 int dmar_disabled = 1;
436 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
437
438 int intel_iommu_enabled = 0;
439 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
440
441 static int dmar_map_gfx = 1;
442 static int dmar_forcedac;
443 static int intel_iommu_strict;
444 static int intel_iommu_superpage = 1;
445
446 int intel_iommu_gfx_mapped;
447 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
448
449 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
450 static DEFINE_SPINLOCK(device_domain_lock);
451 static LIST_HEAD(device_domain_list);
452
453 static struct iommu_ops intel_iommu_ops;
454
455 static int __init intel_iommu_setup(char *str)
456 {
457         if (!str)
458                 return -EINVAL;
459         while (*str) {
460                 if (!strncmp(str, "on", 2)) {
461                         dmar_disabled = 0;
462                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
463                 } else if (!strncmp(str, "off", 3)) {
464                         dmar_disabled = 1;
465                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
466                 } else if (!strncmp(str, "igfx_off", 8)) {
467                         dmar_map_gfx = 0;
468                         printk(KERN_INFO
469                                 "Intel-IOMMU: disable GFX device mapping\n");
470                 } else if (!strncmp(str, "forcedac", 8)) {
471                         printk(KERN_INFO
472                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
473                         dmar_forcedac = 1;
474                 } else if (!strncmp(str, "strict", 6)) {
475                         printk(KERN_INFO
476                                 "Intel-IOMMU: disable batched IOTLB flush\n");
477                         intel_iommu_strict = 1;
478                 } else if (!strncmp(str, "sp_off", 6)) {
479                         printk(KERN_INFO
480                                 "Intel-IOMMU: disable supported super page\n");
481                         intel_iommu_superpage = 0;
482                 }
483
484                 str += strcspn(str, ",");
485                 while (*str == ',')
486                         str++;
487         }
488         return 0;
489 }
490 __setup("intel_iommu=", intel_iommu_setup);
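/*
 * Example command line use of the options parsed above (comma separated):
 *      intel_iommu=on,strict,sp_off
 * enables DMA remapping and disables batched IOTLB flushing and superpages.
 */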
491
492 static struct kmem_cache *iommu_domain_cache;
493 static struct kmem_cache *iommu_devinfo_cache;
494 static struct kmem_cache *iommu_iova_cache;
495
496 static inline void *alloc_pgtable_page(int node)
497 {
498         struct page *page;
499         void *vaddr = NULL;
500
501         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
502         if (page)
503                 vaddr = page_address(page);
504         return vaddr;
505 }
506
507 static inline void free_pgtable_page(void *vaddr)
508 {
509         free_page((unsigned long)vaddr);
510 }
511
512 static inline void *alloc_domain_mem(void)
513 {
514         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
515 }
516
517 static void free_domain_mem(void *vaddr)
518 {
519         kmem_cache_free(iommu_domain_cache, vaddr);
520 }
521
522 static inline void * alloc_devinfo_mem(void)
523 {
524         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
525 }
526
527 static inline void free_devinfo_mem(void *vaddr)
528 {
529         kmem_cache_free(iommu_devinfo_cache, vaddr);
530 }
531
532 struct iova *alloc_iova_mem(void)
533 {
534         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
535 }
536
537 void free_iova_mem(struct iova *iova)
538 {
539         kmem_cache_free(iommu_iova_cache, iova);
540 }
541
542
543 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
544 {
545         unsigned long sagaw;
546         int agaw = -1;
547
548         sagaw = cap_sagaw(iommu->cap);
549         for (agaw = width_to_agaw(max_gaw);
550              agaw >= 0; agaw--) {
551                 if (test_bit(agaw, &sagaw))
552                         break;
553         }
554
555         return agaw;
556 }
557
558 /*
559  * Calculate max SAGAW for each iommu.
560  */
561 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
562 {
563         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
564 }
565
566 /*
567  * Calculate the agaw for each iommu.
568  * "SAGAW" may differ across iommus, so use a default agaw and fall back
569  * to a smaller supported agaw for iommus that don't support the default.
570  */
571 int iommu_calculate_agaw(struct intel_iommu *iommu)
572 {
573         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
574 }
575
576 /* This function only returns a single iommu in a domain */
577 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
578 {
579         int iommu_id;
580
581         /* si_domain and vm domain should not get here. */
582         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
583         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
584
585         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
586         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
587                 return NULL;
588
589         return g_iommus[iommu_id];
590 }
591
592 static void domain_update_iommu_coherency(struct dmar_domain *domain)
593 {
594         int i;
595
596         i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
597
598         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
599
600         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
601                 if (!ecap_coherent(g_iommus[i]->ecap)) {
602                         domain->iommu_coherency = 0;
603                         break;
604                 }
605         }
606 }
607
608 static void domain_update_iommu_snooping(struct dmar_domain *domain)
609 {
610         int i;
611
612         domain->iommu_snooping = 1;
613
614         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
615                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
616                         domain->iommu_snooping = 0;
617                         break;
618                 }
619         }
620 }
621
622 static void domain_update_iommu_superpage(struct dmar_domain *domain)
623 {
624         struct dmar_drhd_unit *drhd;
625         struct intel_iommu *iommu = NULL;
626         int mask = 0xf;
627
628         if (!intel_iommu_superpage) {
629                 domain->iommu_superpage = 0;
630                 return;
631         }
632
633         /* set iommu_superpage to the smallest common denominator */
634         for_each_active_iommu(iommu, drhd) {
635                 mask &= cap_super_page_val(iommu->cap);
636                 if (!mask) {
637                         break;
638                 }
639         }
640         domain->iommu_superpage = fls(mask);
641 }
642
643 /* Some capabilities may be different across iommus */
644 static void domain_update_iommu_cap(struct dmar_domain *domain)
645 {
646         domain_update_iommu_coherency(domain);
647         domain_update_iommu_snooping(domain);
648         domain_update_iommu_superpage(domain);
649 }
650
651 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
652 {
653         struct dmar_drhd_unit *drhd = NULL;
654         int i;
655
656         for_each_drhd_unit(drhd) {
657                 if (drhd->ignored)
658                         continue;
659                 if (segment != drhd->segment)
660                         continue;
661
662                 for (i = 0; i < drhd->devices_cnt; i++) {
663                         if (drhd->devices[i] &&
664                             drhd->devices[i]->bus->number == bus &&
665                             drhd->devices[i]->devfn == devfn)
666                                 return drhd->iommu;
667                         if (drhd->devices[i] &&
668                             drhd->devices[i]->subordinate &&
669                             drhd->devices[i]->subordinate->number <= bus &&
670                             drhd->devices[i]->subordinate->busn_res.end >= bus)
671                                 return drhd->iommu;
672                 }
673
674                 if (drhd->include_all)
675                         return drhd->iommu;
676         }
677
678         return NULL;
679 }
680
681 static void domain_flush_cache(struct dmar_domain *domain,
682                                void *addr, int size)
683 {
684         if (!domain->iommu_coherency)
685                 clflush_cache_range(addr, size);
686 }
687
688 /* Gets context entry for a given bus and devfn */
689 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
690                 u8 bus, u8 devfn)
691 {
692         struct root_entry *root;
693         struct context_entry *context;
694         unsigned long phy_addr;
695         unsigned long flags;
696
697         spin_lock_irqsave(&iommu->lock, flags);
698         root = &iommu->root_entry[bus];
699         context = get_context_addr_from_root(root);
700         if (!context) {
701                 context = (struct context_entry *)
702                                 alloc_pgtable_page(iommu->node);
703                 if (!context) {
704                         spin_unlock_irqrestore(&iommu->lock, flags);
705                         return NULL;
706                 }
707                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
708                 phy_addr = virt_to_phys((void *)context);
709                 set_root_value(root, phy_addr);
710                 set_root_present(root);
711                 __iommu_flush_cache(iommu, root, sizeof(*root));
712         }
713         spin_unlock_irqrestore(&iommu->lock, flags);
714         return &context[devfn];
715 }
716
717 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
718 {
719         struct root_entry *root;
720         struct context_entry *context;
721         int ret;
722         unsigned long flags;
723
724         spin_lock_irqsave(&iommu->lock, flags);
725         root = &iommu->root_entry[bus];
726         context = get_context_addr_from_root(root);
727         if (!context) {
728                 ret = 0;
729                 goto out;
730         }
731         ret = context_present(&context[devfn]);
732 out:
733         spin_unlock_irqrestore(&iommu->lock, flags);
734         return ret;
735 }
736
737 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
738 {
739         struct root_entry *root;
740         struct context_entry *context;
741         unsigned long flags;
742
743         spin_lock_irqsave(&iommu->lock, flags);
744         root = &iommu->root_entry[bus];
745         context = get_context_addr_from_root(root);
746         if (context) {
747                 context_clear_entry(&context[devfn]);
748                 __iommu_flush_cache(iommu, &context[devfn], \
749                         sizeof(*context));
750         }
751         spin_unlock_irqrestore(&iommu->lock, flags);
752 }
753
754 static void free_context_table(struct intel_iommu *iommu)
755 {
756         struct root_entry *root;
757         int i;
758         unsigned long flags;
759         struct context_entry *context;
760
761         spin_lock_irqsave(&iommu->lock, flags);
762         if (!iommu->root_entry) {
763                 goto out;
764         }
765         for (i = 0; i < ROOT_ENTRY_NR; i++) {
766                 root = &iommu->root_entry[i];
767                 context = get_context_addr_from_root(root);
768                 if (context)
769                         free_pgtable_page(context);
770         }
771         free_pgtable_page(iommu->root_entry);
772         iommu->root_entry = NULL;
773 out:
774         spin_unlock_irqrestore(&iommu->lock, flags);
775 }
776
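/*
 * Walk (and, where necessary, allocate) the page tables for @pfn down to
 * @target_level and return that PTE.  With target_level == 0 the walk stops
 * at the first superpage or non-present entry found.
 */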
777 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
778                                       unsigned long pfn, int target_level)
779 {
780         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
781         struct dma_pte *parent, *pte = NULL;
782         int level = agaw_to_level(domain->agaw);
783         int offset;
784
785         BUG_ON(!domain->pgd);
786
787         if (addr_width < BITS_PER_LONG && pfn >> addr_width)
788                 /* Address beyond IOMMU's addressing capabilities. */
789                 return NULL;
790
791         parent = domain->pgd;
792
793         while (level > 0) {
794                 void *tmp_page;
795
796                 offset = pfn_level_offset(pfn, level);
797                 pte = &parent[offset];
798                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
799                         break;
800                 if (level == target_level)
801                         break;
802
803                 if (!dma_pte_present(pte)) {
804                         uint64_t pteval;
805
806                         tmp_page = alloc_pgtable_page(domain->nid);
807
808                         if (!tmp_page)
809                                 return NULL;
810
811                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
812                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
813                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
814                                 /* Someone else set it while we were thinking; use theirs. */
815                                 free_pgtable_page(tmp_page);
816                         } else {
817                                 dma_pte_addr(pte);
818                                 domain_flush_cache(domain, pte, sizeof(*pte));
819                         }
820                 }
821                 parent = phys_to_virt(dma_pte_addr(pte));
822                 level--;
823         }
824
825         return pte;
826 }
827
828
829 /* return address's pte at specific level */
830 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
831                                          unsigned long pfn,
832                                          int level, int *large_page)
833 {
834         struct dma_pte *parent, *pte = NULL;
835         int total = agaw_to_level(domain->agaw);
836         int offset;
837
838         parent = domain->pgd;
839         while (level <= total) {
840                 offset = pfn_level_offset(pfn, total);
841                 pte = &parent[offset];
842                 if (level == total)
843                         return pte;
844
845                 if (!dma_pte_present(pte)) {
846                         *large_page = total;
847                         break;
848                 }
849
850                 if (pte->val & DMA_PTE_LARGE_PAGE) {
851                         *large_page = total;
852                         return pte;
853                 }
854
855                 parent = phys_to_virt(dma_pte_addr(pte));
856                 total--;
857         }
858         return NULL;
859 }
860
861 /* clear last level pte; a TLB flush should follow */
862 static int dma_pte_clear_range(struct dmar_domain *domain,
863                                 unsigned long start_pfn,
864                                 unsigned long last_pfn)
865 {
866         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
867         unsigned int large_page = 1;
868         struct dma_pte *first_pte, *pte;
869
870         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
871         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
872         BUG_ON(start_pfn > last_pfn);
873
874         /* we don't need lock here; nobody else touches the iova range */
875         do {
876                 large_page = 1;
877                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
878                 if (!pte) {
879                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
880                         continue;
881                 }
882                 do {
883                         dma_clear_pte(pte);
884                         start_pfn += lvl_to_nr_pages(large_page);
885                         pte++;
886                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
887
888                 domain_flush_cache(domain, first_pte,
889                                    (void *)pte - (void *)first_pte);
890
891         } while (start_pfn && start_pfn <= last_pfn);
892
893         return min_t(int, (large_page - 1) * 9, MAX_AGAW_PFN_WIDTH);
894 }
895
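/*
 * Recursively free page-table pages whose address range lies entirely
 * within [start_pfn, last_pfn], clearing the parent PTE for each page freed.
 */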
896 static void dma_pte_free_level(struct dmar_domain *domain, int level,
897                                struct dma_pte *pte, unsigned long pfn,
898                                unsigned long start_pfn, unsigned long last_pfn)
899 {
900         pfn = max(start_pfn, pfn);
901         pte = &pte[pfn_level_offset(pfn, level)];
902
903         do {
904                 unsigned long level_pfn;
905                 struct dma_pte *level_pte;
906
907                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
908                         goto next;
909
910                 level_pfn = pfn & level_mask(level - 1);
911                 level_pte = phys_to_virt(dma_pte_addr(pte));
912
913                 if (level > 2)
914                         dma_pte_free_level(domain, level - 1, level_pte,
915                                            level_pfn, start_pfn, last_pfn);
916
917                 /* If range covers entire pagetable, free it */
918                 if (!(start_pfn > level_pfn ||
919                       last_pfn < level_pfn + level_size(level))) {
920                         dma_clear_pte(pte);
921                         domain_flush_cache(domain, pte, sizeof(*pte));
922                         free_pgtable_page(level_pte);
923                 }
924 next:
925                 pfn += level_size(level);
926         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
927 }
928
929 /* free page table pages. last level pte should already be cleared */
930 static void dma_pte_free_pagetable(struct dmar_domain *domain,
931                                    unsigned long start_pfn,
932                                    unsigned long last_pfn)
933 {
934         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
935
936         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
937         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
938         BUG_ON(start_pfn > last_pfn);
939
940         /* We don't need lock here; nobody else touches the iova range */
941         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
942                            domain->pgd, 0, start_pfn, last_pfn);
943
944         /* free pgd */
945         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
946                 free_pgtable_page(domain->pgd);
947                 domain->pgd = NULL;
948         }
949 }
950
951 /* iommu handling */
952 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
953 {
954         struct root_entry *root;
955         unsigned long flags;
956
957         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
958         if (!root)
959                 return -ENOMEM;
960
961         __iommu_flush_cache(iommu, root, ROOT_SIZE);
962
963         spin_lock_irqsave(&iommu->lock, flags);
964         iommu->root_entry = root;
965         spin_unlock_irqrestore(&iommu->lock, flags);
966
967         return 0;
968 }
969
970 static void iommu_set_root_entry(struct intel_iommu *iommu)
971 {
972         void *addr;
973         u32 sts;
974         unsigned long flag;
975
976         addr = iommu->root_entry;
977
978         raw_spin_lock_irqsave(&iommu->register_lock, flag);
979         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
980
981         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
982
983         /* Make sure hardware complete it */
984         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
985                       readl, (sts & DMA_GSTS_RTPS), sts);
986
987         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
988 }
989
990 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
991 {
992         u32 val;
993         unsigned long flag;
994
995         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
996                 return;
997
998         raw_spin_lock_irqsave(&iommu->register_lock, flag);
999         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1000
1001         /* Make sure hardware complete it */
1002         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1003                       readl, (!(val & DMA_GSTS_WBFS)), val);
1004
1005         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1006 }
1007
1008 /* return value determines if we need a write buffer flush */
1009 static void __iommu_flush_context(struct intel_iommu *iommu,
1010                                   u16 did, u16 source_id, u8 function_mask,
1011                                   u64 type)
1012 {
1013         u64 val = 0;
1014         unsigned long flag;
1015
1016         switch (type) {
1017         case DMA_CCMD_GLOBAL_INVL:
1018                 val = DMA_CCMD_GLOBAL_INVL;
1019                 break;
1020         case DMA_CCMD_DOMAIN_INVL:
1021                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1022                 break;
1023         case DMA_CCMD_DEVICE_INVL:
1024                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1025                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1026                 break;
1027         default:
1028                 BUG();
1029         }
1030         val |= DMA_CCMD_ICC;
1031
1032         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1033         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1034
1035         /* Make sure hardware complete it */
1036         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1037                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1038
1039         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1040 }
1041
1042 /* return value determines if we need a write buffer flush */
1043 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1044                                 u64 addr, unsigned int size_order, u64 type)
1045 {
1046         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1047         u64 val = 0, val_iva = 0;
1048         unsigned long flag;
1049
1050         switch (type) {
1051         case DMA_TLB_GLOBAL_FLUSH:
1052                 /* global flush doesn't need to set IVA_REG */
1053                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1054                 break;
1055         case DMA_TLB_DSI_FLUSH:
1056                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1057                 break;
1058         case DMA_TLB_PSI_FLUSH:
1059                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1060                 /* Note: always flush non-leaf currently */
1061                 val_iva = size_order | addr;
1062                 break;
1063         default:
1064                 BUG();
1065         }
1066         /* Note: set drain read/write */
1067 #if 0
1068         /*
1069          * This is probably meant to be extra safe. Looks like we can
1070          * ignore it without any impact.
1071          */
1072         if (cap_read_drain(iommu->cap))
1073                 val |= DMA_TLB_READ_DRAIN;
1074 #endif
1075         if (cap_write_drain(iommu->cap))
1076                 val |= DMA_TLB_WRITE_DRAIN;
1077
1078         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1079         /* Note: Only uses first TLB reg currently */
1080         if (val_iva)
1081                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1082         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1083
1084         /* Make sure hardware complete it */
1085         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1086                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1087
1088         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1089
1090         /* check IOTLB invalidation granularity */
1091         if (DMA_TLB_IAIG(val) == 0)
1092                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1093         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1094                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1095                         (unsigned long long)DMA_TLB_IIRG(type),
1096                         (unsigned long long)DMA_TLB_IAIG(val));
1097 }
1098
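/*
 * Return the device_domain_info for (bus, devfn) if both the IOMMU and the
 * device can use Device-IOTLB (ATS): this needs Device-IOTLB support in the
 * extended capabilities, queued invalidation, the PCIe ATS capability on the
 * device and a matching ATSR unit.
 */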
1099 static struct device_domain_info *iommu_support_dev_iotlb(
1100         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1101 {
1102         int found = 0;
1103         unsigned long flags;
1104         struct device_domain_info *info;
1105         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1106
1107         if (!ecap_dev_iotlb_support(iommu->ecap))
1108                 return NULL;
1109
1110         if (!iommu->qi)
1111                 return NULL;
1112
1113         spin_lock_irqsave(&device_domain_lock, flags);
1114         list_for_each_entry(info, &domain->devices, link)
1115                 if (info->bus == bus && info->devfn == devfn) {
1116                         found = 1;
1117                         break;
1118                 }
1119         spin_unlock_irqrestore(&device_domain_lock, flags);
1120
1121         if (!found || !info->dev)
1122                 return NULL;
1123
1124         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1125                 return NULL;
1126
1127         if (!dmar_find_matched_atsr_unit(info->dev))
1128                 return NULL;
1129
1130         info->iommu = iommu;
1131
1132         return info;
1133 }
1134
1135 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1136 {
1137         if (!info)
1138                 return;
1139
1140         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1141 }
1142
1143 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1144 {
1145         if (!info->dev || !pci_ats_enabled(info->dev))
1146                 return;
1147
1148         pci_disable_ats(info->dev);
1149 }
1150
1151 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1152                                   u64 addr, unsigned mask)
1153 {
1154         u16 sid, qdep;
1155         unsigned long flags;
1156         struct device_domain_info *info;
1157
1158         spin_lock_irqsave(&device_domain_lock, flags);
1159         list_for_each_entry(info, &domain->devices, link) {
1160                 if (!info->dev || !pci_ats_enabled(info->dev))
1161                         continue;
1162
1163                 sid = info->bus << 8 | info->devfn;
1164                 qdep = pci_ats_queue_depth(info->dev);
1165                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1166         }
1167         spin_unlock_irqrestore(&device_domain_lock, flags);
1168 }
1169
1170 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1171                                   unsigned long pfn, unsigned int pages, int map)
1172 {
1173         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1174         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
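        /* e.g. pages == 5 rounds up to 8, so mask == 3 (invalidate 2^3 pages) */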
1175
1176         BUG_ON(pages == 0);
1177
1178         /*
1179          * Fall back to a domain-selective flush if there is no PSI support or
1180          * the size is too big.
1181          * PSI requires the page size to be 2 ^ x, and the base address to be
1182          * naturally aligned to that size.
1183          */
1184         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1185                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1186                                                 DMA_TLB_DSI_FLUSH);
1187         else
1188                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1189                                                 DMA_TLB_PSI_FLUSH);
1190
1191         /*
1192          * In caching mode, changes of pages from non-present to present require
1193          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1194          */
1195         if (!cap_caching_mode(iommu->cap) || !map)
1196                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1197 }
1198
1199 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1200 {
1201         u32 pmen;
1202         unsigned long flags;
1203
1204         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1205         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1206         pmen &= ~DMA_PMEN_EPM;
1207         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1208
1209         /* wait for the protected region status bit to clear */
1210         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1211                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1212
1213         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1214 }
1215
1216 static int iommu_enable_translation(struct intel_iommu *iommu)
1217 {
1218         u32 sts;
1219         unsigned long flags;
1220
1221         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1222         iommu->gcmd |= DMA_GCMD_TE;
1223         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1224
1225         /* Make sure hardware complete it */
1226         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1227                       readl, (sts & DMA_GSTS_TES), sts);
1228
1229         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1230         return 0;
1231 }
1232
1233 static int iommu_disable_translation(struct intel_iommu *iommu)
1234 {
1235         u32 sts;
1236         unsigned long flag;
1237
1238         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1239         iommu->gcmd &= ~DMA_GCMD_TE;
1240         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1241
1242         /* Make sure hardware complete it */
1243         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1244                       readl, (!(sts & DMA_GSTS_TES)), sts);
1245
1246         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1247         return 0;
1248 }
1249
1250
1251 static int iommu_init_domains(struct intel_iommu *iommu)
1252 {
1253         unsigned long ndomains;
1254         unsigned long nlongs;
1255
1256         ndomains = cap_ndoms(iommu->cap);
1257         pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1258                  iommu->seq_id, ndomains);
1259         nlongs = BITS_TO_LONGS(ndomains);
1260
1261         spin_lock_init(&iommu->lock);
1262
1263         /* TBD: there might be 64K domains,
1264          * consider other allocation for future chip
1265          */
1266         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1267         if (!iommu->domain_ids) {
1268                 pr_err("IOMMU%d: allocating domain id array failed\n",
1269                        iommu->seq_id);
1270                 return -ENOMEM;
1271         }
1272         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1273                         GFP_KERNEL);
1274         if (!iommu->domains) {
1275                 pr_err("IOMMU%d: allocating domain array failed\n",
1276                        iommu->seq_id);
1277                 kfree(iommu->domain_ids);
1278                 iommu->domain_ids = NULL;
1279                 return -ENOMEM;
1280         }
1281
1282         /*
1283          * If caching mode is set, then invalid translations are tagged
1284          * with domain id 0. Hence we need to pre-allocate it.
1285          */
1286         if (cap_caching_mode(iommu->cap))
1287                 set_bit(0, iommu->domain_ids);
1288         return 0;
1289 }
1290
1291
1292 static void domain_exit(struct dmar_domain *domain);
1293 static void vm_domain_exit(struct dmar_domain *domain);
1294
1295 void free_dmar_iommu(struct intel_iommu *iommu)
1296 {
1297         struct dmar_domain *domain;
1298         int i;
1299         unsigned long flags;
1300
1301         if ((iommu->domains) && (iommu->domain_ids)) {
1302                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1303                         domain = iommu->domains[i];
1304                         clear_bit(i, iommu->domain_ids);
1305
1306                         spin_lock_irqsave(&domain->iommu_lock, flags);
1307                         if (--domain->iommu_count == 0) {
1308                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1309                                         vm_domain_exit(domain);
1310                                 else
1311                                         domain_exit(domain);
1312                         }
1313                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1314                 }
1315         }
1316
1317         if (iommu->gcmd & DMA_GCMD_TE)
1318                 iommu_disable_translation(iommu);
1319
1320         if (iommu->irq) {
1321                 irq_set_handler_data(iommu->irq, NULL);
1322                 /* This will mask the irq */
1323                 free_irq(iommu->irq, iommu);
1324                 destroy_irq(iommu->irq);
1325         }
1326
1327         kfree(iommu->domains);
1328         kfree(iommu->domain_ids);
1329
1330         g_iommus[iommu->seq_id] = NULL;
1331
1332         /* if all iommus are freed, free g_iommus */
1333         for (i = 0; i < g_num_of_iommus; i++) {
1334                 if (g_iommus[i])
1335                         break;
1336         }
1337
1338         if (i == g_num_of_iommus)
1339                 kfree(g_iommus);
1340
1341         /* free context mapping */
1342         free_context_table(iommu);
1343 }
1344
1345 static struct dmar_domain *alloc_domain(void)
1346 {
1347         struct dmar_domain *domain;
1348
1349         domain = alloc_domain_mem();
1350         if (!domain)
1351                 return NULL;
1352
1353         domain->nid = -1;
1354         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1355         domain->flags = 0;
1356
1357         return domain;
1358 }
1359
1360 static int iommu_attach_domain(struct dmar_domain *domain,
1361                                struct intel_iommu *iommu)
1362 {
1363         int num;
1364         unsigned long ndomains;
1365         unsigned long flags;
1366
1367         ndomains = cap_ndoms(iommu->cap);
1368
1369         spin_lock_irqsave(&iommu->lock, flags);
1370
1371         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1372         if (num >= ndomains) {
1373                 spin_unlock_irqrestore(&iommu->lock, flags);
1374                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1375                 return -ENOMEM;
1376         }
1377
1378         domain->id = num;
1379         set_bit(num, iommu->domain_ids);
1380         set_bit(iommu->seq_id, domain->iommu_bmp);
1381         iommu->domains[num] = domain;
1382         spin_unlock_irqrestore(&iommu->lock, flags);
1383
1384         return 0;
1385 }
1386
1387 static void iommu_detach_domain(struct dmar_domain *domain,
1388                                 struct intel_iommu *iommu)
1389 {
1390         unsigned long flags;
1391         int num, ndomains;
1392         int found = 0;
1393
1394         spin_lock_irqsave(&iommu->lock, flags);
1395         ndomains = cap_ndoms(iommu->cap);
1396         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1397                 if (iommu->domains[num] == domain) {
1398                         found = 1;
1399                         break;
1400                 }
1401         }
1402
1403         if (found) {
1404                 clear_bit(num, iommu->domain_ids);
1405                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1406                 iommu->domains[num] = NULL;
1407         }
1408         spin_unlock_irqrestore(&iommu->lock, flags);
1409 }
1410
1411 static struct iova_domain reserved_iova_list;
1412 static struct lock_class_key reserved_rbtree_key;
1413
1414 static int dmar_init_reserved_ranges(void)
1415 {
1416         struct pci_dev *pdev = NULL;
1417         struct iova *iova;
1418         int i;
1419
1420         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1421
1422         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1423                 &reserved_rbtree_key);
1424
1425         /* IOAPIC ranges shouldn't be accessed by DMA */
1426         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1427                 IOVA_PFN(IOAPIC_RANGE_END));
1428         if (!iova) {
1429                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1430                 return -ENODEV;
1431         }
1432
1433         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1434         for_each_pci_dev(pdev) {
1435                 struct resource *r;
1436
1437                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1438                         r = &pdev->resource[i];
1439                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1440                                 continue;
1441                         iova = reserve_iova(&reserved_iova_list,
1442                                             IOVA_PFN(r->start),
1443                                             IOVA_PFN(r->end));
1444                         if (!iova) {
1445                                 printk(KERN_ERR "Reserve iova failed\n");
1446                                 return -ENODEV;
1447                         }
1448                 }
1449         }
1450         return 0;
1451 }
1452
1453 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1454 {
1455         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1456 }
1457
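/*
 * Round a guest address width up to the nearest width the page-table format
 * can express (12 + a multiple of 9 bits), e.g. gaw 36 -> 39, gaw 48 -> 48,
 * capped at 64.
 */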
1458 static inline int guestwidth_to_adjustwidth(int gaw)
1459 {
1460         int agaw;
1461         int r = (gaw - 12) % 9;
1462
1463         if (r == 0)
1464                 agaw = gaw;
1465         else
1466                 agaw = gaw + 9 - r;
1467         if (agaw > 64)
1468                 agaw = 64;
1469         return agaw;
1470 }
1471
1472 static int domain_init(struct dmar_domain *domain, int guest_width)
1473 {
1474         struct intel_iommu *iommu;
1475         int adjust_width, agaw;
1476         unsigned long sagaw;
1477
1478         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1479         spin_lock_init(&domain->iommu_lock);
1480
1481         domain_reserve_special_ranges(domain);
1482
1483         /* calculate AGAW */
1484         iommu = domain_get_iommu(domain);
1485         if (guest_width > cap_mgaw(iommu->cap))
1486                 guest_width = cap_mgaw(iommu->cap);
1487         domain->gaw = guest_width;
1488         adjust_width = guestwidth_to_adjustwidth(guest_width);
1489         agaw = width_to_agaw(adjust_width);
1490         sagaw = cap_sagaw(iommu->cap);
1491         if (!test_bit(agaw, &sagaw)) {
1492                 /* hardware doesn't support it, choose a bigger one */
1493                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1494                 agaw = find_next_bit(&sagaw, 5, agaw);
1495                 if (agaw >= 5)
1496                         return -ENODEV;
1497         }
1498         domain->agaw = agaw;
1499         INIT_LIST_HEAD(&domain->devices);
1500
1501         if (ecap_coherent(iommu->ecap))
1502                 domain->iommu_coherency = 1;
1503         else
1504                 domain->iommu_coherency = 0;
1505
1506         if (ecap_sc_support(iommu->ecap))
1507                 domain->iommu_snooping = 1;
1508         else
1509                 domain->iommu_snooping = 0;
1510
1511         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1512         domain->iommu_count = 1;
1513         domain->nid = iommu->node;
1514
1515         /* always allocate the top pgd */
1516         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1517         if (!domain->pgd)
1518                 return -ENOMEM;
1519         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1520         return 0;
1521 }
1522
1523 static void domain_exit(struct dmar_domain *domain)
1524 {
1525         struct dmar_drhd_unit *drhd;
1526         struct intel_iommu *iommu;
1527
1528         /* Domain 0 is reserved, so don't process it */
1529         if (!domain)
1530                 return;
1531
1532         /* Flush any lazy unmaps that may reference this domain */
1533         if (!intel_iommu_strict)
1534                 flush_unmaps_timeout(0);
1535
1536         domain_remove_dev_info(domain);
1537         /* destroy iovas */
1538         put_iova_domain(&domain->iovad);
1539
1540         /* clear ptes */
1541         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1542
1543         /* free page tables */
1544         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1545
1546         for_each_active_iommu(iommu, drhd)
1547                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1548                         iommu_detach_domain(domain, iommu);
1549
1550         free_domain_mem(domain);
1551 }
1552
1553 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1554                                  u8 bus, u8 devfn, int translation)
1555 {
1556         struct context_entry *context;
1557         unsigned long flags;
1558         struct intel_iommu *iommu;
1559         struct dma_pte *pgd;
1560         unsigned long num;
1561         unsigned long ndomains;
1562         int id;
1563         int agaw;
1564         struct device_domain_info *info = NULL;
1565
1566         pr_debug("Set context mapping for %02x:%02x.%d\n",
1567                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1568
1569         BUG_ON(!domain->pgd);
1570         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1571                translation != CONTEXT_TT_MULTI_LEVEL);
1572
1573         iommu = device_to_iommu(segment, bus, devfn);
1574         if (!iommu)
1575                 return -ENODEV;
1576
1577         context = device_to_context_entry(iommu, bus, devfn);
1578         if (!context)
1579                 return -ENOMEM;
1580         spin_lock_irqsave(&iommu->lock, flags);
1581         if (context_present(context)) {
1582                 spin_unlock_irqrestore(&iommu->lock, flags);
1583                 return 0;
1584         }
1585
1586         id = domain->id;
1587         pgd = domain->pgd;
1588
1589         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1590             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1591                 int found = 0;
1592
1593                 /* find an available domain id for this device in iommu */
1594                 ndomains = cap_ndoms(iommu->cap);
1595                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1596                         if (iommu->domains[num] == domain) {
1597                                 id = num;
1598                                 found = 1;
1599                                 break;
1600                         }
1601                 }
1602
1603                 if (found == 0) {
1604                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1605                         if (num >= ndomains) {
1606                                 spin_unlock_irqrestore(&iommu->lock, flags);
1607                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1608                                 return -EFAULT;
1609                         }
1610
1611                         set_bit(num, iommu->domain_ids);
1612                         iommu->domains[num] = domain;
1613                         id = num;
1614                 }
1615
1616                 /* Skip top levels of page tables for
1617                  * an iommu which has less agaw than the default.
1618                  * Unnecessary for PT mode.
1619                  */
1620                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1621                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1622                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1623                                 if (!dma_pte_present(pgd)) {
1624                                         spin_unlock_irqrestore(&iommu->lock, flags);
1625                                         return -ENOMEM;
1626                                 }
1627                         }
1628                 }
1629         }
1630
1631         context_set_domain_id(context, id);
1632
1633         if (translation != CONTEXT_TT_PASS_THROUGH) {
1634                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1635                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1636                                      CONTEXT_TT_MULTI_LEVEL;
1637         }
1638         /*
1639          * In pass through mode, AW must be programmed to indicate the largest
1640          * AGAW value supported by hardware. And ASR is ignored by hardware.
1641          */
1642         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1643                 context_set_address_width(context, iommu->msagaw);
1644         else {
1645                 context_set_address_root(context, virt_to_phys(pgd));
1646                 context_set_address_width(context, iommu->agaw);
1647         }
1648
1649         context_set_translation_type(context, translation);
1650         context_set_fault_enable(context);
1651         context_set_present(context);
1652         domain_flush_cache(domain, context, sizeof(*context));
1653
1654         /*
1655          * It's a non-present to present mapping. If hardware doesn't cache
1656          * non-present entries we only need to flush the write-buffer. If it
1657          * _does_ cache non-present entries, then it does so in the special
1658          * domain #0, which we have to flush:
1659          */
1660         if (cap_caching_mode(iommu->cap)) {
1661                 iommu->flush.flush_context(iommu, 0,
1662                                            (((u16)bus) << 8) | devfn,
1663                                            DMA_CCMD_MASK_NOBIT,
1664                                            DMA_CCMD_DEVICE_INVL);
1665                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1666         } else {
1667                 iommu_flush_write_buffer(iommu);
1668         }
1669         iommu_enable_dev_iotlb(info);
1670         spin_unlock_irqrestore(&iommu->lock, flags);
1671
1672         spin_lock_irqsave(&domain->iommu_lock, flags);
1673         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1674                 domain->iommu_count++;
1675                 if (domain->iommu_count == 1)
1676                         domain->nid = iommu->node;
1677                 domain_update_iommu_cap(domain);
1678         }
1679         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1680         return 0;
1681 }
1682
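/*
 * domain_context_mapping - map a device and its bridge path.
 *
 * Besides the device itself, every bridge between the device and its
 * upstream PCIe-to-PCI bridge (found via pci_find_upstream_pcie_bridge)
 * gets a context entry too, since devices behind such a bridge end up
 * with the bridge's source-id on their transactions.  A PCIe-to-PCI
 * bridge is mapped with its secondary bus number and devfn 0, a legacy
 * PCI bridge with its own bus/devfn.
 */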
1683 static int
1684 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1685                         int translation)
1686 {
1687         int ret;
1688         struct pci_dev *tmp, *parent;
1689
1690         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1691                                          pdev->bus->number, pdev->devfn,
1692                                          translation);
1693         if (ret)
1694                 return ret;
1695
1696         /* dependent device mapping */
1697         tmp = pci_find_upstream_pcie_bridge(pdev);
1698         if (!tmp)
1699                 return 0;
1700         /* Secondary interface's bus number and devfn 0 */
1701         parent = pdev->bus->self;
1702         while (parent != tmp) {
1703                 ret = domain_context_mapping_one(domain,
1704                                                  pci_domain_nr(parent->bus),
1705                                                  parent->bus->number,
1706                                                  parent->devfn, translation);
1707                 if (ret)
1708                         return ret;
1709                 parent = parent->bus->self;
1710         }
1711         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1712                 return domain_context_mapping_one(domain,
1713                                         pci_domain_nr(tmp->subordinate),
1714                                         tmp->subordinate->number, 0,
1715                                         translation);
1716         else /* this is a legacy PCI bridge */
1717                 return domain_context_mapping_one(domain,
1718                                                   pci_domain_nr(tmp->bus),
1719                                                   tmp->bus->number,
1720                                                   tmp->devfn,
1721                                                   translation);
1722 }
1723
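/*
 * domain_context_mapped - check, without modifying anything, whether
 * the device and every bridge on its upstream path already have
 * context entries.  Returns 0 as soon as one of them is unmapped.
 */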
1724 static int domain_context_mapped(struct pci_dev *pdev)
1725 {
1726         int ret;
1727         struct pci_dev *tmp, *parent;
1728         struct intel_iommu *iommu;
1729
1730         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1731                                 pdev->devfn);
1732         if (!iommu)
1733                 return -ENODEV;
1734
1735         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1736         if (!ret)
1737                 return ret;
1738         /* dependent device mapping */
1739         tmp = pci_find_upstream_pcie_bridge(pdev);
1740         if (!tmp)
1741                 return ret;
1742         /* Secondary interface's bus number and devfn 0 */
1743         parent = pdev->bus->self;
1744         while (parent != tmp) {
1745                 ret = device_context_mapped(iommu, parent->bus->number,
1746                                             parent->devfn);
1747                 if (!ret)
1748                         return ret;
1749                 parent = parent->bus->self;
1750         }
1751         if (pci_is_pcie(tmp))
1752                 return device_context_mapped(iommu, tmp->subordinate->number,
1753                                              0);
1754         else
1755                 return device_context_mapped(iommu, tmp->bus->number,
1756                                              tmp->devfn);
1757 }
1758
1759 /* Returns a number of VTD pages, but aligned to MM page size */
1760 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1761                                             size_t size)
1762 {
1763         host_addr &= ~PAGE_MASK;
1764         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1765 }
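/*
 * Illustrative example (assuming 4KiB MM and VT-d pages): for
 * host_addr = 0x1234 and size = 0x2000, the sub-page offset 0x234 is
 * kept, PAGE_ALIGN(0x234 + 0x2000) = 0x3000, so 3 VT-d pages are
 * needed even though size itself covers only 2 pages.
 */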
1766
1767 /* Return largest possible superpage level for a given mapping */
1768 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1769                                           unsigned long iov_pfn,
1770                                           unsigned long phy_pfn,
1771                                           unsigned long pages)
1772 {
1773         int support, level = 1;
1774         unsigned long pfnmerge;
1775
1776         support = domain->iommu_superpage;
1777
1778         /* To use a large page, the virtual *and* physical addresses
1779            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1780            of them will mean we have to use smaller pages. So just
1781            merge them and check both at once. */
1782         pfnmerge = iov_pfn | phy_pfn;
1783
1784         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1785                 pages >>= VTD_STRIDE_SHIFT;
1786                 if (!pages)
1787                         break;
1788                 pfnmerge >>= VTD_STRIDE_SHIFT;
1789                 level++;
1790                 support--;
1791         }
1792         return level;
1793 }
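/*
 * For example, when iov_pfn and phy_pfn are both 2MiB aligned (their
 * low 9 bits clear), at least 512 4KiB pages are being mapped and the
 * domain advertises iommu_superpage >= 1, the loop above returns
 * level 2, i.e. a 2MiB superpage can be used for the first chunk.
 */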
1794
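/*
 * __domain_mapping - the core PTE-filling loop.
 *
 * Maps @nr_pages starting at @iov_pfn either from a scatterlist (@sg)
 * or from a contiguous physical range starting at @phys_pfn.  For each
 * chunk it asks hardware_largepage_caps() for the largest usable
 * superpage level, installs the PTE with cmpxchg64_local() so that an
 * already-present entry is detected and reported, and flushes the
 * written PTEs with domain_flush_cache() whenever a page-table page is
 * finished, the mapping ends, or it has to fall back from a superpage.
 */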
1795 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1796                             struct scatterlist *sg, unsigned long phys_pfn,
1797                             unsigned long nr_pages, int prot)
1798 {
1799         struct dma_pte *first_pte = NULL, *pte = NULL;
1800         phys_addr_t uninitialized_var(pteval);
1801         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1802         unsigned long sg_res;
1803         unsigned int largepage_lvl = 0;
1804         unsigned long lvl_pages = 0;
1805
1806         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1807
1808         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1809                 return -EINVAL;
1810
1811         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1812
1813         if (sg)
1814                 sg_res = 0;
1815         else {
1816                 sg_res = nr_pages + 1;
1817                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1818         }
1819
1820         while (nr_pages > 0) {
1821                 uint64_t tmp;
1822
1823                 if (!sg_res) {
1824                         sg_res = aligned_nrpages(sg->offset, sg->length);
1825                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1826                         sg->dma_length = sg->length;
1827                         pteval = page_to_phys(sg_page(sg)) | prot;
1828                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1829                 }
1830
1831                 if (!pte) {
1832                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1833
1834                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1835                         if (!pte)
1836                                 return -ENOMEM;
1837                         /* It is a large page */
1838                         if (largepage_lvl > 1) {
1839                                 pteval |= DMA_PTE_LARGE_PAGE;
1840                                 /* Ensure that old small page tables are removed to make room
1841                                    for superpage, if they exist. */
1842                                 dma_pte_clear_range(domain, iov_pfn,
1843                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1844                                 dma_pte_free_pagetable(domain, iov_pfn,
1845                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1846                         } else {
1847                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1848                         }
1849
1850                 }
1851                 /* We don't need a lock here; nobody else
1852                  * touches this iova range.
1853                  */
1854                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1855                 if (tmp) {
1856                         static int dumps = 5;
1857                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1858                                iov_pfn, tmp, (unsigned long long)pteval);
1859                         if (dumps) {
1860                                 dumps--;
1861                                 debug_dma_dump_mappings(NULL);
1862                         }
1863                         WARN_ON(1);
1864                 }
1865
1866                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1867
1868                 BUG_ON(nr_pages < lvl_pages);
1869                 BUG_ON(sg_res < lvl_pages);
1870
1871                 nr_pages -= lvl_pages;
1872                 iov_pfn += lvl_pages;
1873                 phys_pfn += lvl_pages;
1874                 pteval += lvl_pages * VTD_PAGE_SIZE;
1875                 sg_res -= lvl_pages;
1876
1877                 /* If the next PTE would be the first in a new page, then we
1878                    need to flush the cache on the entries we've just written.
1879                    And then we'll need to recalculate 'pte', so clear it and
1880                    let it get set again in the if (!pte) block above.
1881
1882                    If we're done (!nr_pages) we need to flush the cache too.
1883
1884                    Also if we've been setting superpages, we may need to
1885                    recalculate 'pte' and switch back to smaller pages for the
1886                    end of the mapping, if the trailing size is not enough to
1887                    use another superpage (i.e. sg_res < lvl_pages). */
1888                 pte++;
1889                 if (!nr_pages || first_pte_in_page(pte) ||
1890                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1891                         domain_flush_cache(domain, first_pte,
1892                                            (void *)pte - (void *)first_pte);
1893                         pte = NULL;
1894                 }
1895
1896                 if (!sg_res && nr_pages)
1897                         sg = sg_next(sg);
1898         }
1899         return 0;
1900 }
1901
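/*
 * Thin wrappers around __domain_mapping(): domain_sg_mapping() maps a
 * scatterlist, domain_pfn_mapping() maps a contiguous pfn range, e.g.
 *
 *      domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
 *                         mm_to_dma_pfn(paddr_pfn), size, prot);
 *
 * as done by __intel_map_single() below.
 */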
1902 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1903                                     struct scatterlist *sg, unsigned long nr_pages,
1904                                     int prot)
1905 {
1906         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1907 }
1908
1909 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1910                                      unsigned long phys_pfn, unsigned long nr_pages,
1911                                      int prot)
1912 {
1913         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1914 }
1915
1916 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1917 {
1918         if (!iommu)
1919                 return;
1920
1921         clear_context_table(iommu, bus, devfn);
1922         iommu->flush.flush_context(iommu, 0, 0, 0,
1923                                            DMA_CCMD_GLOBAL_INVL);
1924         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1925 }
1926
1927 static inline void unlink_domain_info(struct device_domain_info *info)
1928 {
1929         assert_spin_locked(&device_domain_lock);
1930         list_del(&info->link);
1931         list_del(&info->global);
1932         if (info->dev)
1933                 info->dev->dev.archdata.iommu = NULL;
1934 }
1935
1936 static void domain_remove_dev_info(struct dmar_domain *domain)
1937 {
1938         struct device_domain_info *info;
1939         unsigned long flags;
1940         struct intel_iommu *iommu;
1941
1942         spin_lock_irqsave(&device_domain_lock, flags);
1943         while (!list_empty(&domain->devices)) {
1944                 info = list_entry(domain->devices.next,
1945                         struct device_domain_info, link);
1946                 unlink_domain_info(info);
1947                 spin_unlock_irqrestore(&device_domain_lock, flags);
1948
1949                 iommu_disable_dev_iotlb(info);
1950                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1951                 iommu_detach_dev(iommu, info->bus, info->devfn);
1952                 free_devinfo_mem(info);
1953
1954                 spin_lock_irqsave(&device_domain_lock, flags);
1955         }
1956         spin_unlock_irqrestore(&device_domain_lock, flags);
1957 }
1958
1959 /*
1960  * find_domain
1961  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1962  */
1963 static struct dmar_domain *
1964 find_domain(struct pci_dev *pdev)
1965 {
1966         struct device_domain_info *info;
1967
1968         /* No lock here, assumes no domain exit in normal case */
1969         info = pdev->dev.archdata.iommu;
1970         if (info)
1971                 return info->domain;
1972         return NULL;
1973 }
1974
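/*
 * get_domain_for_dev - find or create the dmar_domain for @pdev.
 *
 * If the device sits behind a PCIe-to-PCI bridge, the whole subtree
 * shares one domain (DOMAIN_FLAG_P2P_MULTIPLE_DEVICES) keyed on the
 * bridge's secondary bus; otherwise a fresh domain is allocated,
 * attached to the matching DRHD's IOMMU and initialized with @gaw
 * address bits.  Racing callers are resolved under device_domain_lock
 * so only one domain survives per device.
 */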
1975 /* domain is initialized */
1976 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1977 {
1978         struct dmar_domain *domain, *found = NULL;
1979         struct intel_iommu *iommu;
1980         struct dmar_drhd_unit *drhd;
1981         struct device_domain_info *info, *tmp;
1982         struct pci_dev *dev_tmp;
1983         unsigned long flags;
1984         int bus = 0, devfn = 0;
1985         int segment;
1986         int ret;
1987
1988         domain = find_domain(pdev);
1989         if (domain)
1990                 return domain;
1991
1992         segment = pci_domain_nr(pdev->bus);
1993
1994         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1995         if (dev_tmp) {
1996                 if (pci_is_pcie(dev_tmp)) {
1997                         bus = dev_tmp->subordinate->number;
1998                         devfn = 0;
1999                 } else {
2000                         bus = dev_tmp->bus->number;
2001                         devfn = dev_tmp->devfn;
2002                 }
2003                 spin_lock_irqsave(&device_domain_lock, flags);
2004                 list_for_each_entry(info, &device_domain_list, global) {
2005                         if (info->segment == segment &&
2006                             info->bus == bus && info->devfn == devfn) {
2007                                 found = info->domain;
2008                                 break;
2009                         }
2010                 }
2011                 spin_unlock_irqrestore(&device_domain_lock, flags);
2012                 /* pcie-pci bridge already has a domain, use it */
2013                 if (found) {
2014                         domain = found;
2015                         goto found_domain;
2016                 }
2017         }
2018
2019         domain = alloc_domain();
2020         if (!domain)
2021                 goto error;
2022
2023         /* Allocate new domain for the device */
2024         drhd = dmar_find_matched_drhd_unit(pdev);
2025         if (!drhd) {
2026                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2027                         pci_name(pdev));
2028                 free_domain_mem(domain);
2029                 return NULL;
2030         }
2031         iommu = drhd->iommu;
2032
2033         ret = iommu_attach_domain(domain, iommu);
2034         if (ret) {
2035                 free_domain_mem(domain);
2036                 goto error;
2037         }
2038
2039         if (domain_init(domain, gaw)) {
2040                 domain_exit(domain);
2041                 goto error;
2042         }
2043
2044         /* register pcie-to-pci device */
2045         if (dev_tmp) {
2046                 info = alloc_devinfo_mem();
2047                 if (!info) {
2048                         domain_exit(domain);
2049                         goto error;
2050                 }
2051                 info->segment = segment;
2052                 info->bus = bus;
2053                 info->devfn = devfn;
2054                 info->dev = NULL;
2055                 info->domain = domain;
2056                 /* This domain is shared by devices under the p2p bridge */
2057                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2058
2059                 /* pcie-to-pci bridge already has a domain, use it */
2060                 found = NULL;
2061                 spin_lock_irqsave(&device_domain_lock, flags);
2062                 list_for_each_entry(tmp, &device_domain_list, global) {
2063                         if (tmp->segment == segment &&
2064                             tmp->bus == bus && tmp->devfn == devfn) {
2065                                 found = tmp->domain;
2066                                 break;
2067                         }
2068                 }
2069                 if (found) {
2070                         spin_unlock_irqrestore(&device_domain_lock, flags);
2071                         free_devinfo_mem(info);
2072                         domain_exit(domain);
2073                         domain = found;
2074                 } else {
2075                         list_add(&info->link, &domain->devices);
2076                         list_add(&info->global, &device_domain_list);
2077                         spin_unlock_irqrestore(&device_domain_lock, flags);
2078                 }
2079         }
2080
2081 found_domain:
2082         info = alloc_devinfo_mem();
2083         if (!info)
2084                 goto error;
2085         info->segment = segment;
2086         info->bus = pdev->bus->number;
2087         info->devfn = pdev->devfn;
2088         info->dev = pdev;
2089         info->domain = domain;
2090         spin_lock_irqsave(&device_domain_lock, flags);
2091         /* somebody else was faster and set it up already */
2092         found = find_domain(pdev);
2093         if (found != NULL) {
2094                 spin_unlock_irqrestore(&device_domain_lock, flags);
2095                 if (found != domain) {
2096                         domain_exit(domain);
2097                         domain = found;
2098                 }
2099                 free_devinfo_mem(info);
2100                 return domain;
2101         }
2102         list_add(&info->link, &domain->devices);
2103         list_add(&info->global, &device_domain_list);
2104         pdev->dev.archdata.iommu = info;
2105         spin_unlock_irqrestore(&device_domain_lock, flags);
2106         return domain;
2107 error:
2108         /* recheck it here, maybe others set it */
2109         return find_domain(pdev);
2110 }
2111
2112 static int iommu_identity_mapping;
2113 #define IDENTMAP_ALL            1
2114 #define IDENTMAP_GFX            2
2115 #define IDENTMAP_AZALIA         4
2116
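/*
 * iommu_domain_identity_map - install a 1:1 mapping for [start, end].
 *
 * Reserves the corresponding IOVA range, clears any PTEs that may
 * already cover it (an RMRR can overlap physical memory) and then maps
 * every page onto itself with read and write permission.
 */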
2117 static int iommu_domain_identity_map(struct dmar_domain *domain,
2118                                      unsigned long long start,
2119                                      unsigned long long end)
2120 {
2121         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2122         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2123
2124         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2125                           dma_to_mm_pfn(last_vpfn))) {
2126                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2127                 return -ENOMEM;
2128         }
2129
2130         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2131                  start, end, domain->id);
2132         /*
2133          * RMRR range might have overlap with physical memory range,
2134          * clear it first
2135          */
2136         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2137
2138         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2139                                   last_vpfn - first_vpfn + 1,
2140                                   DMA_PTE_READ|DMA_PTE_WRITE);
2141 }
2142
2143 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2144                                       unsigned long long start,
2145                                       unsigned long long end)
2146 {
2147         struct dmar_domain *domain;
2148         int ret;
2149
2150         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2151         if (!domain)
2152                 return -ENOMEM;
2153
2154         /* For _hardware_ passthrough, don't bother. But for software
2155            passthrough, we do it anyway -- it may indicate a memory
2156            range which is reserved in E820, and so didn't get set
2157            up to start with in si_domain */
2158         if (domain == si_domain && hw_pass_through) {
2159                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2160                        pci_name(pdev), start, end);
2161                 return 0;
2162         }
2163
2164         printk(KERN_INFO
2165                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2166                pci_name(pdev), start, end);
2167
2168         if (end < start) {
2169                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2170                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2171                         dmi_get_system_info(DMI_BIOS_VENDOR),
2172                         dmi_get_system_info(DMI_BIOS_VERSION),
2173                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2174                 ret = -EIO;
2175                 goto error;
2176         }
2177
2178         if (end >> agaw_to_width(domain->agaw)) {
2179                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2180                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2181                      agaw_to_width(domain->agaw),
2182                      dmi_get_system_info(DMI_BIOS_VENDOR),
2183                      dmi_get_system_info(DMI_BIOS_VERSION),
2184                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2185                 ret = -EIO;
2186                 goto error;
2187         }
2188
2189         ret = iommu_domain_identity_map(domain, start, end);
2190         if (ret)
2191                 goto error;
2192
2193         /* context entry init */
2194         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2195         if (ret)
2196                 goto error;
2197
2198         return 0;
2199
2200  error:
2201         domain_exit(domain);
2202         return ret;
2203 }
2204
2205 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2206         struct pci_dev *pdev)
2207 {
2208         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2209                 return 0;
2210         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2211                 rmrr->end_address);
2212 }
2213
2214 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2215 static inline void iommu_prepare_isa(void)
2216 {
2217         struct pci_dev *pdev;
2218         int ret;
2219
2220         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2221         if (!pdev)
2222                 return;
2223
2224         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2225         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2226
2227         if (ret)
2228                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2229                        "floppy might not work\n");
2230
2231 }
2232 #else
2233 static inline void iommu_prepare_isa(void)
2234 {
2235         return;
2236 }
2237 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2238
2239 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2240
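/*
 * si_domain_init - set up the static identity (si) domain.
 *
 * The domain is attached to every active IOMMU and flagged
 * DOMAIN_FLAG_STATIC_IDENTITY.  With hardware pass-through (@hw != 0)
 * no page tables are needed; otherwise every usable memory range of
 * every online node is identity mapped into it.
 */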
2241 static int __init si_domain_init(int hw)
2242 {
2243         struct dmar_drhd_unit *drhd;
2244         struct intel_iommu *iommu;
2245         int nid, ret = 0;
2246
2247         si_domain = alloc_domain();
2248         if (!si_domain)
2249                 return -EFAULT;
2250
2251         for_each_active_iommu(iommu, drhd) {
2252                 ret = iommu_attach_domain(si_domain, iommu);
2253                 if (ret) {
2254                         domain_exit(si_domain);
2255                         return -EFAULT;
2256                 }
2257         }
2258
2259         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2260                 domain_exit(si_domain);
2261                 return -EFAULT;
2262         }
2263
2264         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2265         pr_debug("IOMMU: identity mapping domain is domain %d\n",
2266                  si_domain->id);
2267
2268         if (hw)
2269                 return 0;
2270
2271         for_each_online_node(nid) {
2272                 unsigned long start_pfn, end_pfn;
2273                 int i;
2274
2275                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2276                         ret = iommu_domain_identity_map(si_domain,
2277                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2278                         if (ret)
2279                                 return ret;
2280                 }
2281         }
2282
2283         return 0;
2284 }
2285
2286 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2287                                           struct pci_dev *pdev);
2288 static int identity_mapping(struct pci_dev *pdev)
2289 {
2290         struct device_domain_info *info;
2291
2292         if (likely(!iommu_identity_mapping))
2293                 return 0;
2294
2295         info = pdev->dev.archdata.iommu;
2296         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2297                 return (info->domain == si_domain);
2298
2299         return 0;
2300 }
2301
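/*
 * domain_add_dev_info - attach @pdev to @domain.
 *
 * Allocates the device_domain_info bookkeeping, links it into the
 * domain and global lists, and programs the context entries via
 * domain_context_mapping(); everything is unwound if that fails.
 * The static identity code, for instance, calls it as
 *
 *      domain_add_dev_info(si_domain, pdev,
 *                          hw ? CONTEXT_TT_PASS_THROUGH
 *                             : CONTEXT_TT_MULTI_LEVEL);
 */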
2302 static int domain_add_dev_info(struct dmar_domain *domain,
2303                                struct pci_dev *pdev,
2304                                int translation)
2305 {
2306         struct device_domain_info *info;
2307         unsigned long flags;
2308         int ret;
2309
2310         info = alloc_devinfo_mem();
2311         if (!info)
2312                 return -ENOMEM;
2313
2314         info->segment = pci_domain_nr(pdev->bus);
2315         info->bus = pdev->bus->number;
2316         info->devfn = pdev->devfn;
2317         info->dev = pdev;
2318         info->domain = domain;
2319
2320         spin_lock_irqsave(&device_domain_lock, flags);
2321         list_add(&info->link, &domain->devices);
2322         list_add(&info->global, &device_domain_list);
2323         pdev->dev.archdata.iommu = info;
2324         spin_unlock_irqrestore(&device_domain_lock, flags);
2325
2326         ret = domain_context_mapping(domain, pdev, translation);
2327         if (ret) {
2328                 spin_lock_irqsave(&device_domain_lock, flags);
2329                 unlink_domain_info(info);
2330                 spin_unlock_irqrestore(&device_domain_lock, flags);
2331                 free_devinfo_mem(info);
2332                 return ret;
2333         }
2334
2335         return 0;
2336 }
2337
2338 static bool device_has_rmrr(struct pci_dev *dev)
2339 {
2340         struct dmar_rmrr_unit *rmrr;
2341         int i;
2342
2343         for_each_rmrr_units(rmrr) {
2344                 for (i = 0; i < rmrr->devices_cnt; i++) {
2345                         /*
2346                          * Return TRUE if this RMRR contains the device that
2347                          * is passed in.
2348                          */
2349                         if (rmrr->devices[i] == dev)
2350                                 return true;
2351                 }
2352         }
2353         return false;
2354 }
2355
2356 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2357 {
2358
2359         /*
2360          * We want to prevent any device associated with an RMRR from
2361          * getting placed into the SI Domain. This is done because
2362          * problems exist when devices are moved in and out of domains
2363          * and their respective RMRR info is lost. We exempt USB devices
2364          * from this process due to their usage of RMRRs that are known
2365          * to not be needed after BIOS hand-off to OS.
2366          */
2367         if (device_has_rmrr(pdev) &&
2368             (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2369                 return 0;
2370
2371         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2372                 return 1;
2373
2374         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2375                 return 1;
2376
2377         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2378                 return 0;
2379
2380         /*
2381          * We want to start off with all devices in the 1:1 domain, and
2382          * take them out later if we find they can't access all of memory.
2383          *
2384          * However, we can't do this for PCI devices behind bridges,
2385          * because all PCI devices behind the same bridge will end up
2386          * with the same source-id on their transactions.
2387          *
2388          * Practically speaking, we can't change things around for these
2389          * devices at run-time, because we can't be sure there'll be no
2390          * DMA transactions in flight for any of their siblings.
2391          * 
2392          * So PCI devices (unless they're on the root bus) as well as
2393          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2394          * the 1:1 domain, just in _case_ one of their siblings turns out
2395          * not to be able to map all of memory.
2396          */
2397         if (!pci_is_pcie(pdev)) {
2398                 if (!pci_is_root_bus(pdev->bus))
2399                         return 0;
2400                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2401                         return 0;
2402         } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2403                 return 0;
2404
2405         /* 
2406          * At boot time, we don't yet know if devices will be 64-bit capable.
2407          * Assume that they will -- if they turn out not to be, then we can 
2408          * take them out of the 1:1 domain later.
2409          */
2410         if (!startup) {
2411                 /*
2412                  * If the device's dma_mask is less than the system's memory
2413                  * size then this is not a candidate for identity mapping.
2414                  */
2415                 u64 dma_mask = pdev->dma_mask;
2416
2417                 if (pdev->dev.coherent_dma_mask &&
2418                     pdev->dev.coherent_dma_mask < dma_mask)
2419                         dma_mask = pdev->dev.coherent_dma_mask;
2420
2421                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2422         }
2423
2424         return 1;
2425 }
2426
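/*
 * Create the si domain and pre-populate it with every PCI device that
 * iommu_should_identity_map() approves at startup.  Hardware
 * pass-through uses CONTEXT_TT_PASS_THROUGH context entries; software
 * identity mapping falls back to ordinary multi-level translation.
 */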
2427 static int __init iommu_prepare_static_identity_mapping(int hw)
2428 {
2429         struct pci_dev *pdev = NULL;
2430         int ret;
2431
2432         ret = si_domain_init(hw);
2433         if (ret)
2434                 return -EFAULT;
2435
2436         for_each_pci_dev(pdev) {
2437                 if (iommu_should_identity_map(pdev, 1)) {
2438                         ret = domain_add_dev_info(si_domain, pdev,
2439                                              hw ? CONTEXT_TT_PASS_THROUGH :
2440                                                   CONTEXT_TT_MULTI_LEVEL);
2441                         if (ret) {
2442                                 /* device not associated with an iommu */
2443                                 if (ret == -ENODEV)
2444                                         continue;
2445                                 return ret;
2446                         }
2447                         pr_info("IOMMU: %s identity mapping for device %s\n",
2448                                 hw ? "hardware" : "software", pci_name(pdev));
2449                 }
2450         }
2451
2452         return 0;
2453 }
2454
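/*
 * init_dmars - bring up DMA remapping on every DRHD unit.
 *
 * Roughly: count the IOMMUs and allocate the global g_iommus and
 * deferred_flush arrays, then per IOMMU set up the domain-id tables
 * and root entry, pick queued or register-based invalidation, build
 * the static identity map if requested, map the RMRR regions and the
 * ISA range, and finally enable fault reporting and translation.
 */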
2455 static int __init init_dmars(void)
2456 {
2457         struct dmar_drhd_unit *drhd;
2458         struct dmar_rmrr_unit *rmrr;
2459         struct pci_dev *pdev;
2460         struct intel_iommu *iommu;
2461         int i, ret;
2462
2463         /*
2464          * for each drhd
2465          *    allocate root
2466          *    initialize and program root entry to not present
2467          * endfor
2468          */
2469         for_each_drhd_unit(drhd) {
2470                 /*
2471                  * lock not needed as this is only incremented in the single-
2472                  * threaded kernel __init code path; all other accesses are
2473                  * read-only
2474                  */
2475                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2476                         g_num_of_iommus++;
2477                         continue;
2478                 }
2479                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2480                           IOMMU_UNITS_SUPPORTED);
2481         }
2482
2483         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2484                         GFP_KERNEL);
2485         if (!g_iommus) {
2486                 printk(KERN_ERR "Allocating global iommu array failed\n");
2487                 ret = -ENOMEM;
2488                 goto error;
2489         }
2490
2491         deferred_flush = kzalloc(g_num_of_iommus *
2492                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2493         if (!deferred_flush) {
2494                 ret = -ENOMEM;
2495                 goto error;
2496         }
2497
2498         for_each_drhd_unit(drhd) {
2499                 if (drhd->ignored)
2500                         continue;
2501
2502                 iommu = drhd->iommu;
2503                 g_iommus[iommu->seq_id] = iommu;
2504
2505                 ret = iommu_init_domains(iommu);
2506                 if (ret)
2507                         goto error;
2508
2509                 /*
2510                  * TBD:
2511                  * we could share the same root & context tables
2512                  * among all IOMMUs. Need to split it later.
2513                  */
2514                 ret = iommu_alloc_root_entry(iommu);
2515                 if (ret) {
2516                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2517                         goto error;
2518                 }
2519                 if (!ecap_pass_through(iommu->ecap))
2520                         hw_pass_through = 0;
2521         }
2522
2523         /*
2524          * Start from a sane iommu hardware state.
2525          */
2526         for_each_drhd_unit(drhd) {
2527                 if (drhd->ignored)
2528                         continue;
2529
2530                 iommu = drhd->iommu;
2531
2532                 /*
2533                  * If the queued invalidation is already initialized by us
2534                  * (for example, while enabling interrupt-remapping) then
2535                  * we already have things rolling from a sane state.
2536                  */
2537                 if (iommu->qi)
2538                         continue;
2539
2540                 /*
2541                  * Clear any previous faults.
2542                  */
2543                 dmar_fault(-1, iommu);
2544                 /*
2545                  * Disable queued invalidation if supported and already enabled
2546                  * before OS handover.
2547                  */
2548                 dmar_disable_qi(iommu);
2549         }
2550
2551         for_each_drhd_unit(drhd) {
2552                 if (drhd->ignored)
2553                         continue;
2554
2555                 iommu = drhd->iommu;
2556
2557                 if (dmar_enable_qi(iommu)) {
2558                         /*
2559                          * Queued Invalidate not enabled, use Register Based
2560                          * Invalidate
2561                          */
2562                         iommu->flush.flush_context = __iommu_flush_context;
2563                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2564                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2565                                "invalidation\n",
2566                                 iommu->seq_id,
2567                                (unsigned long long)drhd->reg_base_addr);
2568                 } else {
2569                         iommu->flush.flush_context = qi_flush_context;
2570                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2571                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2572                                "invalidation\n",
2573                                 iommu->seq_id,
2574                                (unsigned long long)drhd->reg_base_addr);
2575                 }
2576         }
2577
2578         if (iommu_pass_through)
2579                 iommu_identity_mapping |= IDENTMAP_ALL;
2580
2581 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2582         iommu_identity_mapping |= IDENTMAP_GFX;
2583 #endif
2584
2585         check_tylersburg_isoch();
2586
2587         /*
2588          * If pass-through is not set or not enabled, set up context entries
2589          * for identity mappings for rmrr, gfx, and isa, possibly falling back
2590          * to the static identity mapping if iommu_identity_mapping is set.
2591          */
2592         if (iommu_identity_mapping) {
2593                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2594                 if (ret) {
2595                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2596                         goto error;
2597                 }
2598         }
2599         /*
2600          * For each rmrr
2601          *   for each dev attached to rmrr
2602          *   do
2603          *     locate drhd for dev, alloc domain for dev
2604          *     allocate free domain
2605          *     allocate page table entries for rmrr
2606          *     if context not allocated for bus
2607          *           allocate and init context
2608          *           set present in root table for this bus
2609          *     init context with domain, translation etc
2610          *    endfor
2611          * endfor
2612          */
2613         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2614         for_each_rmrr_units(rmrr) {
2615                 for (i = 0; i < rmrr->devices_cnt; i++) {
2616                         pdev = rmrr->devices[i];
2617                         /*
2618                          * some BIOSes list non-existent devices in the
2619                          * DMAR table.
2620                          */
2621                         if (!pdev)
2622                                 continue;
2623                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2624                         if (ret)
2625                                 printk(KERN_ERR
2626                                        "IOMMU: mapping reserved region failed\n");
2627                 }
2628         }
2629
2630         iommu_prepare_isa();
2631
2632         /*
2633          * for each drhd
2634          *   enable fault log
2635          *   global invalidate context cache
2636          *   global invalidate iotlb
2637          *   enable translation
2638          */
2639         for_each_drhd_unit(drhd) {
2640                 if (drhd->ignored) {
2641                         /*
2642                          * we always have to disable PMRs or DMA may fail on
2643                          * this device
2644                          */
2645                         if (force_on)
2646                                 iommu_disable_protect_mem_regions(drhd->iommu);
2647                         continue;
2648                 }
2649                 iommu = drhd->iommu;
2650
2651                 iommu_flush_write_buffer(iommu);
2652
2653                 ret = dmar_set_interrupt(iommu);
2654                 if (ret)
2655                         goto error;
2656
2657                 iommu_set_root_entry(iommu);
2658
2659                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2660                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2661
2662                 ret = iommu_enable_translation(iommu);
2663                 if (ret)
2664                         goto error;
2665
2666                 iommu_disable_protect_mem_regions(iommu);
2667         }
2668
2669         return 0;
2670 error:
2671         for_each_drhd_unit(drhd) {
2672                 if (drhd->ignored)
2673                         continue;
2674                 iommu = drhd->iommu;
2675                 free_iommu(iommu);
2676         }
2677         kfree(g_iommus);
2678         return ret;
2679 }
2680
2681 /* This takes a number of _MM_ pages, not VTD pages */
2682 static struct iova *intel_alloc_iova(struct device *dev,
2683                                      struct dmar_domain *domain,
2684                                      unsigned long nrpages, uint64_t dma_mask)
2685 {
2686         struct pci_dev *pdev = to_pci_dev(dev);
2687         struct iova *iova = NULL;
2688
2689         /* Restrict dma_mask to the width that the iommu can handle */
2690         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2691
2692         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2693                 /*
2694                  * First try to allocate an io virtual address in
2695                  * DMA_BIT_MASK(32) and if that fails then try allocating
2696                  * from higher range
2697                  * from the higher range
2698                 iova = alloc_iova(&domain->iovad, nrpages,
2699                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2700                 if (iova)
2701                         return iova;
2702         }
2703         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2704         if (unlikely(!iova)) {
2705                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2706                        nrpages, pci_name(pdev));
2707                 return NULL;
2708         }
2709
2710         return iova;
2711 }
2712
2713 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2714 {
2715         struct dmar_domain *domain;
2716         int ret;
2717
2718         domain = get_domain_for_dev(pdev,
2719                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2720         if (!domain) {
2721                 printk(KERN_ERR
2722                         "Allocating domain for %s failed\n", pci_name(pdev));
2723                 return NULL;
2724         }
2725
2726         /* make sure context mapping is ok */
2727         if (unlikely(!domain_context_mapped(pdev))) {
2728                 ret = domain_context_mapping(domain, pdev,
2729                                              CONTEXT_TT_MULTI_LEVEL);
2730                 if (ret) {
2731                         printk(KERN_ERR
2732                                 "Domain context map for %s failed\n",
2733                                 pci_name(pdev));
2734                         return NULL;
2735                 }
2736         }
2737
2738         return domain;
2739 }
2740
2741 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2742 {
2743         struct device_domain_info *info;
2744
2745         /* No lock here, assumes no domain exit in normal case */
2746         info = dev->dev.archdata.iommu;
2747         if (likely(info))
2748                 return info->domain;
2749
2750         return __get_valid_domain_for_dev(dev);
2751 }
2752
2753 static int iommu_dummy(struct pci_dev *pdev)
2754 {
2755         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2756 }
2757
2758 /* Check if the pdev needs to go through the non-identity map and unmap process. */
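/*
 * Identity-mapped devices bypass the map/unmap paths entirely.  The
 * decision can change at run time: a device that turns out to be
 * 32-bit-only DMA is dropped from si_domain here, while a 64-bit
 * capable device that is no longer assigned elsewhere is added back
 * with domain_add_dev_info().
 */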
2759 static int iommu_no_mapping(struct device *dev)
2760 {
2761         struct pci_dev *pdev;
2762         int found;
2763
2764         if (unlikely(!dev_is_pci(dev)))
2765                 return 1;
2766
2767         pdev = to_pci_dev(dev);
2768         if (iommu_dummy(pdev))
2769                 return 1;
2770
2771         if (!iommu_identity_mapping)
2772                 return 0;
2773
2774         found = identity_mapping(pdev);
2775         if (found) {
2776                 if (iommu_should_identity_map(pdev, 0))
2777                         return 1;
2778                 else {
2779                         /*
2780                          * The 32 bit DMA device is removed from si_domain and
2781                          * falls back to non-identity mapping.
2782                          */
2783                         domain_remove_one_dev_info(si_domain, pdev);
2784                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2785                                pci_name(pdev));
2786                         return 0;
2787                 }
2788         } else {
2789                 /*
2790                  * In case a 64 bit DMA device is detached from a VM, the device
2791                  * is put back into si_domain for identity mapping.
2792                  */
2793                 if (iommu_should_identity_map(pdev, 0)) {
2794                         int ret;
2795                         ret = domain_add_dev_info(si_domain, pdev,
2796                                                   hw_pass_through ?
2797                                                   CONTEXT_TT_PASS_THROUGH :
2798                                                   CONTEXT_TT_MULTI_LEVEL);
2799                         if (!ret) {
2800                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2801                                        pci_name(pdev));
2802                                 return 1;
2803                         }
2804                 }
2805         }
2806
2807         return 0;
2808 }
2809
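/*
 * __intel_map_single - common helper for the map_page/map_single paths.
 *
 * Identity-mapped devices simply get the physical address back.
 * Otherwise an IOVA window (sized in MM pages) is allocated below the
 * device's dma_mask, mapped onto the physical range with
 * domain_pfn_mapping(), and the returned DMA address is the IOVA base
 * plus the sub-page offset of @paddr.
 */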
2810 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2811                                      size_t size, int dir, u64 dma_mask)
2812 {
2813         struct pci_dev *pdev = to_pci_dev(hwdev);
2814         struct dmar_domain *domain;
2815         phys_addr_t start_paddr;
2816         struct iova *iova;
2817         int prot = 0;
2818         int ret;
2819         struct intel_iommu *iommu;
2820         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2821
2822         BUG_ON(dir == DMA_NONE);
2823
2824         if (iommu_no_mapping(hwdev))
2825                 return paddr;
2826
2827         domain = get_valid_domain_for_dev(pdev);
2828         if (!domain)
2829                 return 0;
2830
2831         iommu = domain_get_iommu(domain);
2832         size = aligned_nrpages(paddr, size);
2833
2834         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2835         if (!iova)
2836                 goto error;
2837
2838         /*
2839          * Check if DMAR supports zero-length reads on write-only
2840          * mappings.
2841          */
2842         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2843                         !cap_zlr(iommu->cap))
2844                 prot |= DMA_PTE_READ;
2845         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2846                 prot |= DMA_PTE_WRITE;
2847         /*
2848          * paddr .. (paddr + size) might cover a partial page, so we should map
2849          * the whole page.  Note: if two parts of one page are separately mapped,
2850          * we might have two guest_addrs mapping to the same host paddr, but this
2851          * is not a big problem
2852          */
2853         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2854                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2855         if (ret)
2856                 goto error;
2857
2858         /* it's a non-present to present mapping. Only flush if caching mode */
2859         if (cap_caching_mode(iommu->cap))
2860                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2861         else
2862                 iommu_flush_write_buffer(iommu);
2863
2864         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2865         start_paddr += paddr & ~PAGE_MASK;
2866         return start_paddr;
2867
2868 error:
2869         if (iova)
2870                 __free_iova(&domain->iovad, iova);
2871         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2872                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2873         return 0;
2874 }
2875
2876 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2877                                  unsigned long offset, size_t size,
2878                                  enum dma_data_direction dir,
2879                                  struct dma_attrs *attrs)
2880 {
2881         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2882                                   dir, to_pci_dev(dev)->dma_mask);
2883 }
2884
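/*
 * Deferred unmap handling: unless intel_iommu_strict is set,
 * intel_unmap_page() queues freed IOVAs through add_unmap() instead of
 * flushing the IOTLB synchronously.  flush_unmaps() then runs from the
 * unmap_timer (see flush_unmaps_timeout) or once HIGH_WATER_MARK
 * entries have accumulated, flushes the IOTLB (globally per IOMMU, or
 * per entry in caching mode) and only then returns the IOVAs to the
 * allocator.
 */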
2885 static void flush_unmaps(void)
2886 {
2887         int i, j;
2888
2889         timer_on = 0;
2890
2891         /* just flush them all */
2892         for (i = 0; i < g_num_of_iommus; i++) {
2893                 struct intel_iommu *iommu = g_iommus[i];
2894                 if (!iommu)
2895                         continue;
2896
2897                 if (!deferred_flush[i].next)
2898                         continue;
2899
2900                 /* In caching mode, global flushes make emulation expensive */
2901                 if (!cap_caching_mode(iommu->cap))
2902                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2903                                          DMA_TLB_GLOBAL_FLUSH);
2904                 for (j = 0; j < deferred_flush[i].next; j++) {
2905                         unsigned long mask;
2906                         struct iova *iova = deferred_flush[i].iova[j];
2907                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2908
2909                         /* On real hardware multiple invalidations are expensive */
2910                         if (cap_caching_mode(iommu->cap))
2911                                 iommu_flush_iotlb_psi(iommu, domain->id,
2912                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2913                         else {
2914                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2915                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2916                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2917                         }
2918                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2919                 }
2920                 deferred_flush[i].next = 0;
2921         }
2922
2923         list_size = 0;
2924 }
2925
2926 static void flush_unmaps_timeout(unsigned long data)
2927 {
2928         unsigned long flags;
2929
2930         spin_lock_irqsave(&async_umap_flush_lock, flags);
2931         flush_unmaps();
2932         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2933 }
2934
2935 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2936 {
2937         unsigned long flags;
2938         int next, iommu_id;
2939         struct intel_iommu *iommu;
2940
2941         spin_lock_irqsave(&async_umap_flush_lock, flags);
2942         if (list_size == HIGH_WATER_MARK)
2943                 flush_unmaps();
2944
2945         iommu = domain_get_iommu(dom);
2946         iommu_id = iommu->seq_id;
2947
2948         next = deferred_flush[iommu_id].next;
2949         deferred_flush[iommu_id].domain[next] = dom;
2950         deferred_flush[iommu_id].iova[next] = iova;
2951         deferred_flush[iommu_id].next++;
2952
2953         if (!timer_on) {
2954                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2955                 timer_on = 1;
2956         }
2957         list_size++;
2958         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2959 }
2960
2961 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2962                              size_t size, enum dma_data_direction dir,
2963                              struct dma_attrs *attrs)
2964 {
2965         struct pci_dev *pdev = to_pci_dev(dev);
2966         struct dmar_domain *domain;
2967         unsigned long start_pfn, last_pfn;
2968         struct iova *iova;
2969         struct intel_iommu *iommu;
2970
2971         if (iommu_no_mapping(dev))
2972                 return;
2973
2974         domain = find_domain(pdev);
2975         BUG_ON(!domain);
2976
2977         iommu = domain_get_iommu(domain);
2978
2979         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2980         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2981                       (unsigned long long)dev_addr))
2982                 return;
2983
2984         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2985         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2986
2987         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2988                  pci_name(pdev), start_pfn, last_pfn);
2989
2990         /*  clear the whole page */
2991         dma_pte_clear_range(domain, start_pfn, last_pfn);
2992
2993         /* free page tables */
2994         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2995
2996         if (intel_iommu_strict) {
2997                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2998                                       last_pfn - start_pfn + 1, 0);
2999                 /* free iova */
3000                 __free_iova(&domain->iovad, iova);
3001         } else {
3002                 add_unmap(domain, iova);
3003                 /*
3004                  * queue up the release of the unmap to save the 1/6th of the
3005                  * cpu used up by the iotlb flush operation...
3006                  */
3007         }
3008 }
3009
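/*
 * dma_map_ops ->alloc: allocate and zero pages (dropping to GFP_DMA or
 * GFP_DMA32 only for devices that bypass the IOMMU and have a small
 * coherent mask) and map them DMA_BIDIRECTIONAL via __intel_map_single().
 */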
3010 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3011                                   dma_addr_t *dma_handle, gfp_t flags,
3012                                   struct dma_attrs *attrs)
3013 {
3014         void *vaddr;
3015         int order;
3016
3017         size = PAGE_ALIGN(size);
3018         order = get_order(size);
3019
3020         if (!iommu_no_mapping(hwdev))
3021                 flags &= ~(GFP_DMA | GFP_DMA32);
3022         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3023                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3024                         flags |= GFP_DMA;
3025                 else
3026                         flags |= GFP_DMA32;
3027         }
3028
3029         vaddr = (void *)__get_free_pages(flags, order);
3030         if (!vaddr)
3031                 return NULL;
3032         memset(vaddr, 0, size);
3033
3034         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3035                                          DMA_BIDIRECTIONAL,
3036                                          hwdev->coherent_dma_mask);
3037         if (*dma_handle)
3038                 return vaddr;
3039         free_pages((unsigned long)vaddr, order);
3040         return NULL;
3041 }
3042
3043 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3044                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3045 {
3046         int order;
3047
3048         size = PAGE_ALIGN(size);
3049         order = get_order(size);
3050
3051         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3052         free_pages((unsigned long)vaddr, order);
3053 }
3054
3055 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3056                            int nelems, enum dma_data_direction dir,
3057                            struct dma_attrs *attrs)
3058 {
3059         struct pci_dev *pdev = to_pci_dev(hwdev);
3060         struct dmar_domain *domain;
3061         unsigned long start_pfn, last_pfn;
3062         struct iova *iova;
3063         struct intel_iommu *iommu;
3064
3065         if (iommu_no_mapping(hwdev))
3066                 return;
3067
3068         domain = find_domain(pdev);
3069         BUG_ON(!domain);
3070
3071         iommu = domain_get_iommu(domain);
3072
3073         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3074         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3075                       (unsigned long long)sglist[0].dma_address))
3076                 return;
3077
3078         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3079         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3080
3081         /*  clear the whole page */
3082         dma_pte_clear_range(domain, start_pfn, last_pfn);
3083
3084         /* free page tables */
3085         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3086
3087         if (intel_iommu_strict) {
3088                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3089                                       last_pfn - start_pfn + 1, 0);
3090                 /* free iova */
3091                 __free_iova(&domain->iovad, iova);
3092         } else {
3093                 add_unmap(domain, iova);
3094                 /*
3095                  * queue up the release of the unmap to save the 1/6th of the
3096                  * cpu used up by the iotlb flush operation...
3097                  */
3098         }
3099 }
3100
3101 static int intel_nontranslate_map_sg(struct device *hwdev,
3102         struct scatterlist *sglist, int nelems, int dir)
3103 {
3104         int i;
3105         struct scatterlist *sg;
3106
3107         for_each_sg(sglist, sg, nelems, i) {
3108                 BUG_ON(!sg_page(sg));
3109                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3110                 sg->dma_length = sg->length;
3111         }
3112         return nelems;
3113 }
3114
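/*
 * dma_map_ops ->map_sg: for translated devices, allocate a single IOVA
 * range covering the whole scatterlist, map every segment with
 * domain_sg_mapping(), then flush the IOTLB (caching mode) or the write
 * buffer.  Returns the number of mapped elements, or 0 on failure.
 */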
3115 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3116                         enum dma_data_direction dir, struct dma_attrs *attrs)
3117 {
3118         int i;
3119         struct pci_dev *pdev = to_pci_dev(hwdev);
3120         struct dmar_domain *domain;
3121         size_t size = 0;
3122         int prot = 0;
3123         struct iova *iova = NULL;
3124         int ret;
3125         struct scatterlist *sg;
3126         unsigned long start_vpfn;
3127         struct intel_iommu *iommu;
3128
3129         BUG_ON(dir == DMA_NONE);
3130         if (iommu_no_mapping(hwdev))
3131                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3132
3133         domain = get_valid_domain_for_dev(pdev);
3134         if (!domain)
3135                 return 0;
3136
3137         iommu = domain_get_iommu(domain);
3138
3139         for_each_sg(sglist, sg, nelems, i)
3140                 size += aligned_nrpages(sg->offset, sg->length);
3141
3142         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3143                                 pdev->dma_mask);
3144         if (!iova) {
3145                 sglist->dma_length = 0;
3146                 return 0;
3147         }
3148
3149         /*
3150          * Check if DMAR supports zero-length reads on write-only
3151          * mappings.
3152          */
3153         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3154                         !cap_zlr(iommu->cap))
3155                 prot |= DMA_PTE_READ;
3156         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3157                 prot |= DMA_PTE_WRITE;
3158
3159         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3160
3161         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3162         if (unlikely(ret)) {
3163                 /*  clear the page */
3164                 dma_pte_clear_range(domain, start_vpfn,
3165                                     start_vpfn + size - 1);
3166                 /* free page tables */
3167                 dma_pte_free_pagetable(domain, start_vpfn,
3168                                        start_vpfn + size - 1);
3169                 /* free iova */
3170                 __free_iova(&domain->iovad, iova);
3171                 return 0;
3172         }
3173
3174         /* it's a non-present to present mapping. Only flush if caching mode */
3175         if (cap_caching_mode(iommu->cap))
3176                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3177         else
3178                 iommu_flush_write_buffer(iommu);
3179
3180         return nelems;
3181 }
3182
3183 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3184 {
3185         return !dma_addr;
3186 }
3187
3188 struct dma_map_ops intel_dma_ops = {
3189         .alloc = intel_alloc_coherent,
3190         .free = intel_free_coherent,
3191         .map_sg = intel_map_sg,
3192         .unmap_sg = intel_unmap_sg,
3193         .map_page = intel_map_page,
3194         .unmap_page = intel_unmap_page,
3195         .mapping_error = intel_mapping_error,
3196 };
3197
3198 static inline int iommu_domain_cache_init(void)
3199 {
3200         int ret = 0;
3201
3202         iommu_domain_cache = kmem_cache_create("iommu_domain",
3203                                          sizeof(struct dmar_domain),
3204                                          0,
3205                                          SLAB_HWCACHE_ALIGN,
3207                                          NULL);
3208         if (!iommu_domain_cache) {
3209                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3210                 ret = -ENOMEM;
3211         }
3212
3213         return ret;
3214 }
3215
3216 static inline int iommu_devinfo_cache_init(void)
3217 {
3218         int ret = 0;
3219
3220         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3221                                          sizeof(struct device_domain_info),
3222                                          0,
3223                                          SLAB_HWCACHE_ALIGN,
3224                                          NULL);
3225         if (!iommu_devinfo_cache) {
3226                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3227                 ret = -ENOMEM;
3228         }
3229
3230         return ret;
3231 }
3232
3233 static inline int iommu_iova_cache_init(void)
3234 {
3235         int ret = 0;
3236
3237         iommu_iova_cache = kmem_cache_create("iommu_iova",
3238                                          sizeof(struct iova),
3239                                          0,
3240                                          SLAB_HWCACHE_ALIGN,
3241                                          NULL);
3242         if (!iommu_iova_cache) {
3243                 printk(KERN_ERR "Couldn't create iova cache\n");
3244                 ret = -ENOMEM;
3245         }
3246
3247         return ret;
3248 }
3249
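/*
 * Create the iova, dmar_domain and device_domain_info kmem caches,
 * tearing down the ones already created if a later one fails.
 */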
3250 static int __init iommu_init_mempool(void)
3251 {
3252         int ret;
3253         ret = iommu_iova_cache_init();
3254         if (ret)
3255                 return ret;
3256
3257         ret = iommu_domain_cache_init();
3258         if (ret)
3259                 goto domain_error;
3260
3261         ret = iommu_devinfo_cache_init();
3262         if (!ret)
3263                 return ret;
3264
3265         kmem_cache_destroy(iommu_domain_cache);
3266 domain_error:
3267         kmem_cache_destroy(iommu_iova_cache);
3268
3269         return -ENOMEM;
3270 }
3271
3272 static void __init iommu_exit_mempool(void)
3273 {
3274         kmem_cache_destroy(iommu_devinfo_cache);
3275         kmem_cache_destroy(iommu_domain_cache);
3276         kmem_cache_destroy(iommu_iova_cache);
3277
3278 }
3279
3280 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3281 {
3282         struct dmar_drhd_unit *drhd;
3283         u32 vtbar;
3284         int rc;
3285
3286         /* We know that this device on this chipset has its own IOMMU.
3287          * If we find it under a different IOMMU, then the BIOS is lying
3288          * to us. Hope that the IOMMU for this device is actually
3289          * disabled, and it needs no translation...
3290          */
3291         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3292         if (rc) {
3293                 /* "can't" happen */
3294                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3295                 return;
3296         }
3297         vtbar &= 0xffff0000;
3298
3299         /* we know that this iommu should be at offset 0xa000 from vtbar */
3300         drhd = dmar_find_matched_drhd_unit(pdev);
3301         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3302                             TAINT_FIRMWARE_WORKAROUND,
3303                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3304                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3305 }
3306 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3307
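/*
 * Mark DRHD units that cover no PCI devices as ignored.  Units covering
 * only graphics devices either set intel_iommu_gfx_mapped or, when gfx
 * mapping is disabled, are ignored and their devices tagged with
 * DUMMY_DEVICE_DOMAIN_INFO so they bypass translation.
 */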
3308 static void __init init_no_remapping_devices(void)
3309 {
3310         struct dmar_drhd_unit *drhd;
3311
3312         for_each_drhd_unit(drhd) {
3313                 if (!drhd->include_all) {
3314                         int i;
3315                         for (i = 0; i < drhd->devices_cnt; i++)
3316                                 if (drhd->devices[i] != NULL)
3317                                         break;
3318                         /* ignore DMAR unit if no pci devices exist */
3319                         if (i == drhd->devices_cnt)
3320                                 drhd->ignored = 1;
3321                 }
3322         }
3323
3324         for_each_drhd_unit(drhd) {
3325                 int i;
3326                 if (drhd->ignored || drhd->include_all)
3327                         continue;
3328
3329                 for (i = 0; i < drhd->devices_cnt; i++)
3330                         if (drhd->devices[i] &&
3331                             !IS_GFX_DEVICE(drhd->devices[i]))
3332                                 break;
3333
3334                 if (i < drhd->devices_cnt)
3335                         continue;
3336
3337                 /* This IOMMU has *only* gfx devices. Either bypass it or
3338                    set the gfx_mapped flag, as appropriate */
3339                 if (dmar_map_gfx) {
3340                         intel_iommu_gfx_mapped = 1;
3341                 } else {
3342                         drhd->ignored = 1;
3343                         for (i = 0; i < drhd->devices_cnt; i++) {
3344                                 if (!drhd->devices[i])
3345                                         continue;
3346                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3347                         }
3348                 }
3349         }
3350 }
3351
3352 #ifdef CONFIG_SUSPEND
3353 static int init_iommu_hw(void)
3354 {
3355         struct dmar_drhd_unit *drhd;
3356         struct intel_iommu *iommu = NULL;
3357
3358         for_each_active_iommu(iommu, drhd)
3359                 if (iommu->qi)
3360                         dmar_reenable_qi(iommu);
3361
3362         for_each_iommu(iommu, drhd) {
3363                 if (drhd->ignored) {
3364                         /*
3365                          * we always have to disable PMRs or DMA may fail on
3366                          * this device
3367                          */
3368                         if (force_on)
3369                                 iommu_disable_protect_mem_regions(iommu);
3370                         continue;
3371                 }
3372
3373                 iommu_flush_write_buffer(iommu);
3374
3375                 iommu_set_root_entry(iommu);
3376
3377                 iommu->flush.flush_context(iommu, 0, 0, 0,
3378                                            DMA_CCMD_GLOBAL_INVL);
3379                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3380                                          DMA_TLB_GLOBAL_FLUSH);
3381                 if (iommu_enable_translation(iommu))
3382                         return 1;
3383                 iommu_disable_protect_mem_regions(iommu);
3384         }
3385
3386         return 0;
3387 }
3388
3389 static void iommu_flush_all(void)
3390 {
3391         struct dmar_drhd_unit *drhd;
3392         struct intel_iommu *iommu;
3393
3394         for_each_active_iommu(iommu, drhd) {
3395                 iommu->flush.flush_context(iommu, 0, 0, 0,
3396                                            DMA_CCMD_GLOBAL_INVL);
3397                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3398                                          DMA_TLB_GLOBAL_FLUSH);
3399         }
3400 }
3401
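/*
 * System suspend: flush all IOMMUs, disable translation and save the
 * fault-event registers (FECTL/FEDATA/FEADDR/FEUADDR) of every active
 * IOMMU so iommu_resume() can restore them.
 */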
3402 static int iommu_suspend(void)
3403 {
3404         struct dmar_drhd_unit *drhd;
3405         struct intel_iommu *iommu = NULL;
3406         unsigned long flag;
3407
3408         for_each_active_iommu(iommu, drhd) {
3409                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3410                                                  GFP_ATOMIC);
3411                 if (!iommu->iommu_state)
3412                         goto nomem;
3413         }
3414
3415         iommu_flush_all();
3416
3417         for_each_active_iommu(iommu, drhd) {
3418                 iommu_disable_translation(iommu);
3419
3420                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3421
3422                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3423                         readl(iommu->reg + DMAR_FECTL_REG);
3424                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3425                         readl(iommu->reg + DMAR_FEDATA_REG);
3426                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3427                         readl(iommu->reg + DMAR_FEADDR_REG);
3428                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3429                         readl(iommu->reg + DMAR_FEUADDR_REG);
3430
3431                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3432         }
3433         return 0;
3434
3435 nomem:
3436         for_each_active_iommu(iommu, drhd)
3437                 kfree(iommu->iommu_state);
3438
3439         return -ENOMEM;
3440 }
3441
3442 static void iommu_resume(void)
3443 {
3444         struct dmar_drhd_unit *drhd;
3445         struct intel_iommu *iommu = NULL;
3446         unsigned long flag;
3447
3448         if (init_iommu_hw()) {
3449                 if (force_on)
3450                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3451                 else
3452                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3453                 return;
3454         }
3455
3456         for_each_active_iommu(iommu, drhd) {
3457
3458                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3459
3460                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3461                         iommu->reg + DMAR_FECTL_REG);
3462                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3463                         iommu->reg + DMAR_FEDATA_REG);
3464                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3465                         iommu->reg + DMAR_FEADDR_REG);
3466                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3467                         iommu->reg + DMAR_FEUADDR_REG);
3468
3469                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3470         }
3471
3472         for_each_active_iommu(iommu, drhd)
3473                 kfree(iommu->iommu_state);
3474 }
3475
3476 static struct syscore_ops iommu_syscore_ops = {
3477         .resume         = iommu_resume,
3478         .suspend        = iommu_suspend,
3479 };
3480
3481 static void __init init_iommu_pm_ops(void)
3482 {
3483         register_syscore_ops(&iommu_syscore_ops);
3484 }
3485
3486 #else
3487 static inline void init_iommu_pm_ops(void) {}
3488 #endif  /* CONFIG_SUSPEND */
3489
3490 LIST_HEAD(dmar_rmrr_units);
3491
3492 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3493 {
3494         list_add(&rmrr->list, &dmar_rmrr_units);
3495 }
3496
3497
3498 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3499 {
3500         struct acpi_dmar_reserved_memory *rmrr;
3501         struct dmar_rmrr_unit *rmrru;
3502
3503         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3504         if (!rmrru)
3505                 return -ENOMEM;
3506
3507         rmrru->hdr = header;
3508         rmrr = (struct acpi_dmar_reserved_memory *)header;
3509         rmrru->base_address = rmrr->base_address;
3510         rmrru->end_address = rmrr->end_address;
3511
3512         dmar_register_rmrr_unit(rmrru);
3513         return 0;
3514 }
3515
3516 static int __init
3517 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3518 {
3519         struct acpi_dmar_reserved_memory *rmrr;
3520         int ret;
3521
3522         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3523         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3524                 ((void *)rmrr) + rmrr->header.length,
3525                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3526
3527         if (ret || (rmrru->devices_cnt == 0)) {
3528                 list_del(&rmrru->list);
3529                 kfree(rmrru);
3530         }
3531         return ret;
3532 }
3533
3534 static LIST_HEAD(dmar_atsr_units);
3535
3536 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3537 {
3538         struct acpi_dmar_atsr *atsr;
3539         struct dmar_atsr_unit *atsru;
3540
3541         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3542         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3543         if (!atsru)
3544                 return -ENOMEM;
3545
3546         atsru->hdr = hdr;
3547         atsru->include_all = atsr->flags & 0x1;
3548
3549         list_add(&atsru->list, &dmar_atsr_units);
3550
3551         return 0;
3552 }
3553
3554 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3555 {
3556         int rc;
3557         struct acpi_dmar_atsr *atsr;
3558
3559         if (atsru->include_all)
3560                 return 0;
3561
3562         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3563         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3564                                 (void *)atsr + atsr->header.length,
3565                                 &atsru->devices_cnt, &atsru->devices,
3566                                 atsr->segment);
3567         if (rc || !atsru->devices_cnt) {
3568                 list_del(&atsru->list);
3569                 kfree(atsru);
3570         }
3571
3572         return rc;
3573 }
3574
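/*
 * Return 1 if the device (or its physical function) sits below a PCIe
 * root port listed in the ATSR unit for its segment, or if that unit is
 * INCLUDE_ALL; return 0 otherwise.
 */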
3575 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3576 {
3577         int i;
3578         struct pci_bus *bus;
3579         struct acpi_dmar_atsr *atsr;
3580         struct dmar_atsr_unit *atsru;
3581
3582         dev = pci_physfn(dev);
3583
3584         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3585                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3586                 if (atsr->segment == pci_domain_nr(dev->bus))
3587                         goto found;
3588         }
3589
3590         return 0;
3591
3592 found:
3593         for (bus = dev->bus; bus; bus = bus->parent) {
3594                 struct pci_dev *bridge = bus->self;
3595
3596                 if (!bridge || !pci_is_pcie(bridge) ||
3597                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3598                         return 0;
3599
3600                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT) {
3601                         for (i = 0; i < atsru->devices_cnt; i++)
3602                                 if (atsru->devices[i] == bridge)
3603                                         return 1;
3604                         break;
3605                 }
3606         }
3607
3608         if (atsru->include_all)
3609                 return 1;
3610
3611         return 0;
3612 }
3613
3614 int __init dmar_parse_rmrr_atsr_dev(void)
3615 {
3616         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3617         struct dmar_atsr_unit *atsr, *atsr_n;
3618         int ret = 0;
3619
3620         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3621                 ret = rmrr_parse_dev(rmrr);
3622                 if (ret)
3623                         return ret;
3624         }
3625
3626         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3627                 ret = atsr_parse_dev(atsr);
3628                 if (ret)
3629                         return ret;
3630         }
3631
3632         return ret;
3633 }
3634
3635 /*
3636  * Here we only respond to a device being unbound from its driver.
3637  *
3638  * A newly added device is not attached to its DMAR domain here yet; that
3639  * happens when the device is first mapped to an iova.
3640  */
3641 static int device_notifier(struct notifier_block *nb,
3642                                   unsigned long action, void *data)
3643 {
3644         struct device *dev = data;
3645         struct pci_dev *pdev = to_pci_dev(dev);
3646         struct dmar_domain *domain;
3647
3648         if (iommu_no_mapping(dev))
3649                 return 0;
3650
3651         domain = find_domain(pdev);
3652         if (!domain)
3653                 return 0;
3654
3655         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3656                 domain_remove_one_dev_info(domain, pdev);
3657
3658                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3659                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3660                     list_empty(&domain->devices))
3661                         domain_exit(domain);
3662         }
3663
3664         return 0;
3665 }
3666
3667 static struct notifier_block device_nb = {
3668         .notifier_call = device_notifier,
3669 };
3670
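/*
 * Top-level VT-d initialisation: parse the DMAR tables, disable any
 * translation left enabled by firmware, set up memory pools and reserved
 * IOVA ranges, run init_dmars(), then switch dma_ops to intel_dma_ops and
 * register the suspend/resume hooks, the PCI bus iommu_ops and the
 * driver-unbind notifier.
 */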
3671 int __init intel_iommu_init(void)
3672 {
3673         int ret = 0;
3674         struct dmar_drhd_unit *drhd;
3675
3676         /* VT-d is required for a TXT/tboot launch, so enforce that */
3677         force_on = tboot_force_iommu();
3678
3679         if (dmar_table_init()) {
3680                 if (force_on)
3681                         panic("tboot: Failed to initialize DMAR table\n");
3682                 return  -ENODEV;
3683         }
3684
3685         /*
3686          * Disable translation if already enabled prior to OS handover.
3687          */
3688         for_each_drhd_unit(drhd) {
3689                 struct intel_iommu *iommu;
3690
3691                 if (drhd->ignored)
3692                         continue;
3693
3694                 iommu = drhd->iommu;
3695                 if (iommu->gcmd & DMA_GCMD_TE)
3696                         iommu_disable_translation(iommu);
3697         }
3698
3699         if (dmar_dev_scope_init() < 0) {
3700                 if (force_on)
3701                         panic("tboot: Failed to initialize DMAR device scope\n");
3702                 return  -ENODEV;
3703         }
3704
3705         if (no_iommu || dmar_disabled)
3706                 return -ENODEV;
3707
3708         if (iommu_init_mempool()) {
3709                 if (force_on)
3710                         panic("tboot: Failed to initialize iommu memory\n");
3711                 return  -ENODEV;
3712         }
3713
3714         if (list_empty(&dmar_rmrr_units))
3715                 printk(KERN_INFO "DMAR: No RMRR found\n");
3716
3717         if (list_empty(&dmar_atsr_units))
3718                 printk(KERN_INFO "DMAR: No ATSR found\n");
3719
3720         if (dmar_init_reserved_ranges()) {
3721                 if (force_on)
3722                         panic("tboot: Failed to reserve iommu ranges\n");
3723                 return  -ENODEV;
3724         }
3725
3726         init_no_remapping_devices();
3727
3728         ret = init_dmars();
3729         if (ret) {
3730                 if (force_on)
3731                         panic("tboot: Failed to initialize DMARs\n");
3732                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3733                 put_iova_domain(&reserved_iova_list);
3734                 iommu_exit_mempool();
3735                 return ret;
3736         }
3737         printk(KERN_INFO
3738         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3739
3740         init_timer(&unmap_timer);
3741 #ifdef CONFIG_SWIOTLB
3742         swiotlb = 0;
3743 #endif
3744         dma_ops = &intel_dma_ops;
3745
3746         init_iommu_pm_ops();
3747
3748         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3749
3750         bus_register_notifier(&pci_bus_type, &device_nb);
3751
3752         intel_iommu_enabled = 1;
3753
3754         return 0;
3755 }
3756
3757 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3758                                            struct pci_dev *pdev)
3759 {
3760         struct pci_dev *tmp, *parent;
3761
3762         if (!iommu || !pdev)
3763                 return;
3764
3765         /* dependent device detach */
3766         tmp = pci_find_upstream_pcie_bridge(pdev);
3767         /* Secondary interface's bus number and devfn 0 */
3768         if (tmp) {
3769                 parent = pdev->bus->self;
3770                 while (parent != tmp) {
3771                         iommu_detach_dev(iommu, parent->bus->number,
3772                                          parent->devfn);
3773                         parent = parent->bus->self;
3774                 }
3775                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3776                         iommu_detach_dev(iommu,
3777                                 tmp->subordinate->number, 0);
3778                 else /* this is a legacy PCI bridge */
3779                         iommu_detach_dev(iommu, tmp->bus->number,
3780                                          tmp->devfn);
3781         }
3782 }
3783
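/*
 * Detach one PCI device from @domain: tear down its context entry (and
 * those of any upstream bridges), free its device_domain_info, and, if no
 * other device on the same IOMMU remains in the domain, drop that IOMMU
 * from the domain and release the domain id (ordinary DMA domains only).
 */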
3784 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3785                                           struct pci_dev *pdev)
3786 {
3787         struct device_domain_info *info, *tmp;
3788         struct intel_iommu *iommu;
3789         unsigned long flags;
3790         int found = 0;
3791
3792         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3793                                 pdev->devfn);
3794         if (!iommu)
3795                 return;
3796
3797         spin_lock_irqsave(&device_domain_lock, flags);
3798         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
3799                 if (info->segment == pci_domain_nr(pdev->bus) &&
3800                     info->bus == pdev->bus->number &&
3801                     info->devfn == pdev->devfn) {
3802                         unlink_domain_info(info);
3803                         spin_unlock_irqrestore(&device_domain_lock, flags);
3804
3805                         iommu_disable_dev_iotlb(info);
3806                         iommu_detach_dev(iommu, info->bus, info->devfn);
3807                         iommu_detach_dependent_devices(iommu, pdev);
3808                         free_devinfo_mem(info);
3809
3810                         spin_lock_irqsave(&device_domain_lock, flags);
3811
3812                         if (found)
3813                                 break;
3814                         else
3815                                 continue;
3816                 }
3817
3818                 /* if there are no other devices under the same iommu
3819                  * owned by this domain, clear this iommu in iommu_bmp,
3820                  * update iommu count and coherency
3821                  */
3822                 if (iommu == device_to_iommu(info->segment, info->bus,
3823                                             info->devfn))
3824                         found = 1;
3825         }
3826
3827         spin_unlock_irqrestore(&device_domain_lock, flags);
3828
3829         if (found == 0) {
3830                 unsigned long tmp_flags;
3831                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3832                 clear_bit(iommu->seq_id, domain->iommu_bmp);
3833                 domain->iommu_count--;
3834                 domain_update_iommu_cap(domain);
3835                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3836
3837                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3838                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3839                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3840                         clear_bit(domain->id, iommu->domain_ids);
3841                         iommu->domains[domain->id] = NULL;
3842                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3843                 }
3844         }
3845 }
3846
3847 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3848 {
3849         struct device_domain_info *info;
3850         struct intel_iommu *iommu;
3851         unsigned long flags1, flags2;
3852
3853         spin_lock_irqsave(&device_domain_lock, flags1);
3854         while (!list_empty(&domain->devices)) {
3855                 info = list_entry(domain->devices.next,
3856                         struct device_domain_info, link);
3857                 unlink_domain_info(info);
3858                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3859
3860                 iommu_disable_dev_iotlb(info);
3861                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3862                 iommu_detach_dev(iommu, info->bus, info->devfn);
3863                 iommu_detach_dependent_devices(iommu, info->dev);
3864
3865                 /* clear this iommu in iommu_bmp, update iommu count
3866                  * and capabilities
3867                  */
3868                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3869                 if (test_and_clear_bit(iommu->seq_id,
3870                                        domain->iommu_bmp)) {
3871                         domain->iommu_count--;
3872                         domain_update_iommu_cap(domain);
3873                 }
3874                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3875
3876                 free_devinfo_mem(info);
3877                 spin_lock_irqsave(&device_domain_lock, flags1);
3878         }
3879         spin_unlock_irqrestore(&device_domain_lock, flags1);
3880 }
3881
3882 /* domain id for virtual machine domains; it is never written to a context entry */
3883 static atomic_t vm_domid = ATOMIC_INIT(0);
3884
3885 static struct dmar_domain *iommu_alloc_vm_domain(void)
3886 {
3887         struct dmar_domain *domain;
3888
3889         domain = alloc_domain_mem();
3890         if (!domain)
3891                 return NULL;
3892
3893         domain->id = atomic_inc_return(&vm_domid);
3894         domain->nid = -1;
3895         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3896         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3897
3898         return domain;
3899 }
3900
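/*
 * Initialise a domain created through the IOMMU API (not yet bound to any
 * hardware IOMMU): set up its IOVA allocator and reserved ranges, derive
 * the AGAW from @guest_width and allocate the top-level page directory.
 */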
3901 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3902 {
3903         int adjust_width;
3904
3905         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3906         spin_lock_init(&domain->iommu_lock);
3907
3908         domain_reserve_special_ranges(domain);
3909
3910         /* calculate AGAW */
3911         domain->gaw = guest_width;
3912         adjust_width = guestwidth_to_adjustwidth(guest_width);
3913         domain->agaw = width_to_agaw(adjust_width);
3914
3915         INIT_LIST_HEAD(&domain->devices);
3916
3917         domain->iommu_count = 0;
3918         domain->iommu_coherency = 0;
3919         domain->iommu_snooping = 0;
3920         domain->iommu_superpage = 0;
3921         domain->max_addr = 0;
3922         domain->nid = -1;
3923
3924         /* always allocate the top pgd */
3925         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3926         if (!domain->pgd)
3927                 return -ENOMEM;
3928         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3929         return 0;
3930 }
3931
3932 static void iommu_free_vm_domain(struct dmar_domain *domain)
3933 {
3934         unsigned long flags;
3935         struct dmar_drhd_unit *drhd;
3936         struct intel_iommu *iommu;
3937         unsigned long i;
3938         unsigned long ndomains;
3939
3940         for_each_drhd_unit(drhd) {
3941                 if (drhd->ignored)
3942                         continue;
3943                 iommu = drhd->iommu;
3944
3945                 ndomains = cap_ndoms(iommu->cap);
3946                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3947                         if (iommu->domains[i] == domain) {
3948                                 spin_lock_irqsave(&iommu->lock, flags);
3949                                 clear_bit(i, iommu->domain_ids);
3950                                 iommu->domains[i] = NULL;
3951                                 spin_unlock_irqrestore(&iommu->lock, flags);
3952                                 break;
3953                         }
3954                 }
3955         }
3956 }
3957
3958 static void vm_domain_exit(struct dmar_domain *domain)
3959 {
3960         /* Domain 0 is reserved, so don't process it */
3961         if (!domain)
3962                 return;
3963
3964         vm_domain_remove_all_dev_info(domain);
3965         /* destroy iovas */
3966         put_iova_domain(&domain->iovad);
3967
3968         /* clear ptes */
3969         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3970
3971         /* free page tables */
3972         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3973
3974         iommu_free_vm_domain(domain);
3975         free_domain_mem(domain);
3976 }
3977
3978 static int intel_iommu_domain_init(struct iommu_domain *domain)
3979 {
3980         struct dmar_domain *dmar_domain;
3981
3982         dmar_domain = iommu_alloc_vm_domain();
3983         if (!dmar_domain) {
3984                 printk(KERN_ERR
3985                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3986                 return -ENOMEM;
3987         }
3988         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3989                 printk(KERN_ERR
3990                         "intel_iommu_domain_init() failed\n");
3991                 vm_domain_exit(dmar_domain);
3992                 return -ENOMEM;
3993         }
3994         domain_update_iommu_cap(dmar_domain);
3995         domain->priv = dmar_domain;
3996
3997         domain->geometry.aperture_start = 0;
3998         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3999         domain->geometry.force_aperture = true;
4000
4001         return 0;
4002 }
4003
4004 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
4005 {
4006         struct dmar_domain *dmar_domain = domain->priv;
4007
4008         domain->priv = NULL;
4009         vm_domain_exit(dmar_domain);
4010 }
4011
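/*
 * iommu_ops ->attach_dev: detach the device from any previous domain,
 * check that the IOMMU's address width covers the domain's highest mapped
 * address, strip surplus page-table levels if the domain's AGAW exceeds
 * the IOMMU's, then add the device with multi-level translation.
 */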
4012 static int intel_iommu_attach_device(struct iommu_domain *domain,
4013                                      struct device *dev)
4014 {
4015         struct dmar_domain *dmar_domain = domain->priv;
4016         struct pci_dev *pdev = to_pci_dev(dev);
4017         struct intel_iommu *iommu;
4018         int addr_width;
4019
4020         /* normally pdev is not mapped */
4021         if (unlikely(domain_context_mapped(pdev))) {
4022                 struct dmar_domain *old_domain;
4023
4024                 old_domain = find_domain(pdev);
4025                 if (old_domain) {
4026                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
4027                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
4028                                 domain_remove_one_dev_info(old_domain, pdev);
4029                         else
4030                                 domain_remove_dev_info(old_domain);
4031                 }
4032         }
4033
4034         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
4035                                 pdev->devfn);
4036         if (!iommu)
4037                 return -ENODEV;
4038
4039         /* check if this iommu agaw is sufficient for max mapped address */
4040         addr_width = agaw_to_width(iommu->agaw);
4041         if (addr_width > cap_mgaw(iommu->cap))
4042                 addr_width = cap_mgaw(iommu->cap);
4043
4044         if (dmar_domain->max_addr > (1LL << addr_width)) {
4045                 printk(KERN_ERR "%s: iommu width (%d) is not "
4046                        "sufficient for the mapped address (%llx)\n",
4047                        __func__, addr_width, dmar_domain->max_addr);
4048                 return -EFAULT;
4049         }
4050         dmar_domain->gaw = addr_width;
4051
4052         /*
4053          * Knock out extra levels of page tables if necessary
4054          */
4055         while (iommu->agaw < dmar_domain->agaw) {
4056                 struct dma_pte *pte;
4057
4058                 pte = dmar_domain->pgd;
4059                 if (dma_pte_present(pte)) {
4060                         dmar_domain->pgd = (struct dma_pte *)
4061                                 phys_to_virt(dma_pte_addr(pte));
4062                         free_pgtable_page(pte);
4063                 }
4064                 dmar_domain->agaw--;
4065         }
4066
4067         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4068 }
4069
4070 static void intel_iommu_detach_device(struct iommu_domain *domain,
4071                                       struct device *dev)
4072 {
4073         struct dmar_domain *dmar_domain = domain->priv;
4074         struct pci_dev *pdev = to_pci_dev(dev);
4075
4076         domain_remove_one_dev_info(dmar_domain, pdev);
4077 }
4078
4079 static int intel_iommu_map(struct iommu_domain *domain,
4080                            unsigned long iova, phys_addr_t hpa,
4081                            size_t size, int iommu_prot)
4082 {
4083         struct dmar_domain *dmar_domain = domain->priv;
4084         u64 max_addr;
4085         int prot = 0;
4086         int ret;
4087
4088         if (iommu_prot & IOMMU_READ)
4089                 prot |= DMA_PTE_READ;
4090         if (iommu_prot & IOMMU_WRITE)
4091                 prot |= DMA_PTE_WRITE;
4092         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4093                 prot |= DMA_PTE_SNP;
4094
4095         max_addr = iova + size;
4096         if (dmar_domain->max_addr < max_addr) {
4097                 u64 end;
4098
4099                 /* check if minimum agaw is sufficient for mapped address */
4100                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4101                 if (end < max_addr) {
4102                         printk(KERN_ERR "%s: iommu width (%d) is not "
4103                                "sufficient for the mapped address (%llx)\n",
4104                                __func__, dmar_domain->gaw, max_addr);
4105                         return -EFAULT;
4106                 }
4107                 dmar_domain->max_addr = max_addr;
4108         }
4109         /* Round up size to next multiple of PAGE_SIZE, if it and
4110            the low bits of hpa would take us onto the next page */
4111         size = aligned_nrpages(hpa, size);
4112         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4113                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4114         return ret;
4115 }
4116
4117 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4118                              unsigned long iova, size_t size)
4119 {
4120         struct dmar_domain *dmar_domain = domain->priv;
4121         int order;
4122
4123         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4124                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4125
4126         if (dmar_domain->max_addr == iova + size)
4127                 dmar_domain->max_addr = iova;
4128
4129         return PAGE_SIZE << order;
4130 }
4131
4132 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4133                                             dma_addr_t iova)
4134 {
4135         struct dmar_domain *dmar_domain = domain->priv;
4136         struct dma_pte *pte;
4137         u64 phys = 0;
4138
4139         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4140         if (pte)
4141                 phys = dma_pte_addr(pte);
4142
4143         return phys;
4144 }
4145
4146 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4147                                       unsigned long cap)
4148 {
4149         struct dmar_domain *dmar_domain = domain->priv;
4150
4151         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4152                 return dmar_domain->iommu_snooping;
4153         if (cap == IOMMU_CAP_INTR_REMAP)
4154                 return irq_remapping_enabled;
4155
4156         return 0;
4157 }
4158
4159 #define REQ_ACS_FLAGS   (PCI_ACS_SV | PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_UF)
4160
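/*
 * iommu_ops ->add_device: walk from the device through DMA-source quirks,
 * non-ACS-isolated multifunction peers and upstream bridges until ACS
 * isolation holds, then place the device in the iommu_group of the device
 * reached there (allocating a new group if none exists yet).
 */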
4161 static int intel_iommu_add_device(struct device *dev)
4162 {
4163         struct pci_dev *pdev = to_pci_dev(dev);
4164         struct pci_dev *bridge, *dma_pdev = NULL;
4165         struct iommu_group *group;
4166         int ret;
4167
4168         if (!device_to_iommu(pci_domain_nr(pdev->bus),
4169                              pdev->bus->number, pdev->devfn))
4170                 return -ENODEV;
4171
4172         bridge = pci_find_upstream_pcie_bridge(pdev);
4173         if (bridge) {
4174                 if (pci_is_pcie(bridge))
4175                         dma_pdev = pci_get_domain_bus_and_slot(
4176                                                 pci_domain_nr(pdev->bus),
4177                                                 bridge->subordinate->number, 0);
4178                 if (!dma_pdev)
4179                         dma_pdev = pci_dev_get(bridge);
4180         } else
4181                 dma_pdev = pci_dev_get(pdev);
4182
4183         /* Account for quirked devices */
4184         swap_pci_ref(&dma_pdev, pci_get_dma_source(dma_pdev));
4185
4186         /*
4187          * If it's a multifunction device that does not support our
4188          * required ACS flags, add to the same group as lowest numbered
4189          * function that also does not support the required ACS flags.
4190          */
4191         if (dma_pdev->multifunction &&
4192             !pci_acs_enabled(dma_pdev, REQ_ACS_FLAGS)) {
4193                 u8 i, slot = PCI_SLOT(dma_pdev->devfn);
4194
4195                 for (i = 0; i < 8; i++) {
4196                         struct pci_dev *tmp;
4197
4198                         tmp = pci_get_slot(dma_pdev->bus, PCI_DEVFN(slot, i));
4199                         if (!tmp)
4200                                 continue;
4201
4202                         if (!pci_acs_enabled(tmp, REQ_ACS_FLAGS)) {
4203                                 swap_pci_ref(&dma_pdev, tmp);
4204                                 break;
4205                         }
4206                         pci_dev_put(tmp);
4207                 }
4208         }
4209
4210         /*
4211          * Devices on the root bus go through the iommu.  If that's not us,
4212          * find the next upstream device and test ACS up to the root bus.
4213          * Finding the next device may require skipping virtual buses.
4214          */
4215         while (!pci_is_root_bus(dma_pdev->bus)) {
4216                 struct pci_bus *bus = dma_pdev->bus;
4217
4218                 while (!bus->self) {
4219                         if (!pci_is_root_bus(bus))
4220                                 bus = bus->parent;
4221                         else
4222                                 goto root_bus;
4223                 }
4224
4225                 if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
4226                         break;
4227
4228                 swap_pci_ref(&dma_pdev, pci_dev_get(bus->self));
4229         }
4230
4231 root_bus:
4232         group = iommu_group_get(&dma_pdev->dev);
4233         pci_dev_put(dma_pdev);
4234         if (!group) {
4235                 group = iommu_group_alloc();
4236                 if (IS_ERR(group))
4237                         return PTR_ERR(group);
4238         }
4239
4240         ret = iommu_group_add_device(group, dev);
4241
4242         iommu_group_put(group);
4243         return ret;
4244 }
4245
4246 static void intel_iommu_remove_device(struct device *dev)
4247 {
4248         iommu_group_remove_device(dev);
4249 }
4250
4251 static struct iommu_ops intel_iommu_ops = {
4252         .domain_init    = intel_iommu_domain_init,
4253         .domain_destroy = intel_iommu_domain_destroy,
4254         .attach_dev     = intel_iommu_attach_device,
4255         .detach_dev     = intel_iommu_detach_device,
4256         .map            = intel_iommu_map,
4257         .unmap          = intel_iommu_unmap,
4258         .iova_to_phys   = intel_iommu_iova_to_phys,
4259         .domain_has_cap = intel_iommu_domain_has_cap,
4260         .add_device     = intel_iommu_add_device,
4261         .remove_device  = intel_iommu_remove_device,
4262         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4263 };
4264
4265 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4266 {
4267         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4268         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4269         dmar_map_gfx = 0;
4270 }
4271
4272 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4273 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4274 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4275 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4276 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4277 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4278 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4279
4280 static void quirk_iommu_rwbf(struct pci_dev *dev)
4281 {
4282         /*
4283          * Mobile 4 Series Chipset neglects to set RWBF capability,
4284          * but needs it. Same seems to hold for the desktop versions.
4285          */
4286         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4287         rwbf_quirk = 1;
4288 }
4289
4290 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4291 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4292 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4293 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4294 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4295 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4296 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4297
4298 #define GGC 0x52
4299 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4300 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4301 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4302 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4303 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4304 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4305 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4306 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4307
4308 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4309 {
4310         unsigned short ggc;
4311
4312         if (pci_read_config_word(dev, GGC, &ggc))
4313                 return;
4314
4315         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4316                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4317                 dmar_map_gfx = 0;
4318         } else if (dmar_map_gfx) {
4319                 /* we have to ensure the gfx device is idle before we flush */
4320                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4321                 intel_iommu_strict = 1;
4322        }
4323 }
4324 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4325 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4326 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4327 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4328
4329 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4330    ISOCH DMAR unit for the Azalia sound device, but not give it any
4331    TLB entries, which causes it to deadlock. Check for that.  We do
4332    this in a function called from init_dmars(), instead of in a PCI
4333    quirk, because we don't want to print the obnoxious "BIOS broken"
4334    message if VT-d is actually disabled.
4335 */
4336 static void __init check_tylersburg_isoch(void)
4337 {
4338         struct pci_dev *pdev;
4339         uint32_t vtisochctrl;
4340
4341         /* If there's no Azalia in the system anyway, forget it. */
4342         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4343         if (!pdev)
4344                 return;
4345         pci_dev_put(pdev);
4346
4347         /* System Management Registers. Might be hidden, in which case
4348            we can't do the sanity check. But that's OK, because the
4349            known-broken BIOSes _don't_ actually hide it, so far. */
4350         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4351         if (!pdev)
4352                 return;
4353
4354         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4355                 pci_dev_put(pdev);
4356                 return;
4357         }
4358
4359         pci_dev_put(pdev);
4360
4361         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4362         if (vtisochctrl & 1)
4363                 return;
4364
4365         /* Drop all bits other than the number of TLB entries */
4366         vtisochctrl &= 0x1c;
4367
4368         /* If we have the recommended number of TLB entries (16), fine. */
4369         if (vtisochctrl == 0x10)
4370                 return;
4371
4372         /* Zero TLB entries? You get to ride the short bus to school. */
4373         if (!vtisochctrl) {
4374                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4375                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4376                      dmi_get_system_info(DMI_BIOS_VENDOR),
4377                      dmi_get_system_info(DMI_BIOS_VERSION),
4378                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4379                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4380                 return;
4381         }
4382
4383         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4384                vtisochctrl);
4385 }