drivers/iommu/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
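
/*
 * Worked example: with the default 48-bit guest address width,
 * __DOMAIN_MAX_PFN(48) = 2^36 - 1 and DOMAIN_MAX_ADDR(48) is that PFN
 * shifted left by 12, i.e. the base of the highest 4KiB page just below
 * 256TiB.  On 32-bit kernels DOMAIN_MAX_PFN() additionally clamps the
 * result to ULONG_MAX so it still fits in an unsigned long.
 */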
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 /*
82  * This bitmap is used to advertise the page sizes our hardware supports
83  * to the IOMMU core, which will then use this information to split
84  * physically contiguous memory regions it is mapping into page sizes
85  * that we support.
86  *
87  * Traditionally the IOMMU core just handed us the mappings directly,
88  * after making sure the size was a power-of-two multiple of 4KiB and that
89  * the mapping had natural alignment.
90  *
91  * To retain this behavior, we currently advertise that we support
92  * all page sizes that are a power-of-two multiple of 4KiB.
93  *
94  * If at some point we'd like to utilize the IOMMU core's new behavior,
95  * we could change this to advertise the real page sizes we support.
96  */
97 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
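
/*
 * In the core's pgsize bitmap, a set bit N means a page size of 2^N bytes
 * is supported.  ~0xFFFUL clears bits 0-11 and sets every higher bit, so
 * this advertises 4KiB, 8KiB, 16KiB, ... - every power-of-two size from
 * 2^12 upwards - matching the comment above.
 */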
98
99 static inline int agaw_to_level(int agaw)
100 {
101         return agaw + 2;
102 }
103
104 static inline int agaw_to_width(int agaw)
105 {
106         return 30 + agaw * LEVEL_STRIDE;
107 }
108
109 static inline int width_to_agaw(int width)
110 {
111         return (width - 30) / LEVEL_STRIDE;
112 }
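
/*
 * For example, with DEFAULT_DOMAIN_ADDRESS_WIDTH (48 bits):
 *	width_to_agaw(48) = (48 - 30) / 9 = 2
 *	agaw_to_level(2)  = 4		(a four-level page table)
 *	agaw_to_width(2)  = 30 + 2 * 9 = 48
 */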
113
114 static inline unsigned int level_to_offset_bits(int level)
115 {
116         return (level - 1) * LEVEL_STRIDE;
117 }
118
119 static inline int pfn_level_offset(unsigned long pfn, int level)
120 {
121         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
122 }
123
124 static inline unsigned long level_mask(int level)
125 {
126         return -1UL << level_to_offset_bits(level);
127 }
128
129 static inline unsigned long level_size(int level)
130 {
131         return 1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long align_to_level(unsigned long pfn, int level)
135 {
136         return (pfn + level_size(level) - 1) & level_mask(level);
137 }
138
139 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
140 {
141         return  1 << ((lvl - 1) * LEVEL_STRIDE);
142 }
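
/*
 * Each level consumes LEVEL_STRIDE (9) bits of the DMA PFN.  For example,
 * DMA pfn 0x12345 (IOVA 0x12345000) indexes the page tables as:
 *	pfn_level_offset(0x12345, 4) = 0x000
 *	pfn_level_offset(0x12345, 3) = 0x000
 *	pfn_level_offset(0x12345, 2) = 0x091
 *	pfn_level_offset(0x12345, 1) = 0x145
 * and level_size(2) = 512 pfns (2MiB), level_size(3) = 2^18 pfns (1GiB).
 */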
143
144 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
145    are never going to work. */
146 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
147 {
148         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
149 }
150
151 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
152 {
153         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155 static inline unsigned long page_to_dma_pfn(struct page *pg)
156 {
157         return mm_to_dma_pfn(page_to_pfn(pg));
158 }
159 static inline unsigned long virt_to_dma_pfn(void *p)
160 {
161         return page_to_dma_pfn(virt_to_page(p));
162 }
163
164 /* global iommu list, set NULL for ignored DMAR units */
165 static struct intel_iommu **g_iommus;
166
167 static void __init check_tylersburg_isoch(void);
168 static int rwbf_quirk;
169
170 /*
171  * set to 1 to panic the kernel if VT-d can't be successfully enabled
172  * (used when kernel is launched w/ TXT)
173  */
174 static int force_on = 0;
175
176 /*
177  * 0: Present
178  * 1-11: Reserved
179  * 12-63: Context Ptr (12 - (haw-1))
180  * 64-127: Reserved
181  */
182 struct root_entry {
183         u64     val;
184         u64     rsvd1;
185 };
186 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
187 static inline bool root_present(struct root_entry *root)
188 {
189         return (root->val & 1);
190 }
191 static inline void set_root_present(struct root_entry *root)
192 {
193         root->val |= 1;
194 }
195 static inline void set_root_value(struct root_entry *root, unsigned long value)
196 {
197         root->val |= value & VTD_PAGE_MASK;
198 }
199
200 static inline struct context_entry *
201 get_context_addr_from_root(struct root_entry *root)
202 {
203         return (struct context_entry *)
204                 (root_present(root)?phys_to_virt(
205                 root->val & VTD_PAGE_MASK) :
206                 NULL);
207 }
208
209 /*
210  * low 64 bits:
211  * 0: present
212  * 1: fault processing disable
213  * 2-3: translation type
214  * 12-63: address space root
215  * high 64 bits:
216  * 0-2: address width
217  * 3-6: avail
218  * 8-23: domain id
219  */
220 struct context_entry {
221         u64 lo;
222         u64 hi;
223 };
224
225 static inline bool context_present(struct context_entry *context)
226 {
227         return (context->lo & 1);
228 }
229 static inline void context_set_present(struct context_entry *context)
230 {
231         context->lo |= 1;
232 }
233
234 static inline void context_set_fault_enable(struct context_entry *context)
235 {
236         context->lo &= (((u64)-1) << 2) | 1;
237 }
238
239 static inline void context_set_translation_type(struct context_entry *context,
240                                                 unsigned long value)
241 {
242         context->lo &= (((u64)-1) << 4) | 3;
243         context->lo |= (value & 3) << 2;
244 }
245
246 static inline void context_set_address_root(struct context_entry *context,
247                                             unsigned long value)
248 {
249         context->lo |= value & VTD_PAGE_MASK;
250 }
251
252 static inline void context_set_address_width(struct context_entry *context,
253                                              unsigned long value)
254 {
255         context->hi |= value & 7;
256 }
257
258 static inline void context_set_domain_id(struct context_entry *context,
259                                          unsigned long value)
260 {
261         context->hi |= (value & ((1 << 16) - 1)) << 8;
262 }
263
264 static inline void context_clear_entry(struct context_entry *context)
265 {
266         context->lo = 0;
267         context->hi = 0;
268 }
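
/*
 * Taken together, the helpers above build a context entry much as
 * domain_context_mapping_one() does further down.  A rough sketch for a
 * device attached to example domain 42 through a 3-level (39-bit, agaw 1)
 * page table rooted at pgd:
 *
 *	context_clear_entry(context);
 *	context_set_domain_id(context, 42);
 *	context_set_address_root(context, virt_to_phys(pgd));
 *	context_set_address_width(context, 1);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 */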
269
270 /*
271  * 0: readable
272  * 1: writable
273  * 2-6: reserved
274  * 7: super page
275  * 8-10: available
276  * 11: snoop behavior
277  * 12-63: Host physical address
278  */
279 struct dma_pte {
280         u64 val;
281 };
282
283 static inline void dma_clear_pte(struct dma_pte *pte)
284 {
285         pte->val = 0;
286 }
287
288 static inline void dma_set_pte_readable(struct dma_pte *pte)
289 {
290         pte->val |= DMA_PTE_READ;
291 }
292
293 static inline void dma_set_pte_writable(struct dma_pte *pte)
294 {
295         pte->val |= DMA_PTE_WRITE;
296 }
297
298 static inline void dma_set_pte_snp(struct dma_pte *pte)
299 {
300         pte->val |= DMA_PTE_SNP;
301 }
302
303 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
304 {
305         pte->val = (pte->val & ~3) | (prot & 3);
306 }
307
308 static inline u64 dma_pte_addr(struct dma_pte *pte)
309 {
310 #ifdef CONFIG_64BIT
311         return pte->val & VTD_PAGE_MASK;
312 #else
313         /* Must have a full atomic 64-bit read */
314         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
315 #endif
316 }
317
318 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
319 {
320         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
321 }
322
323 static inline bool dma_pte_present(struct dma_pte *pte)
324 {
325         return (pte->val & 3) != 0;
326 }
327
328 static inline bool dma_pte_superpage(struct dma_pte *pte)
329 {
330         return (pte->val & (1 << 7));
331 }
332
333 static inline int first_pte_in_page(struct dma_pte *pte)
334 {
335         return !((unsigned long)pte & ~VTD_PAGE_MASK);
336 }
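
/*
 * For illustration, a read/write leaf PTE for host pfn 0x12345 would be
 * built with the helpers above roughly as:
 *
 *	dma_clear_pte(pte);
 *	dma_set_pte_readable(pte);
 *	dma_set_pte_writable(pte);
 *	dma_set_pte_pfn(pte, 0x12345);
 *
 * leaving pte->val == 0x12345003 (bits 0-1 = read/write, bits 12-63 = the
 * host page frame).  pfn_to_dma_pte() below instead writes intermediate
 * entries as a single pteval via cmpxchg64().
 */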
337
338 /*
339  * This domain is a static identity mapping domain.
340  *      1. This domain creates a static 1:1 mapping to all usable memory.
341  *      2. It maps to each iommu if successful.
342  *      3. Each iommu maps to this domain if successful.
343  */
344 static struct dmar_domain *si_domain;
345 static int hw_pass_through = 1;
346
347 /* devices under the same p2p bridge are owned in one domain */
348 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
349
350 /* domain represents a virtual machine; more than one device
351  * across iommus may be owned by one domain, e.g. a kvm guest.
352  */
353 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
354
355 /* si_domain contains multiple devices */
356 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
357
358 /* define the limit of IOMMUs supported in each domain */
359 #ifdef  CONFIG_X86
360 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
361 #else
362 # define        IOMMU_UNITS_SUPPORTED   64
363 #endif
364
365 struct dmar_domain {
366         int     id;                     /* domain id */
367         int     nid;                    /* node id */
368         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
369                                         /* bitmap of iommus this domain uses*/
370
371         struct list_head devices;       /* all devices' list */
372         struct iova_domain iovad;       /* iova's that belong to this domain */
373
374         struct dma_pte  *pgd;           /* virtual address */
375         int             gaw;            /* max guest address width */
376
377         /* adjusted guest address width, 0 is level 2 30-bit */
378         int             agaw;
379
380         int             flags;          /* flags to find out type of domain */
381
382         int             iommu_coherency;/* indicate coherency of iommu access */
383         int             iommu_snooping; /* indicate snooping control feature*/
384         int             iommu_count;    /* reference count of iommu */
385         int             iommu_superpage;/* Level of superpages supported:
386                                            0 == 4KiB (no superpages), 1 == 2MiB,
387                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
388         spinlock_t      iommu_lock;     /* protect iommu set in domain */
389         u64             max_addr;       /* maximum mapped address */
390 };
391
392 /* PCI domain-device relationship */
393 struct device_domain_info {
394         struct list_head link;  /* link to domain siblings */
395         struct list_head global; /* link to global list */
396         int segment;            /* PCI domain */
397         u8 bus;                 /* PCI bus number */
398         u8 devfn;               /* PCI devfn number */
399         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
400         struct intel_iommu *iommu; /* IOMMU used by this device */
401         struct dmar_domain *domain; /* pointer to domain */
402 };
403
404 static void flush_unmaps_timeout(unsigned long data);
405
406 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
407
408 #define HIGH_WATER_MARK 250
409 struct deferred_flush_tables {
410         int next;
411         struct iova *iova[HIGH_WATER_MARK];
412         struct dmar_domain *domain[HIGH_WATER_MARK];
413 };
414
415 static struct deferred_flush_tables *deferred_flush;
416
417 /* number of iommus; sizes g_iommus[] and the per-domain iommu bitmaps */
418 static int g_num_of_iommus;
419
420 static DEFINE_SPINLOCK(async_umap_flush_lock);
421 static LIST_HEAD(unmaps_to_do);
422
423 static int timer_on;
424 static long list_size;
425
426 static void domain_remove_dev_info(struct dmar_domain *domain);
427
428 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
429 int dmar_disabled = 0;
430 #else
431 int dmar_disabled = 1;
432 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
433
434 int intel_iommu_enabled = 0;
435 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
436
437 static int dmar_map_gfx = 1;
438 static int dmar_forcedac;
439 static int intel_iommu_strict;
440 static int intel_iommu_superpage = 1;
441
442 int intel_iommu_gfx_mapped;
443 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
444
445 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
446 static DEFINE_SPINLOCK(device_domain_lock);
447 static LIST_HEAD(device_domain_list);
448
449 static struct iommu_ops intel_iommu_ops;
450
451 static int __init intel_iommu_setup(char *str)
452 {
453         if (!str)
454                 return -EINVAL;
455         while (*str) {
456                 if (!strncmp(str, "on", 2)) {
457                         dmar_disabled = 0;
458                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
459                 } else if (!strncmp(str, "off", 3)) {
460                         dmar_disabled = 1;
461                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
462                 } else if (!strncmp(str, "igfx_off", 8)) {
463                         dmar_map_gfx = 0;
464                         printk(KERN_INFO
465                                 "Intel-IOMMU: disable GFX device mapping\n");
466                 } else if (!strncmp(str, "forcedac", 8)) {
467                         printk(KERN_INFO
468                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
469                         dmar_forcedac = 1;
470                 } else if (!strncmp(str, "strict", 6)) {
471                         printk(KERN_INFO
472                                 "Intel-IOMMU: disable batched IOTLB flush\n");
473                         intel_iommu_strict = 1;
474                 } else if (!strncmp(str, "sp_off", 6)) {
475                         printk(KERN_INFO
476                                 "Intel-IOMMU: disable supported super page\n");
477                         intel_iommu_superpage = 0;
478                 }
479
480                 str += strcspn(str, ",");
481                 while (*str == ',')
482                         str++;
483         }
484         return 0;
485 }
486 __setup("intel_iommu=", intel_iommu_setup);
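
/*
 * The parser above walks a comma-separated option list, so for example
 * booting with:
 *
 *	intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage support in one go.
 */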
487
488 static struct kmem_cache *iommu_domain_cache;
489 static struct kmem_cache *iommu_devinfo_cache;
490 static struct kmem_cache *iommu_iova_cache;
491
492 static inline void *alloc_pgtable_page(int node)
493 {
494         struct page *page;
495         void *vaddr = NULL;
496
497         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
498         if (page)
499                 vaddr = page_address(page);
500         return vaddr;
501 }
502
503 static inline void free_pgtable_page(void *vaddr)
504 {
505         free_page((unsigned long)vaddr);
506 }
507
508 static inline void *alloc_domain_mem(void)
509 {
510         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
511 }
512
513 static void free_domain_mem(void *vaddr)
514 {
515         kmem_cache_free(iommu_domain_cache, vaddr);
516 }
517
518 static inline void * alloc_devinfo_mem(void)
519 {
520         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
521 }
522
523 static inline void free_devinfo_mem(void *vaddr)
524 {
525         kmem_cache_free(iommu_devinfo_cache, vaddr);
526 }
527
528 struct iova *alloc_iova_mem(void)
529 {
530         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
531 }
532
533 void free_iova_mem(struct iova *iova)
534 {
535         kmem_cache_free(iommu_iova_cache, iova);
536 }
537
538
539 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
540 {
541         unsigned long sagaw;
542         int agaw = -1;
543
544         sagaw = cap_sagaw(iommu->cap);
545         for (agaw = width_to_agaw(max_gaw);
546              agaw >= 0; agaw--) {
547                 if (test_bit(agaw, &sagaw))
548                         break;
549         }
550
551         return agaw;
552 }
553
554 /*
555  * Calculate max SAGAW for each iommu.
556  */
557 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
558 {
559         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
560 }
561
562 /*
563  * Calculate agaw for each iommu.
564  * "SAGAW" may be different across iommus; use a default agaw, and
565  * fall back to a smaller supported agaw for iommus that don't support it.
566  */
567 int iommu_calculate_agaw(struct intel_iommu *iommu)
568 {
569         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
570 }
571
572 /* This function only returns a single iommu in a domain */
573 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
574 {
575         int iommu_id;
576
577         /* si_domain and vm domain should not get here. */
578         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
579         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
580
581         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
582         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
583                 return NULL;
584
585         return g_iommus[iommu_id];
586 }
587
588 static void domain_update_iommu_coherency(struct dmar_domain *domain)
589 {
590         int i;
591
592         domain->iommu_coherency = 1;
593
594         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
595                 if (!ecap_coherent(g_iommus[i]->ecap)) {
596                         domain->iommu_coherency = 0;
597                         break;
598                 }
599         }
600 }
601
602 static void domain_update_iommu_snooping(struct dmar_domain *domain)
603 {
604         int i;
605
606         domain->iommu_snooping = 1;
607
608         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
609                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
610                         domain->iommu_snooping = 0;
611                         break;
612                 }
613         }
614 }
615
616 static void domain_update_iommu_superpage(struct dmar_domain *domain)
617 {
618         struct dmar_drhd_unit *drhd;
619         struct intel_iommu *iommu = NULL;
620         int mask = 0xf;
621
622         if (!intel_iommu_superpage) {
623                 domain->iommu_superpage = 0;
624                 return;
625         }
626
627         /* set iommu_superpage to the smallest common denominator */
628         for_each_active_iommu(iommu, drhd) {
629                 mask &= cap_super_page_val(iommu->cap);
630                 if (!mask) {
631                         break;
632                 }
633         }
634         domain->iommu_superpage = fls(mask);
635 }
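
/*
 * Example: if one IOMMU reports 2MiB and 1GiB pages (cap value 0x3) and
 * another reports only 2MiB (0x1), mask ends up 0x1 and fls(0x1) = 1, so
 * the domain is limited to 2MiB superpages.  If any active IOMMU supports
 * no superpages at all, mask becomes 0 and superpages are disabled.
 */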
636
637 /* Some capabilities may be different across iommus */
638 static void domain_update_iommu_cap(struct dmar_domain *domain)
639 {
640         domain_update_iommu_coherency(domain);
641         domain_update_iommu_snooping(domain);
642         domain_update_iommu_superpage(domain);
643 }
644
645 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
646 {
647         struct dmar_drhd_unit *drhd = NULL;
648         int i;
649
650         for_each_drhd_unit(drhd) {
651                 if (drhd->ignored)
652                         continue;
653                 if (segment != drhd->segment)
654                         continue;
655
656                 for (i = 0; i < drhd->devices_cnt; i++) {
657                         if (drhd->devices[i] &&
658                             drhd->devices[i]->bus->number == bus &&
659                             drhd->devices[i]->devfn == devfn)
660                                 return drhd->iommu;
661                         if (drhd->devices[i] &&
662                             drhd->devices[i]->subordinate &&
663                             drhd->devices[i]->subordinate->number <= bus &&
664                             drhd->devices[i]->subordinate->subordinate >= bus)
665                                 return drhd->iommu;
666                 }
667
668                 if (drhd->include_all)
669                         return drhd->iommu;
670         }
671
672         return NULL;
673 }
674
675 static void domain_flush_cache(struct dmar_domain *domain,
676                                void *addr, int size)
677 {
678         if (!domain->iommu_coherency)
679                 clflush_cache_range(addr, size);
680 }
681
682 /* Gets context entry for a given bus and devfn */
683 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
684                 u8 bus, u8 devfn)
685 {
686         struct root_entry *root;
687         struct context_entry *context;
688         unsigned long phy_addr;
689         unsigned long flags;
690
691         spin_lock_irqsave(&iommu->lock, flags);
692         root = &iommu->root_entry[bus];
693         context = get_context_addr_from_root(root);
694         if (!context) {
695                 context = (struct context_entry *)
696                                 alloc_pgtable_page(iommu->node);
697                 if (!context) {
698                         spin_unlock_irqrestore(&iommu->lock, flags);
699                         return NULL;
700                 }
701                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
702                 phy_addr = virt_to_phys((void *)context);
703                 set_root_value(root, phy_addr);
704                 set_root_present(root);
705                 __iommu_flush_cache(iommu, root, sizeof(*root));
706         }
707         spin_unlock_irqrestore(&iommu->lock, flags);
708         return &context[devfn];
709 }
710
711 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
712 {
713         struct root_entry *root;
714         struct context_entry *context;
715         int ret;
716         unsigned long flags;
717
718         spin_lock_irqsave(&iommu->lock, flags);
719         root = &iommu->root_entry[bus];
720         context = get_context_addr_from_root(root);
721         if (!context) {
722                 ret = 0;
723                 goto out;
724         }
725         ret = context_present(&context[devfn]);
726 out:
727         spin_unlock_irqrestore(&iommu->lock, flags);
728         return ret;
729 }
730
731 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
732 {
733         struct root_entry *root;
734         struct context_entry *context;
735         unsigned long flags;
736
737         spin_lock_irqsave(&iommu->lock, flags);
738         root = &iommu->root_entry[bus];
739         context = get_context_addr_from_root(root);
740         if (context) {
741                 context_clear_entry(&context[devfn]);
742                 __iommu_flush_cache(iommu, &context[devfn],
743                         sizeof(*context));
744         }
745         spin_unlock_irqrestore(&iommu->lock, flags);
746 }
747
748 static void free_context_table(struct intel_iommu *iommu)
749 {
750         struct root_entry *root;
751         int i;
752         unsigned long flags;
753         struct context_entry *context;
754
755         spin_lock_irqsave(&iommu->lock, flags);
756         if (!iommu->root_entry) {
757                 goto out;
758         }
759         for (i = 0; i < ROOT_ENTRY_NR; i++) {
760                 root = &iommu->root_entry[i];
761                 context = get_context_addr_from_root(root);
762                 if (context)
763                         free_pgtable_page(context);
764         }
765         free_pgtable_page(iommu->root_entry);
766         iommu->root_entry = NULL;
767 out:
768         spin_unlock_irqrestore(&iommu->lock, flags);
769 }
770
771 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
772                                       unsigned long pfn, int target_level)
773 {
774         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
775         struct dma_pte *parent, *pte = NULL;
776         int level = agaw_to_level(domain->agaw);
777         int offset;
778
779         BUG_ON(!domain->pgd);
780         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
781         parent = domain->pgd;
782
783         while (level > 0) {
784                 void *tmp_page;
785
786                 offset = pfn_level_offset(pfn, level);
787                 pte = &parent[offset];
788                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
789                         break;
790                 if (level == target_level)
791                         break;
792
793                 if (!dma_pte_present(pte)) {
794                         uint64_t pteval;
795
796                         tmp_page = alloc_pgtable_page(domain->nid);
797
798                         if (!tmp_page)
799                                 return NULL;
800
801                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
802                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
803                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
804                                 /* Someone else set it while we were thinking; use theirs. */
805                                 free_pgtable_page(tmp_page);
806                         } else {
807                                 dma_pte_addr(pte);
808                                 domain_flush_cache(domain, pte, sizeof(*pte));
809                         }
810                 }
811                 parent = phys_to_virt(dma_pte_addr(pte));
812                 level--;
813         }
814
815         return pte;
816 }
817
818
819 /* return address's pte at specific level */
820 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
821                                          unsigned long pfn,
822                                          int level, int *large_page)
823 {
824         struct dma_pte *parent, *pte = NULL;
825         int total = agaw_to_level(domain->agaw);
826         int offset;
827
828         parent = domain->pgd;
829         while (level <= total) {
830                 offset = pfn_level_offset(pfn, total);
831                 pte = &parent[offset];
832                 if (level == total)
833                         return pte;
834
835                 if (!dma_pte_present(pte)) {
836                         *large_page = total;
837                         break;
838                 }
839
840                 if (pte->val & DMA_PTE_LARGE_PAGE) {
841                         *large_page = total;
842                         return pte;
843                 }
844
845                 parent = phys_to_virt(dma_pte_addr(pte));
846                 total--;
847         }
848         return NULL;
849 }
850
851 /* clear last level pte; a tlb flush should follow */
852 static int dma_pte_clear_range(struct dmar_domain *domain,
853                                 unsigned long start_pfn,
854                                 unsigned long last_pfn)
855 {
856         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
857         unsigned int large_page = 1;
858         struct dma_pte *first_pte, *pte;
859         int order;
860
861         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
862         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
863         BUG_ON(start_pfn > last_pfn);
864
865         /* we don't need lock here; nobody else touches the iova range */
866         do {
867                 large_page = 1;
868                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
869                 if (!pte) {
870                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
871                         continue;
872                 }
873                 do {
874                         dma_clear_pte(pte);
875                         start_pfn += lvl_to_nr_pages(large_page);
876                         pte++;
877                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
878
879                 domain_flush_cache(domain, first_pte,
880                                    (void *)pte - (void *)first_pte);
881
882         } while (start_pfn && start_pfn <= last_pfn);
883
884         order = (large_page - 1) * 9;
885         return order;
886 }
887
888 /* free page table pages. last level pte should already be cleared */
889 static void dma_pte_free_pagetable(struct dmar_domain *domain,
890                                    unsigned long start_pfn,
891                                    unsigned long last_pfn)
892 {
893         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
894         struct dma_pte *first_pte, *pte;
895         int total = agaw_to_level(domain->agaw);
896         int level;
897         unsigned long tmp;
898         int large_page = 2;
899
900         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
901         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
902         BUG_ON(start_pfn > last_pfn);
903
904         /* We don't need lock here; nobody else touches the iova range */
905         level = 2;
906         while (level <= total) {
907                 tmp = align_to_level(start_pfn, level);
908
909                 /* If we can't even clear one PTE at this level, we're done */
910                 if (tmp + level_size(level) - 1 > last_pfn)
911                         return;
912
913                 do {
914                         large_page = level;
915                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
916                         if (large_page > level)
917                                 level = large_page + 1;
918                         if (!pte) {
919                                 tmp = align_to_level(tmp + 1, level + 1);
920                                 continue;
921                         }
922                         do {
923                                 if (dma_pte_present(pte)) {
924                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
925                                         dma_clear_pte(pte);
926                                 }
927                                 pte++;
928                                 tmp += level_size(level);
929                         } while (!first_pte_in_page(pte) &&
930                                  tmp + level_size(level) - 1 <= last_pfn);
931
932                         domain_flush_cache(domain, first_pte,
933                                            (void *)pte - (void *)first_pte);
934                         
935                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
936                 level++;
937         }
938         /* free pgd */
939         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
940                 free_pgtable_page(domain->pgd);
941                 domain->pgd = NULL;
942         }
943 }
944
945 /* iommu handling */
946 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
947 {
948         struct root_entry *root;
949         unsigned long flags;
950
951         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
952         if (!root)
953                 return -ENOMEM;
954
955         __iommu_flush_cache(iommu, root, ROOT_SIZE);
956
957         spin_lock_irqsave(&iommu->lock, flags);
958         iommu->root_entry = root;
959         spin_unlock_irqrestore(&iommu->lock, flags);
960
961         return 0;
962 }
963
964 static void iommu_set_root_entry(struct intel_iommu *iommu)
965 {
966         void *addr;
967         u32 sts;
968         unsigned long flag;
969
970         addr = iommu->root_entry;
971
972         raw_spin_lock_irqsave(&iommu->register_lock, flag);
973         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
974
975         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
976
977         /* Make sure hardware completes it */
978         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
979                       readl, (sts & DMA_GSTS_RTPS), sts);
980
981         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
982 }
983
984 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
985 {
986         u32 val;
987         unsigned long flag;
988
989         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
990                 return;
991
992         raw_spin_lock_irqsave(&iommu->register_lock, flag);
993         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
994
995         /* Make sure hardware completes it */
996         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
997                       readl, (!(val & DMA_GSTS_WBFS)), val);
998
999         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1000 }
1001
1002 /* return value determines if we need a write buffer flush */
1003 static void __iommu_flush_context(struct intel_iommu *iommu,
1004                                   u16 did, u16 source_id, u8 function_mask,
1005                                   u64 type)
1006 {
1007         u64 val = 0;
1008         unsigned long flag;
1009
1010         switch (type) {
1011         case DMA_CCMD_GLOBAL_INVL:
1012                 val = DMA_CCMD_GLOBAL_INVL;
1013                 break;
1014         case DMA_CCMD_DOMAIN_INVL:
1015                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1016                 break;
1017         case DMA_CCMD_DEVICE_INVL:
1018                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1019                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1020                 break;
1021         default:
1022                 BUG();
1023         }
1024         val |= DMA_CCMD_ICC;
1025
1026         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1027         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1028
1029         /* Make sure hardware completes it */
1030         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1031                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1032
1033         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1034 }
1035
1036 /* return value determines if we need a write buffer flush */
1037 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1038                                 u64 addr, unsigned int size_order, u64 type)
1039 {
1040         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1041         u64 val = 0, val_iva = 0;
1042         unsigned long flag;
1043
1044         switch (type) {
1045         case DMA_TLB_GLOBAL_FLUSH:
1046                 /* global flush doesn't need to set IVA_REG */
1047                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1048                 break;
1049         case DMA_TLB_DSI_FLUSH:
1050                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1051                 break;
1052         case DMA_TLB_PSI_FLUSH:
1053                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1054                 /* Note: always flush non-leaf currently */
1055                 val_iva = size_order | addr;
1056                 break;
1057         default:
1058                 BUG();
1059         }
1060         /* Note: set drain read/write */
1061 #if 0
1062         /*
1063          * This is probably just to be extra safe; it looks like we can
1064          * ignore it without any impact.
1065          */
1066         if (cap_read_drain(iommu->cap))
1067                 val |= DMA_TLB_READ_DRAIN;
1068 #endif
1069         if (cap_write_drain(iommu->cap))
1070                 val |= DMA_TLB_WRITE_DRAIN;
1071
1072         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1073         /* Note: Only uses first TLB reg currently */
1074         if (val_iva)
1075                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1076         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1077
1078         /* Make sure hardware completes it */
1079         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1080                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1081
1082         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1083
1084         /* check IOTLB invalidation granularity */
1085         if (DMA_TLB_IAIG(val) == 0)
1086                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
1087         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1088                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1089                         (unsigned long long)DMA_TLB_IIRG(type),
1090                         (unsigned long long)DMA_TLB_IAIG(val));
1091 }
1092
1093 static struct device_domain_info *iommu_support_dev_iotlb(
1094         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1095 {
1096         int found = 0;
1097         unsigned long flags;
1098         struct device_domain_info *info;
1099         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1100
1101         if (!ecap_dev_iotlb_support(iommu->ecap))
1102                 return NULL;
1103
1104         if (!iommu->qi)
1105                 return NULL;
1106
1107         spin_lock_irqsave(&device_domain_lock, flags);
1108         list_for_each_entry(info, &domain->devices, link)
1109                 if (info->bus == bus && info->devfn == devfn) {
1110                         found = 1;
1111                         break;
1112                 }
1113         spin_unlock_irqrestore(&device_domain_lock, flags);
1114
1115         if (!found || !info->dev)
1116                 return NULL;
1117
1118         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1119                 return NULL;
1120
1121         if (!dmar_find_matched_atsr_unit(info->dev))
1122                 return NULL;
1123
1124         info->iommu = iommu;
1125
1126         return info;
1127 }
1128
1129 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1130 {
1131         if (!info)
1132                 return;
1133
1134         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1135 }
1136
1137 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1138 {
1139         if (!info->dev || !pci_ats_enabled(info->dev))
1140                 return;
1141
1142         pci_disable_ats(info->dev);
1143 }
1144
1145 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1146                                   u64 addr, unsigned mask)
1147 {
1148         u16 sid, qdep;
1149         unsigned long flags;
1150         struct device_domain_info *info;
1151
1152         spin_lock_irqsave(&device_domain_lock, flags);
1153         list_for_each_entry(info, &domain->devices, link) {
1154                 if (!info->dev || !pci_ats_enabled(info->dev))
1155                         continue;
1156
1157                 sid = info->bus << 8 | info->devfn;
1158                 qdep = pci_ats_queue_depth(info->dev);
1159                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1160         }
1161         spin_unlock_irqrestore(&device_domain_lock, flags);
1162 }
1163
1164 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1165                                   unsigned long pfn, unsigned int pages, int map)
1166 {
1167         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1168         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1169
1170         BUG_ON(pages == 0);
1171
1172         /*
1173          * Fall back to a domain-selective flush if there is no PSI support
1174          * or the size is too big.
1175          * PSI requires the size to be a power of two, and the base address
1176          * to be naturally aligned to that size.
1177          */
1178         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1179                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1180                                                 DMA_TLB_DSI_FLUSH);
1181         else
1182                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1183                                                 DMA_TLB_PSI_FLUSH);
1184
1185         /*
1186          * In caching mode, changes of pages from non-present to present require
1187          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1188          */
1189         if (!cap_caching_mode(iommu->cap) || !map)
1190                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1191 }
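
/*
 * Example: a request to flush 5 pages is rounded up to mask =
 * ilog2(__roundup_pow_of_two(5)) = 3, i.e. an 8-page (32KiB) invalidation
 * whose base address must be 32KiB-aligned; if that mask exceeded
 * cap_max_amask_val() we would have fallen back to the domain-selective
 * flush above instead.
 */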
1192
1193 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1194 {
1195         u32 pmen;
1196         unsigned long flags;
1197
1198         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1199         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1200         pmen &= ~DMA_PMEN_EPM;
1201         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1202
1203         /* wait for the protected region status bit to clear */
1204         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1205                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1206
1207         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1208 }
1209
1210 static int iommu_enable_translation(struct intel_iommu *iommu)
1211 {
1212         u32 sts;
1213         unsigned long flags;
1214
1215         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1216         iommu->gcmd |= DMA_GCMD_TE;
1217         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1218
1219         /* Make sure hardware completes it */
1220         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1221                       readl, (sts & DMA_GSTS_TES), sts);
1222
1223         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1224         return 0;
1225 }
1226
1227 static int iommu_disable_translation(struct intel_iommu *iommu)
1228 {
1229         u32 sts;
1230         unsigned long flag;
1231
1232         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1233         iommu->gcmd &= ~DMA_GCMD_TE;
1234         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1235
1236         /* Make sure hardware completes it */
1237         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1238                       readl, (!(sts & DMA_GSTS_TES)), sts);
1239
1240         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1241         return 0;
1242 }
1243
1244
1245 static int iommu_init_domains(struct intel_iommu *iommu)
1246 {
1247         unsigned long ndomains;
1248         unsigned long nlongs;
1249
1250         ndomains = cap_ndoms(iommu->cap);
1251         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1252                         ndomains);
1253         nlongs = BITS_TO_LONGS(ndomains);
1254
1255         spin_lock_init(&iommu->lock);
1256
1257         /* TBD: there might be 64K domains,
1258          * consider other allocation for future chip
1259          */
1260         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1261         if (!iommu->domain_ids) {
1262                 printk(KERN_ERR "Allocating domain id array failed\n");
1263                 return -ENOMEM;
1264         }
1265         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1266                         GFP_KERNEL);
1267         if (!iommu->domains) {
1268                 printk(KERN_ERR "Allocating domain array failed\n");
1269                 return -ENOMEM;
1270         }
1271
1272         /*
1273          * If Caching Mode is set, then invalid translations are tagged
1274          * with domain id 0. Hence we need to pre-allocate it.
1275          */
1276         if (cap_caching_mode(iommu->cap))
1277                 set_bit(0, iommu->domain_ids);
1278         return 0;
1279 }
1280
1281
1282 static void domain_exit(struct dmar_domain *domain);
1283 static void vm_domain_exit(struct dmar_domain *domain);
1284
1285 void free_dmar_iommu(struct intel_iommu *iommu)
1286 {
1287         struct dmar_domain *domain;
1288         int i;
1289         unsigned long flags;
1290
1291         if ((iommu->domains) && (iommu->domain_ids)) {
1292                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1293                         domain = iommu->domains[i];
1294                         clear_bit(i, iommu->domain_ids);
1295
1296                         spin_lock_irqsave(&domain->iommu_lock, flags);
1297                         if (--domain->iommu_count == 0) {
1298                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1299                                         vm_domain_exit(domain);
1300                                 else
1301                                         domain_exit(domain);
1302                         }
1303                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1304                 }
1305         }
1306
1307         if (iommu->gcmd & DMA_GCMD_TE)
1308                 iommu_disable_translation(iommu);
1309
1310         if (iommu->irq) {
1311                 irq_set_handler_data(iommu->irq, NULL);
1312                 /* This will mask the irq */
1313                 free_irq(iommu->irq, iommu);
1314                 destroy_irq(iommu->irq);
1315         }
1316
1317         kfree(iommu->domains);
1318         kfree(iommu->domain_ids);
1319
1320         g_iommus[iommu->seq_id] = NULL;
1321
1322         /* if all iommus are freed, free g_iommus */
1323         for (i = 0; i < g_num_of_iommus; i++) {
1324                 if (g_iommus[i])
1325                         break;
1326         }
1327
1328         if (i == g_num_of_iommus)
1329                 kfree(g_iommus);
1330
1331         /* free context mapping */
1332         free_context_table(iommu);
1333 }
1334
1335 static struct dmar_domain *alloc_domain(void)
1336 {
1337         struct dmar_domain *domain;
1338
1339         domain = alloc_domain_mem();
1340         if (!domain)
1341                 return NULL;
1342
1343         domain->nid = -1;
1344         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1345         domain->flags = 0;
1346
1347         return domain;
1348 }
1349
1350 static int iommu_attach_domain(struct dmar_domain *domain,
1351                                struct intel_iommu *iommu)
1352 {
1353         int num;
1354         unsigned long ndomains;
1355         unsigned long flags;
1356
1357         ndomains = cap_ndoms(iommu->cap);
1358
1359         spin_lock_irqsave(&iommu->lock, flags);
1360
1361         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1362         if (num >= ndomains) {
1363                 spin_unlock_irqrestore(&iommu->lock, flags);
1364                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1365                 return -ENOMEM;
1366         }
1367
1368         domain->id = num;
1369         set_bit(num, iommu->domain_ids);
1370         set_bit(iommu->seq_id, domain->iommu_bmp);
1371         iommu->domains[num] = domain;
1372         spin_unlock_irqrestore(&iommu->lock, flags);
1373
1374         return 0;
1375 }
1376
1377 static void iommu_detach_domain(struct dmar_domain *domain,
1378                                 struct intel_iommu *iommu)
1379 {
1380         unsigned long flags;
1381         int num, ndomains;
1382         int found = 0;
1383
1384         spin_lock_irqsave(&iommu->lock, flags);
1385         ndomains = cap_ndoms(iommu->cap);
1386         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1387                 if (iommu->domains[num] == domain) {
1388                         found = 1;
1389                         break;
1390                 }
1391         }
1392
1393         if (found) {
1394                 clear_bit(num, iommu->domain_ids);
1395                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1396                 iommu->domains[num] = NULL;
1397         }
1398         spin_unlock_irqrestore(&iommu->lock, flags);
1399 }
1400
1401 static struct iova_domain reserved_iova_list;
1402 static struct lock_class_key reserved_rbtree_key;
1403
1404 static int dmar_init_reserved_ranges(void)
1405 {
1406         struct pci_dev *pdev = NULL;
1407         struct iova *iova;
1408         int i;
1409
1410         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1411
1412         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1413                 &reserved_rbtree_key);
1414
1415         /* IOAPIC ranges shouldn't be accessed by DMA */
1416         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1417                 IOVA_PFN(IOAPIC_RANGE_END));
1418         if (!iova) {
1419                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1420                 return -ENODEV;
1421         }
1422
1423         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1424         for_each_pci_dev(pdev) {
1425                 struct resource *r;
1426
1427                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1428                         r = &pdev->resource[i];
1429                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1430                                 continue;
1431                         iova = reserve_iova(&reserved_iova_list,
1432                                             IOVA_PFN(r->start),
1433                                             IOVA_PFN(r->end));
1434                         if (!iova) {
1435                                 printk(KERN_ERR "Reserve iova failed\n");
1436                                 return -ENODEV;
1437                         }
1438                 }
1439         }
1440         return 0;
1441 }
1442
1443 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1444 {
1445         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1446 }
1447
1448 static inline int guestwidth_to_adjustwidth(int gaw)
1449 {
1450         int agaw;
1451         int r = (gaw - 12) % 9;
1452
1453         if (r == 0)
1454                 agaw = gaw;
1455         else
1456                 agaw = gaw + 9 - r;
1457         if (agaw > 64)
1458                 agaw = 64;
1459         return agaw;
1460 }
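
/*
 * Example: a 48-bit guest width already sits on a level boundary
 * ((48 - 12) % 9 == 0) and is returned unchanged, whereas a 40-bit guest
 * width is rounded up to the next boundary: r = (40 - 12) % 9 = 1, so
 * agaw = 40 + 9 - 1 = 48.
 */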
1461
1462 static int domain_init(struct dmar_domain *domain, int guest_width)
1463 {
1464         struct intel_iommu *iommu;
1465         int adjust_width, agaw;
1466         unsigned long sagaw;
1467
1468         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1469         spin_lock_init(&domain->iommu_lock);
1470
1471         domain_reserve_special_ranges(domain);
1472
1473         /* calculate AGAW */
1474         iommu = domain_get_iommu(domain);
1475         if (guest_width > cap_mgaw(iommu->cap))
1476                 guest_width = cap_mgaw(iommu->cap);
1477         domain->gaw = guest_width;
1478         adjust_width = guestwidth_to_adjustwidth(guest_width);
1479         agaw = width_to_agaw(adjust_width);
1480         sagaw = cap_sagaw(iommu->cap);
1481         if (!test_bit(agaw, &sagaw)) {
1482                 /* hardware doesn't support it, choose a bigger one */
1483                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1484                 agaw = find_next_bit(&sagaw, 5, agaw);
1485                 if (agaw >= 5)
1486                         return -ENODEV;
1487         }
1488         domain->agaw = agaw;
1489         INIT_LIST_HEAD(&domain->devices);
1490
1491         if (ecap_coherent(iommu->ecap))
1492                 domain->iommu_coherency = 1;
1493         else
1494                 domain->iommu_coherency = 0;
1495
1496         if (ecap_sc_support(iommu->ecap))
1497                 domain->iommu_snooping = 1;
1498         else
1499                 domain->iommu_snooping = 0;
1500
1501         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1502         domain->iommu_count = 1;
1503         domain->nid = iommu->node;
1504
1505         /* always allocate the top pgd */
1506         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1507         if (!domain->pgd)
1508                 return -ENOMEM;
1509         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1510         return 0;
1511 }
1512
1513 static void domain_exit(struct dmar_domain *domain)
1514 {
1515         struct dmar_drhd_unit *drhd;
1516         struct intel_iommu *iommu;
1517
1518         /* Domain 0 is reserved, so don't process it */
1519         if (!domain)
1520                 return;
1521
1522         /* Flush any lazy unmaps that may reference this domain */
1523         if (!intel_iommu_strict)
1524                 flush_unmaps_timeout(0);
1525
1526         domain_remove_dev_info(domain);
1527         /* destroy iovas */
1528         put_iova_domain(&domain->iovad);
1529
1530         /* clear ptes */
1531         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1532
1533         /* free page tables */
1534         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1535
1536         for_each_active_iommu(iommu, drhd)
1537                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1538                         iommu_detach_domain(domain, iommu);
1539
1540         free_domain_mem(domain);
1541 }
1542
1543 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1544                                  u8 bus, u8 devfn, int translation)
1545 {
1546         struct context_entry *context;
1547         unsigned long flags;
1548         struct intel_iommu *iommu;
1549         struct dma_pte *pgd;
1550         unsigned long num;
1551         unsigned long ndomains;
1552         int id;
1553         int agaw;
1554         struct device_domain_info *info = NULL;
1555
1556         pr_debug("Set context mapping for %02x:%02x.%d\n",
1557                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1558
1559         BUG_ON(!domain->pgd);
1560         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1561                translation != CONTEXT_TT_MULTI_LEVEL);
1562
1563         iommu = device_to_iommu(segment, bus, devfn);
1564         if (!iommu)
1565                 return -ENODEV;
1566
1567         context = device_to_context_entry(iommu, bus, devfn);
1568         if (!context)
1569                 return -ENOMEM;
1570         spin_lock_irqsave(&iommu->lock, flags);
1571         if (context_present(context)) {
1572                 spin_unlock_irqrestore(&iommu->lock, flags);
1573                 return 0;
1574         }
1575
1576         id = domain->id;
1577         pgd = domain->pgd;
1578
1579         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1580             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1581                 int found = 0;
1582
1583                 /* find an available domain id for this device in iommu */
1584                 ndomains = cap_ndoms(iommu->cap);
1585                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1586                         if (iommu->domains[num] == domain) {
1587                                 id = num;
1588                                 found = 1;
1589                                 break;
1590                         }
1591                 }
1592
1593                 if (found == 0) {
1594                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1595                         if (num >= ndomains) {
1596                                 spin_unlock_irqrestore(&iommu->lock, flags);
1597                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1598                                 return -EFAULT;
1599                         }
1600
1601                         set_bit(num, iommu->domain_ids);
1602                         iommu->domains[num] = domain;
1603                         id = num;
1604                 }
1605
1606                 /* Skip the top levels of the page tables for
1607                  * an iommu whose agaw is smaller than the default.
1608                  * Unnecessary for PT mode.
1609                  */
1610                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1611                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1612                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1613                                 if (!dma_pte_present(pgd)) {
1614                                         spin_unlock_irqrestore(&iommu->lock, flags);
1615                                         return -ENOMEM;
1616                                 }
1617                         }
1618                 }
1619         }
1620
1621         context_set_domain_id(context, id);
1622
1623         if (translation != CONTEXT_TT_PASS_THROUGH) {
1624                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1625                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1626                                      CONTEXT_TT_MULTI_LEVEL;
1627         }
1628         /*
1629          * In pass through mode, AW must be programmed to indicate the largest
1630          * AGAW value supported by hardware. And ASR is ignored by hardware.
1631          */
1632         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1633                 context_set_address_width(context, iommu->msagaw);
1634         else {
1635                 context_set_address_root(context, virt_to_phys(pgd));
1636                 context_set_address_width(context, iommu->agaw);
1637         }
1638
1639         context_set_translation_type(context, translation);
1640         context_set_fault_enable(context);
1641         context_set_present(context);
1642         domain_flush_cache(domain, context, sizeof(*context));
1643
1644         /*
1645          * It's a non-present to present mapping. If hardware doesn't cache
1646          * non-present entries we only need to flush the write-buffer. If it
1647          * _does_ cache non-present entries, then it does so in the special
1648          * domain #0, which we have to flush:
1649          */
1650         if (cap_caching_mode(iommu->cap)) {
1651                 iommu->flush.flush_context(iommu, 0,
1652                                            (((u16)bus) << 8) | devfn,
1653                                            DMA_CCMD_MASK_NOBIT,
1654                                            DMA_CCMD_DEVICE_INVL);
1655                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1656         } else {
1657                 iommu_flush_write_buffer(iommu);
1658         }
1659         iommu_enable_dev_iotlb(info);
1660         spin_unlock_irqrestore(&iommu->lock, flags);
1661
1662         spin_lock_irqsave(&domain->iommu_lock, flags);
1663         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1664                 domain->iommu_count++;
1665                 if (domain->iommu_count == 1)
1666                         domain->nid = iommu->node;
1667                 domain_update_iommu_cap(domain);
1668         }
1669         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1670         return 0;
1671 }
1672
1673 static int
1674 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1675                         int translation)
1676 {
1677         int ret;
1678         struct pci_dev *tmp, *parent;
1679
1680         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1681                                          pdev->bus->number, pdev->devfn,
1682                                          translation);
1683         if (ret)
1684                 return ret;
1685
1686         /* dependent device mapping */
1687         tmp = pci_find_upstream_pcie_bridge(pdev);
1688         if (!tmp)
1689                 return 0;
1690         /* Secondary interface's bus number and devfn 0 */
1691         parent = pdev->bus->self;
1692         while (parent != tmp) {
1693                 ret = domain_context_mapping_one(domain,
1694                                                  pci_domain_nr(parent->bus),
1695                                                  parent->bus->number,
1696                                                  parent->devfn, translation);
1697                 if (ret)
1698                         return ret;
1699                 parent = parent->bus->self;
1700         }
1701         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1702                 return domain_context_mapping_one(domain,
1703                                         pci_domain_nr(tmp->subordinate),
1704                                         tmp->subordinate->number, 0,
1705                                         translation);
1706         else /* this is a legacy PCI bridge */
1707                 return domain_context_mapping_one(domain,
1708                                                   pci_domain_nr(tmp->bus),
1709                                                   tmp->bus->number,
1710                                                   tmp->devfn,
1711                                                   translation);
1712 }
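/*
 * Example of the walk above: for a conventional PCI device sitting behind a
 * PCIe-to-PCI bridge, context entries are set up for the device itself, for
 * every bridge between it and the upstream PCIe bridge, and finally for
 * (secondary bus, devfn 0) of that PCIe bridge, so that transactions which
 * carry the bridge's source-id are still translated with this domain.
 */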
1713
1714 static int domain_context_mapped(struct pci_dev *pdev)
1715 {
1716         int ret;
1717         struct pci_dev *tmp, *parent;
1718         struct intel_iommu *iommu;
1719
1720         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1721                                 pdev->devfn);
1722         if (!iommu)
1723                 return -ENODEV;
1724
1725         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1726         if (!ret)
1727                 return ret;
1728         /* dependent device mapping */
1729         tmp = pci_find_upstream_pcie_bridge(pdev);
1730         if (!tmp)
1731                 return ret;
1732         /* Secondary interface's bus number and devfn 0 */
1733         parent = pdev->bus->self;
1734         while (parent != tmp) {
1735                 ret = device_context_mapped(iommu, parent->bus->number,
1736                                             parent->devfn);
1737                 if (!ret)
1738                         return ret;
1739                 parent = parent->bus->self;
1740         }
1741         if (pci_is_pcie(tmp))
1742                 return device_context_mapped(iommu, tmp->subordinate->number,
1743                                              0);
1744         else
1745                 return device_context_mapped(iommu, tmp->bus->number,
1746                                              tmp->devfn);
1747 }
1748
1749 /* Returns a number of VTD pages, but aligned to MM page size */
1750 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1751                                             size_t size)
1752 {
1753         host_addr &= ~PAGE_MASK;
1754         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1755 }
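/*
 * e.g. with 4KiB pages, host_addr = 0x1234 and size = 0x2000: only the
 * in-page offset 0x234 is kept, and PAGE_ALIGN(0x234 + 0x2000) >> 12
 * gives 3 VT-d pages for the mapping.
 */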
1756
1757 /* Return largest possible superpage level for a given mapping */
1758 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1759                                           unsigned long iov_pfn,
1760                                           unsigned long phy_pfn,
1761                                           unsigned long pages)
1762 {
1763         int support, level = 1;
1764         unsigned long pfnmerge;
1765
1766         support = domain->iommu_superpage;
1767
1768         /* To use a large page, the virtual *and* physical addresses
1769            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1770            of them will mean we have to use smaller pages. So just
1771            merge them and check both at once. */
1772         pfnmerge = iov_pfn | phy_pfn;
1773
1774         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1775                 pages >>= VTD_STRIDE_SHIFT;
1776                 if (!pages)
1777                         break;
1778                 pfnmerge >>= VTD_STRIDE_SHIFT;
1779                 level++;
1780                 support--;
1781         }
1782         return level;
1783 }
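/*
 * e.g. with iommu_superpage >= 1, iov_pfn = 0x200, phy_pfn = 0x1000 and
 * pages = 1024: the merged pfn has its low 9 bits clear and at least 512
 * pages remain after the shift, so level 2 (2MiB superpages) is returned.
 */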
1784
1785 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1786                             struct scatterlist *sg, unsigned long phys_pfn,
1787                             unsigned long nr_pages, int prot)
1788 {
1789         struct dma_pte *first_pte = NULL, *pte = NULL;
1790         phys_addr_t uninitialized_var(pteval);
1791         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1792         unsigned long sg_res;
1793         unsigned int largepage_lvl = 0;
1794         unsigned long lvl_pages = 0;
1795
1796         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1797
1798         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1799                 return -EINVAL;
1800
1801         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1802
1803         if (sg)
1804                 sg_res = 0;
1805         else {
1806                 sg_res = nr_pages + 1;
1807                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1808         }
1809
1810         while (nr_pages > 0) {
1811                 uint64_t tmp;
1812
1813                 if (!sg_res) {
1814                         sg_res = aligned_nrpages(sg->offset, sg->length);
1815                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1816                         sg->dma_length = sg->length;
1817                         pteval = page_to_phys(sg_page(sg)) | prot;
1818                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1819                 }
1820
1821                 if (!pte) {
1822                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1823
1824                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1825                         if (!pte)
1826                                 return -ENOMEM;
1827                         /* It is a large page */
1828                         if (largepage_lvl > 1)
1829                                 pteval |= DMA_PTE_LARGE_PAGE;
1830                         else
1831                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1832
1833                 }
1834                 /* We don't need a lock here; nobody else
1835                  * touches the iova range
1836                  */
1837                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1838                 if (tmp) {
1839                         static int dumps = 5;
1840                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1841                                iov_pfn, tmp, (unsigned long long)pteval);
1842                         if (dumps) {
1843                                 dumps--;
1844                                 debug_dma_dump_mappings(NULL);
1845                         }
1846                         WARN_ON(1);
1847                 }
1848
1849                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1850
1851                 BUG_ON(nr_pages < lvl_pages);
1852                 BUG_ON(sg_res < lvl_pages);
1853
1854                 nr_pages -= lvl_pages;
1855                 iov_pfn += lvl_pages;
1856                 phys_pfn += lvl_pages;
1857                 pteval += lvl_pages * VTD_PAGE_SIZE;
1858                 sg_res -= lvl_pages;
1859
1860                 /* If the next PTE would be the first in a new page, then we
1861                    need to flush the cache on the entries we've just written.
1862                    And then we'll need to recalculate 'pte', so clear it and
1863                    let it get set again in the if (!pte) block above.
1864
1865                    If we're done (!nr_pages) we need to flush the cache too.
1866
1867                    Also if we've been setting superpages, we may need to
1868                    recalculate 'pte' and switch back to smaller pages for the
1869                    end of the mapping, if the trailing size is not enough to
1870                    use another superpage (i.e. sg_res < lvl_pages). */
1871                 pte++;
1872                 if (!nr_pages || first_pte_in_page(pte) ||
1873                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1874                         domain_flush_cache(domain, first_pte,
1875                                            (void *)pte - (void *)first_pte);
1876                         pte = NULL;
1877                 }
1878
1879                 if (!sg_res && nr_pages)
1880                         sg = sg_next(sg);
1881         }
1882         return 0;
1883 }
1884
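/*
 * The two wrappers below select either the scatterlist path (phys_pfn is
 * unused) or the physically contiguous path (sg == NULL) of __domain_mapping().
 */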
1885 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1886                                     struct scatterlist *sg, unsigned long nr_pages,
1887                                     int prot)
1888 {
1889         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1890 }
1891
1892 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1893                                      unsigned long phys_pfn, unsigned long nr_pages,
1894                                      int prot)
1895 {
1896         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1897 }
1898
1899 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1900 {
1901         if (!iommu)
1902                 return;
1903
1904         clear_context_table(iommu, bus, devfn);
1905         iommu->flush.flush_context(iommu, 0, 0, 0,
1906                                            DMA_CCMD_GLOBAL_INVL);
1907         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1908 }
1909
1910 static void domain_remove_dev_info(struct dmar_domain *domain)
1911 {
1912         struct device_domain_info *info;
1913         unsigned long flags;
1914         struct intel_iommu *iommu;
1915
1916         spin_lock_irqsave(&device_domain_lock, flags);
1917         while (!list_empty(&domain->devices)) {
1918                 info = list_entry(domain->devices.next,
1919                         struct device_domain_info, link);
1920                 list_del(&info->link);
1921                 list_del(&info->global);
1922                 if (info->dev)
1923                         info->dev->dev.archdata.iommu = NULL;
1924                 spin_unlock_irqrestore(&device_domain_lock, flags);
1925
1926                 iommu_disable_dev_iotlb(info);
1927                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1928                 iommu_detach_dev(iommu, info->bus, info->devfn);
1929                 free_devinfo_mem(info);
1930
1931                 spin_lock_irqsave(&device_domain_lock, flags);
1932         }
1933         spin_unlock_irqrestore(&device_domain_lock, flags);
1934 }
1935
1936 /*
1937  * find_domain
1938  * Note: the info is stored in struct pci_dev->dev.archdata.iommu
1939  */
1940 static struct dmar_domain *
1941 find_domain(struct pci_dev *pdev)
1942 {
1943         struct device_domain_info *info;
1944
1945         /* No lock here, assumes no domain exit in normal case */
1946         info = pdev->dev.archdata.iommu;
1947         if (info)
1948                 return info->domain;
1949         return NULL;
1950 }
1951
1952 /* domain is initialized */
1953 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1954 {
1955         struct dmar_domain *domain, *found = NULL;
1956         struct intel_iommu *iommu;
1957         struct dmar_drhd_unit *drhd;
1958         struct device_domain_info *info, *tmp;
1959         struct pci_dev *dev_tmp;
1960         unsigned long flags;
1961         int bus = 0, devfn = 0;
1962         int segment;
1963         int ret;
1964
1965         domain = find_domain(pdev);
1966         if (domain)
1967                 return domain;
1968
1969         segment = pci_domain_nr(pdev->bus);
1970
1971         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1972         if (dev_tmp) {
1973                 if (pci_is_pcie(dev_tmp)) {
1974                         bus = dev_tmp->subordinate->number;
1975                         devfn = 0;
1976                 } else {
1977                         bus = dev_tmp->bus->number;
1978                         devfn = dev_tmp->devfn;
1979                 }
1980                 spin_lock_irqsave(&device_domain_lock, flags);
1981                 list_for_each_entry(info, &device_domain_list, global) {
1982                         if (info->segment == segment &&
1983                             info->bus == bus && info->devfn == devfn) {
1984                                 found = info->domain;
1985                                 break;
1986                         }
1987                 }
1988                 spin_unlock_irqrestore(&device_domain_lock, flags);
1989                 /* pcie-pci bridge already has a domain, use it */
1990                 if (found) {
1991                         domain = found;
1992                         goto found_domain;
1993                 }
1994         }
1995
1996         domain = alloc_domain();
1997         if (!domain)
1998                 goto error;
1999
2000         /* Allocate a new domain for the device */
2001         drhd = dmar_find_matched_drhd_unit(pdev);
2002         if (!drhd) {
2003                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2004                         pci_name(pdev));
2005                 return NULL;
2006         }
2007         iommu = drhd->iommu;
2008
2009         ret = iommu_attach_domain(domain, iommu);
2010         if (ret) {
2011                 free_domain_mem(domain);
2012                 goto error;
2013         }
2014
2015         if (domain_init(domain, gaw)) {
2016                 domain_exit(domain);
2017                 goto error;
2018         }
2019
2020         /* register pcie-to-pci device */
2021         if (dev_tmp) {
2022                 info = alloc_devinfo_mem();
2023                 if (!info) {
2024                         domain_exit(domain);
2025                         goto error;
2026                 }
2027                 info->segment = segment;
2028                 info->bus = bus;
2029                 info->devfn = devfn;
2030                 info->dev = NULL;
2031                 info->domain = domain;
2032                 /* This domain is shared by devices under the p2p bridge */
2033                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2034
2035                 /* pcie-to-pci bridge already has a domain, use it */
2036                 found = NULL;
2037                 spin_lock_irqsave(&device_domain_lock, flags);
2038                 list_for_each_entry(tmp, &device_domain_list, global) {
2039                         if (tmp->segment == segment &&
2040                             tmp->bus == bus && tmp->devfn == devfn) {
2041                                 found = tmp->domain;
2042                                 break;
2043                         }
2044                 }
2045                 if (found) {
2046                         spin_unlock_irqrestore(&device_domain_lock, flags);
2047                         free_devinfo_mem(info);
2048                         domain_exit(domain);
2049                         domain = found;
2050                 } else {
2051                         list_add(&info->link, &domain->devices);
2052                         list_add(&info->global, &device_domain_list);
2053                         spin_unlock_irqrestore(&device_domain_lock, flags);
2054                 }
2055         }
2056
2057 found_domain:
2058         info = alloc_devinfo_mem();
2059         if (!info)
2060                 goto error;
2061         info->segment = segment;
2062         info->bus = pdev->bus->number;
2063         info->devfn = pdev->devfn;
2064         info->dev = pdev;
2065         info->domain = domain;
2066         spin_lock_irqsave(&device_domain_lock, flags);
2067         /* somebody else might have beaten us to it */
2068         found = find_domain(pdev);
2069         if (found != NULL) {
2070                 spin_unlock_irqrestore(&device_domain_lock, flags);
2071                 if (found != domain) {
2072                         domain_exit(domain);
2073                         domain = found;
2074                 }
2075                 free_devinfo_mem(info);
2076                 return domain;
2077         }
2078         list_add(&info->link, &domain->devices);
2079         list_add(&info->global, &device_domain_list);
2080         pdev->dev.archdata.iommu = info;
2081         spin_unlock_irqrestore(&device_domain_lock, flags);
2082         return domain;
2083 error:
2084         /* recheck it here; maybe somebody else set it meanwhile */
2085         return find_domain(pdev);
2086 }
2087
2088 static int iommu_identity_mapping;
2089 #define IDENTMAP_ALL            1
2090 #define IDENTMAP_GFX            2
2091 #define IDENTMAP_AZALIA         4
2092
2093 static int iommu_domain_identity_map(struct dmar_domain *domain,
2094                                      unsigned long long start,
2095                                      unsigned long long end)
2096 {
2097         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2098         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2099
2100         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2101                           dma_to_mm_pfn(last_vpfn))) {
2102                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2103                 return -ENOMEM;
2104         }
2105
2106         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2107                  start, end, domain->id);
2108         /*
2109          * The RMRR range might overlap with a physical memory range;
2110          * clear any existing mappings in it first
2111          */
2112         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2113
2114         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2115                                   last_vpfn - first_vpfn + 1,
2116                                   DMA_PTE_READ|DMA_PTE_WRITE);
2117 }
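/*
 * e.g. an RMRR of 0xdf000000 - 0xdf03ffff becomes vpfn 0xdf000 - 0xdf03f:
 * the iova range is reserved, any stale PTEs are cleared, and the 0x40
 * pages are mapped 1:1 with read and write permission.
 */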
2118
2119 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2120                                       unsigned long long start,
2121                                       unsigned long long end)
2122 {
2123         struct dmar_domain *domain;
2124         int ret;
2125
2126         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2127         if (!domain)
2128                 return -ENOMEM;
2129
2130         /* For _hardware_ passthrough, don't bother. But for software
2131            passthrough, we do it anyway -- it may indicate a memory
2132            range which is reserved in E820 and so didn't get set
2133            up in si_domain to start with */
2134         if (domain == si_domain && hw_pass_through) {
2135                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2136                        pci_name(pdev), start, end);
2137                 return 0;
2138         }
2139
2140         printk(KERN_INFO
2141                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2142                pci_name(pdev), start, end);
2143
2144         if (end < start) {
2145                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2146                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2147                         dmi_get_system_info(DMI_BIOS_VENDOR),
2148                         dmi_get_system_info(DMI_BIOS_VERSION),
2149                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2150                 ret = -EIO;
2151                 goto error;
2152         }
2153
2154         if (end >> agaw_to_width(domain->agaw)) {
2155                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2156                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2157                      agaw_to_width(domain->agaw),
2158                      dmi_get_system_info(DMI_BIOS_VENDOR),
2159                      dmi_get_system_info(DMI_BIOS_VERSION),
2160                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2161                 ret = -EIO;
2162                 goto error;
2163         }
2164
2165         ret = iommu_domain_identity_map(domain, start, end);
2166         if (ret)
2167                 goto error;
2168
2169         /* context entry init */
2170         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2171         if (ret)
2172                 goto error;
2173
2174         return 0;
2175
2176  error:
2177         domain_exit(domain);
2178         return ret;
2179 }
2180
2181 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2182         struct pci_dev *pdev)
2183 {
2184         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2185                 return 0;
2186         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2187                 rmrr->end_address);
2188 }
2189
2190 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2191 static inline void iommu_prepare_isa(void)
2192 {
2193         struct pci_dev *pdev;
2194         int ret;
2195
2196         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2197         if (!pdev)
2198                 return;
2199
2200         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2201         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2202
2203         if (ret)
2204                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2205                        "floppy might not work\n");
2206
2207 }
2208 #else
2209 static inline void iommu_prepare_isa(void)
2210 {
2211         return;
2212 }
2213 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2214
2215 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2216
2217 static int __init si_domain_init(int hw)
2218 {
2219         struct dmar_drhd_unit *drhd;
2220         struct intel_iommu *iommu;
2221         int nid, ret = 0;
2222
2223         si_domain = alloc_domain();
2224         if (!si_domain)
2225                 return -EFAULT;
2226
2227         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2228
2229         for_each_active_iommu(iommu, drhd) {
2230                 ret = iommu_attach_domain(si_domain, iommu);
2231                 if (ret) {
2232                         domain_exit(si_domain);
2233                         return -EFAULT;
2234                 }
2235         }
2236
2237         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2238                 domain_exit(si_domain);
2239                 return -EFAULT;
2240         }
2241
2242         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2243
2244         if (hw)
2245                 return 0;
2246
2247         for_each_online_node(nid) {
2248                 unsigned long start_pfn, end_pfn;
2249                 int i;
2250
2251                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2252                         ret = iommu_domain_identity_map(si_domain,
2253                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2254                         if (ret)
2255                                 return ret;
2256                 }
2257         }
2258
2259         return 0;
2260 }
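/*
 * For software passthrough (hw == 0) the loop above identity-maps every
 * online node's memory into si_domain; for hardware passthrough the
 * second-level page tables are never consulted, so the 1:1 mappings are
 * not needed and are skipped.
 */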
2261
2262 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2263                                           struct pci_dev *pdev);
2264 static int identity_mapping(struct pci_dev *pdev)
2265 {
2266         struct device_domain_info *info;
2267
2268         if (likely(!iommu_identity_mapping))
2269                 return 0;
2270
2271         info = pdev->dev.archdata.iommu;
2272         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2273                 return (info->domain == si_domain);
2274
2275         return 0;
2276 }
2277
2278 static int domain_add_dev_info(struct dmar_domain *domain,
2279                                struct pci_dev *pdev,
2280                                int translation)
2281 {
2282         struct device_domain_info *info;
2283         unsigned long flags;
2284         int ret;
2285
2286         info = alloc_devinfo_mem();
2287         if (!info)
2288                 return -ENOMEM;
2289
2290         ret = domain_context_mapping(domain, pdev, translation);
2291         if (ret) {
2292                 free_devinfo_mem(info);
2293                 return ret;
2294         }
2295
2296         info->segment = pci_domain_nr(pdev->bus);
2297         info->bus = pdev->bus->number;
2298         info->devfn = pdev->devfn;
2299         info->dev = pdev;
2300         info->domain = domain;
2301
2302         spin_lock_irqsave(&device_domain_lock, flags);
2303         list_add(&info->link, &domain->devices);
2304         list_add(&info->global, &device_domain_list);
2305         pdev->dev.archdata.iommu = info;
2306         spin_unlock_irqrestore(&device_domain_lock, flags);
2307
2308         return 0;
2309 }
2310
2311 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2312 {
2313         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2314                 return 1;
2315
2316         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2317                 return 1;
2318
2319         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2320                 return 0;
2321
2322         /*
2323          * We want to start off with all devices in the 1:1 domain, and
2324          * take them out later if we find they can't access all of memory.
2325          *
2326          * However, we can't do this for PCI devices behind bridges,
2327          * because all PCI devices behind the same bridge will end up
2328          * with the same source-id on their transactions.
2329          *
2330          * Practically speaking, we can't change things around for these
2331          * devices at run-time, because we can't be sure there'll be no
2332          * DMA transactions in flight for any of their siblings.
2333          * 
2334          * So PCI devices (unless they're on the root bus) as well as
2335          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2336          * the 1:1 domain, just in _case_ one of their siblings turns out
2337          * not to be able to map all of memory.
2338          */
2339         if (!pci_is_pcie(pdev)) {
2340                 if (!pci_is_root_bus(pdev->bus))
2341                         return 0;
2342                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2343                         return 0;
2344         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2345                 return 0;
2346
2347         /* 
2348          * At boot time, we don't yet know if devices will be 64-bit capable.
2349          * Assume that they will -- if they turn out not to be, then we can 
2350          * take them out of the 1:1 domain later.
2351          */
2352         if (!startup) {
2353                 /*
2354                  * If the device's dma_mask is less than the system's memory
2355                  * size then this is not a candidate for identity mapping.
2356                  */
2357                 u64 dma_mask = pdev->dma_mask;
2358
2359                 if (pdev->dev.coherent_dma_mask &&
2360                     pdev->dev.coherent_dma_mask < dma_mask)
2361                         dma_mask = pdev->dev.coherent_dma_mask;
2362
2363                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2364         }
2365
2366         return 1;
2367 }
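/*
 * e.g. a device with a 32-bit dma_mask on a machine with RAM above 4GiB
 * fails the dma_get_required_mask() check at run time (startup == 0) and
 * is therefore taken back out of the 1:1 domain by iommu_no_mapping().
 */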
2368
2369 static int __init iommu_prepare_static_identity_mapping(int hw)
2370 {
2371         struct pci_dev *pdev = NULL;
2372         int ret;
2373
2374         ret = si_domain_init(hw);
2375         if (ret)
2376                 return -EFAULT;
2377
2378         for_each_pci_dev(pdev) {
2379                 if (iommu_should_identity_map(pdev, 1)) {
2380                         ret = domain_add_dev_info(si_domain, pdev,
2381                                              hw ? CONTEXT_TT_PASS_THROUGH :
2382                                                   CONTEXT_TT_MULTI_LEVEL);
2383                         if (ret) {
2384                                 /* device not associated with an iommu */
2385                                 if (ret == -ENODEV)
2386                                         continue;
2387                                 return ret;
2388                         }
2389                         pr_info("IOMMU: %s identity mapping for device %s\n",
2390                                 hw ? "hardware" : "software", pci_name(pdev));
2391                 }
2392         }
2393
2394         return 0;
2395 }
2396
2397 static int __init init_dmars(void)
2398 {
2399         struct dmar_drhd_unit *drhd;
2400         struct dmar_rmrr_unit *rmrr;
2401         struct pci_dev *pdev;
2402         struct intel_iommu *iommu;
2403         int i, ret;
2404
2405         /*
2406          * for each drhd
2407          *    allocate root
2408          *    initialize and program root entry to not present
2409          * endfor
2410          */
2411         for_each_drhd_unit(drhd) {
2412                 /*
2413                  * lock not needed as this is only incremented in the
2414                  * single-threaded kernel __init code path; all other
2415                  * accesses are read only
2416                  */
2417                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2418                         g_num_of_iommus++;
2419                         continue;
2420                 }
2421                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2422                           IOMMU_UNITS_SUPPORTED);
2423         }
2424
2425         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2426                         GFP_KERNEL);
2427         if (!g_iommus) {
2428                 printk(KERN_ERR "Allocating global iommu array failed\n");
2429                 ret = -ENOMEM;
2430                 goto error;
2431         }
2432
2433         deferred_flush = kzalloc(g_num_of_iommus *
2434                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2435         if (!deferred_flush) {
2436                 ret = -ENOMEM;
2437                 goto error;
2438         }
2439
2440         for_each_drhd_unit(drhd) {
2441                 if (drhd->ignored)
2442                         continue;
2443
2444                 iommu = drhd->iommu;
2445                 g_iommus[iommu->seq_id] = iommu;
2446
2447                 ret = iommu_init_domains(iommu);
2448                 if (ret)
2449                         goto error;
2450
2451                 /*
2452                  * TBD:
2453                  * we could share the same root & context tables
2454                  * among all IOMMUs. Needs to be split later.
2455                  */
2456                 ret = iommu_alloc_root_entry(iommu);
2457                 if (ret) {
2458                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2459                         goto error;
2460                 }
2461                 if (!ecap_pass_through(iommu->ecap))
2462                         hw_pass_through = 0;
2463         }
2464
2465         /*
2466          * Start from a sane iommu hardware state.
2467          */
2468         for_each_drhd_unit(drhd) {
2469                 if (drhd->ignored)
2470                         continue;
2471
2472                 iommu = drhd->iommu;
2473
2474                 /*
2475                  * If queued invalidation was already initialized by us
2476                  * (for example, while enabling interrupt-remapping) then
2477                  * things are already rolling from a sane state.
2478                  */
2479                 if (iommu->qi)
2480                         continue;
2481
2482                 /*
2483                  * Clear any previous faults.
2484                  */
2485                 dmar_fault(-1, iommu);
2486                 /*
2487                  * Disable queued invalidation if supported and already enabled
2488                  * before OS handover.
2489                  */
2490                 dmar_disable_qi(iommu);
2491         }
2492
2493         for_each_drhd_unit(drhd) {
2494                 if (drhd->ignored)
2495                         continue;
2496
2497                 iommu = drhd->iommu;
2498
2499                 if (dmar_enable_qi(iommu)) {
2500                         /*
2501                          * Queued invalidation is not enabled; use
2502                          * register-based invalidation
2503                          */
2504                         iommu->flush.flush_context = __iommu_flush_context;
2505                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2506                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2507                                "invalidation\n",
2508                                 iommu->seq_id,
2509                                (unsigned long long)drhd->reg_base_addr);
2510                 } else {
2511                         iommu->flush.flush_context = qi_flush_context;
2512                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2513                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2514                                "invalidation\n",
2515                                 iommu->seq_id,
2516                                (unsigned long long)drhd->reg_base_addr);
2517                 }
2518         }
2519
2520         if (iommu_pass_through)
2521                 iommu_identity_mapping |= IDENTMAP_ALL;
2522
2523 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2524         iommu_identity_mapping |= IDENTMAP_GFX;
2525 #endif
2526
2527         check_tylersburg_isoch();
2528
2529         /*
2530          * If pass-through is not set or not enabled, set up context entries
2531          * for identity mappings for rmrr, gfx and isa, and possibly fall back
2532          * to static identity mapping if iommu_identity_mapping is set.
2533          */
2534         if (iommu_identity_mapping) {
2535                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2536                 if (ret) {
2537                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2538                         goto error;
2539                 }
2540         }
2541         /*
2542          * For each rmrr
2543          *   for each dev attached to rmrr
2544          *   do
2545          *     locate drhd for dev, alloc domain for dev
2546          *     allocate free domain
2547          *     allocate page table entries for rmrr
2548          *     if context not allocated for bus
2549          *           allocate and init context
2550          *           set present in root table for this bus
2551          *     init context with domain, translation etc
2552          *    endfor
2553          * endfor
2554          */
2555         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2556         for_each_rmrr_units(rmrr) {
2557                 for (i = 0; i < rmrr->devices_cnt; i++) {
2558                         pdev = rmrr->devices[i];
2559                         /*
2560                          * some BIOSes list non-existent devices in the
2561                          * DMAR table.
2562                          */
2563                         if (!pdev)
2564                                 continue;
2565                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2566                         if (ret)
2567                                 printk(KERN_ERR
2568                                        "IOMMU: mapping reserved region failed\n");
2569                 }
2570         }
2571
2572         iommu_prepare_isa();
2573
2574         /*
2575          * for each drhd
2576          *   enable fault log
2577          *   global invalidate context cache
2578          *   global invalidate iotlb
2579          *   enable translation
2580          */
2581         for_each_drhd_unit(drhd) {
2582                 if (drhd->ignored) {
2583                         /*
2584                          * we always have to disable PMRs or DMA may fail on
2585                          * this device
2586                          */
2587                         if (force_on)
2588                                 iommu_disable_protect_mem_regions(drhd->iommu);
2589                         continue;
2590                 }
2591                 iommu = drhd->iommu;
2592
2593                 iommu_flush_write_buffer(iommu);
2594
2595                 ret = dmar_set_interrupt(iommu);
2596                 if (ret)
2597                         goto error;
2598
2599                 iommu_set_root_entry(iommu);
2600
2601                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2602                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2603
2604                 ret = iommu_enable_translation(iommu);
2605                 if (ret)
2606                         goto error;
2607
2608                 iommu_disable_protect_mem_regions(iommu);
2609         }
2610
2611         return 0;
2612 error:
2613         for_each_drhd_unit(drhd) {
2614                 if (drhd->ignored)
2615                         continue;
2616                 iommu = drhd->iommu;
2617                 free_iommu(iommu);
2618         }
2619         kfree(g_iommus);
2620         return ret;
2621 }
2622
2623 /* This takes a number of _MM_ pages, not VTD pages */
2624 static struct iova *intel_alloc_iova(struct device *dev,
2625                                      struct dmar_domain *domain,
2626                                      unsigned long nrpages, uint64_t dma_mask)
2627 {
2628         struct pci_dev *pdev = to_pci_dev(dev);
2629         struct iova *iova = NULL;
2630
2631         /* Restrict dma_mask to the width that the iommu can handle */
2632         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2633
2634         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2635                 /*
2636                  * First try to allocate an io virtual address in
2637                  * DMA_BIT_MASK(32) and if that fails then try allocating
2638                  * from the higher range
2639                  */
2640                 iova = alloc_iova(&domain->iovad, nrpages,
2641                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2642                 if (iova)
2643                         return iova;
2644         }
2645         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2646         if (unlikely(!iova)) {
2647                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2648                        nrpages, pci_name(pdev));
2649                 return NULL;
2650         }
2651
2652         return iova;
2653 }
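/*
 * Callers pass MM-page counts here (e.g. __intel_map_single() passes
 * dma_to_mm_pfn(aligned_nrpages(paddr, size))); for devices with a mask
 * wider than 32 bits a 32-bit iova is tried first unless dmar_forcedac
 * is set.
 */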
2654
2655 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2656 {
2657         struct dmar_domain *domain;
2658         int ret;
2659
2660         domain = get_domain_for_dev(pdev,
2661                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2662         if (!domain) {
2663                 printk(KERN_ERR
2664                         "Allocating domain for %s failed", pci_name(pdev));
2665                 return NULL;
2666         }
2667
2668         /* make sure context mapping is ok */
2669         if (unlikely(!domain_context_mapped(pdev))) {
2670                 ret = domain_context_mapping(domain, pdev,
2671                                              CONTEXT_TT_MULTI_LEVEL);
2672                 if (ret) {
2673                         printk(KERN_ERR
2674                                 "Domain context map for %s failed",
2675                                 pci_name(pdev));
2676                         return NULL;
2677                 }
2678         }
2679
2680         return domain;
2681 }
2682
2683 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2684 {
2685         struct device_domain_info *info;
2686
2687         /* No lock here, assumes no domain exit in normal case */
2688         info = dev->dev.archdata.iommu;
2689         if (likely(info))
2690                 return info->domain;
2691
2692         return __get_valid_domain_for_dev(dev);
2693 }
2694
2695 static int iommu_dummy(struct pci_dev *pdev)
2696 {
2697         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2698 }
2699
2700 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2701 static int iommu_no_mapping(struct device *dev)
2702 {
2703         struct pci_dev *pdev;
2704         int found;
2705
2706         if (unlikely(dev->bus != &pci_bus_type))
2707                 return 1;
2708
2709         pdev = to_pci_dev(dev);
2710         if (iommu_dummy(pdev))
2711                 return 1;
2712
2713         if (!iommu_identity_mapping)
2714                 return 0;
2715
2716         found = identity_mapping(pdev);
2717         if (found) {
2718                 if (iommu_should_identity_map(pdev, 0))
2719                         return 1;
2720                 else {
2721                         /*
2722                          * A 32-bit DMA device is removed from si_domain
2723                          * and falls back to non-identity mapping.
2724                          */
2725                         domain_remove_one_dev_info(si_domain, pdev);
2726                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2727                                pci_name(pdev));
2728                         return 0;
2729                 }
2730         } else {
2731                 /*
2732                  * When a 64-bit DMA device is detached from a VM, the device
2733                  * is put back into si_domain for identity mapping.
2734                  */
2735                 if (iommu_should_identity_map(pdev, 0)) {
2736                         int ret;
2737                         ret = domain_add_dev_info(si_domain, pdev,
2738                                                   hw_pass_through ?
2739                                                   CONTEXT_TT_PASS_THROUGH :
2740                                                   CONTEXT_TT_MULTI_LEVEL);
2741                         if (!ret) {
2742                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2743                                        pci_name(pdev));
2744                                 return 1;
2745                         }
2746                 }
2747         }
2748
2749         return 0;
2750 }
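/*
 * In short: returns 1 when DMA for this device should bypass translation
 * (non-PCI, dummy, or identity-mapped devices), 0 when it must go through
 * the normal map/unmap path; devices are migrated into or out of si_domain
 * here based on iommu_should_identity_map().
 */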
2751
2752 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2753                                      size_t size, int dir, u64 dma_mask)
2754 {
2755         struct pci_dev *pdev = to_pci_dev(hwdev);
2756         struct dmar_domain *domain;
2757         phys_addr_t start_paddr;
2758         struct iova *iova;
2759         int prot = 0;
2760         int ret;
2761         struct intel_iommu *iommu;
2762         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2763
2764         BUG_ON(dir == DMA_NONE);
2765
2766         if (iommu_no_mapping(hwdev))
2767                 return paddr;
2768
2769         domain = get_valid_domain_for_dev(pdev);
2770         if (!domain)
2771                 return 0;
2772
2773         iommu = domain_get_iommu(domain);
2774         size = aligned_nrpages(paddr, size);
2775
2776         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2777         if (!iova)
2778                 goto error;
2779
2780         /*
2781          * Check if DMAR supports zero-length reads on write only
2782          * mappings.
2783          */
2784         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2785                         !cap_zlr(iommu->cap))
2786                 prot |= DMA_PTE_READ;
2787         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2788                 prot |= DMA_PTE_WRITE;
2789         /*
2790          * paddr - (paddr + size) might cover only a partial page, but we
2791          * should map the whole page.  Note: if two parts of one page are
2792          * separately mapped, we might have two guest addresses mapping to
2793          * the same host paddr, but this is not a big problem
2794          */
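        /*
         * e.g. paddr = 0x1f00 and size = 0x200: aligned_nrpages() yields two
         * 4KiB pages, so the whole 0x1000 - 0x2fff host range is mapped even
         * though only 0x1f00 - 0x20ff was asked for.
         */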
2795         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2796                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2797         if (ret)
2798                 goto error;
2799
2800         /* it's a non-present to present mapping. Only flush if in caching mode */
2801         if (cap_caching_mode(iommu->cap))
2802                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2803         else
2804                 iommu_flush_write_buffer(iommu);
2805
2806         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2807         start_paddr += paddr & ~PAGE_MASK;
2808         return start_paddr;
2809
2810 error:
2811         if (iova)
2812                 __free_iova(&domain->iovad, iova);
2813         printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2814                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2815         return 0;
2816 }
2817
2818 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2819                                  unsigned long offset, size_t size,
2820                                  enum dma_data_direction dir,
2821                                  struct dma_attrs *attrs)
2822 {
2823         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2824                                   dir, to_pci_dev(dev)->dma_mask);
2825 }
2826
2827 static void flush_unmaps(void)
2828 {
2829         int i, j;
2830
2831         timer_on = 0;
2832
2833         /* just flush them all */
2834         for (i = 0; i < g_num_of_iommus; i++) {
2835                 struct intel_iommu *iommu = g_iommus[i];
2836                 if (!iommu)
2837                         continue;
2838
2839                 if (!deferred_flush[i].next)
2840                         continue;
2841
2842                 /* In caching mode, global flushes make emulation expensive */
2843                 if (!cap_caching_mode(iommu->cap))
2844                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2845                                          DMA_TLB_GLOBAL_FLUSH);
2846                 for (j = 0; j < deferred_flush[i].next; j++) {
2847                         unsigned long mask;
2848                         struct iova *iova = deferred_flush[i].iova[j];
2849                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2850
2851                         /* On real hardware multiple invalidations are expensive */
2852                         if (cap_caching_mode(iommu->cap))
2853                                 iommu_flush_iotlb_psi(iommu, domain->id,
2854                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2855                         else {
2856                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2857                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2858                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2859                         }
2860                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2861                 }
2862                 deferred_flush[i].next = 0;
2863         }
2864
2865         list_size = 0;
2866 }
2867
2868 static void flush_unmaps_timeout(unsigned long data)
2869 {
2870         unsigned long flags;
2871
2872         spin_lock_irqsave(&async_umap_flush_lock, flags);
2873         flush_unmaps();
2874         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2875 }
2876
2877 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2878 {
2879         unsigned long flags;
2880         int next, iommu_id;
2881         struct intel_iommu *iommu;
2882
2883         spin_lock_irqsave(&async_umap_flush_lock, flags);
2884         if (list_size == HIGH_WATER_MARK)
2885                 flush_unmaps();
2886
2887         iommu = domain_get_iommu(dom);
2888         iommu_id = iommu->seq_id;
2889
2890         next = deferred_flush[iommu_id].next;
2891         deferred_flush[iommu_id].domain[next] = dom;
2892         deferred_flush[iommu_id].iova[next] = iova;
2893         deferred_flush[iommu_id].next++;
2894
2895         if (!timer_on) {
2896                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2897                 timer_on = 1;
2898         }
2899         list_size++;
2900         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2901 }
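/*
 * Deferred unmaps accumulate per IOMMU above and are released in batch by
 * flush_unmaps(), either when list_size reaches HIGH_WATER_MARK or when the
 * 10ms unmap_timer fires; intel_iommu_strict bypasses this batching and
 * flushes synchronously in the unmap paths below.
 */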
2902
2903 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2904                              size_t size, enum dma_data_direction dir,
2905                              struct dma_attrs *attrs)
2906 {
2907         struct pci_dev *pdev = to_pci_dev(dev);
2908         struct dmar_domain *domain;
2909         unsigned long start_pfn, last_pfn;
2910         struct iova *iova;
2911         struct intel_iommu *iommu;
2912
2913         if (iommu_no_mapping(dev))
2914                 return;
2915
2916         domain = find_domain(pdev);
2917         BUG_ON(!domain);
2918
2919         iommu = domain_get_iommu(domain);
2920
2921         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2922         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2923                       (unsigned long long)dev_addr))
2924                 return;
2925
2926         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2927         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2928
2929         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2930                  pci_name(pdev), start_pfn, last_pfn);
2931
2932         /* clear the whole range */
2933         dma_pte_clear_range(domain, start_pfn, last_pfn);
2934
2935         /* free page tables */
2936         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2937
2938         if (intel_iommu_strict) {
2939                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2940                                       last_pfn - start_pfn + 1, 0);
2941                 /* free iova */
2942                 __free_iova(&domain->iovad, iova);
2943         } else {
2944                 add_unmap(domain, iova);
2945                 /*
2946                  * queue up the release of the unmap to save the roughly 1/6th
2947                  * of the cpu time used up by the iotlb flush operation...
2948                  */
2949         }
2950 }
2951
2952 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2953                                   dma_addr_t *dma_handle, gfp_t flags,
2954                                   struct dma_attrs *attrs)
2955 {
2956         void *vaddr;
2957         int order;
2958
2959         size = PAGE_ALIGN(size);
2960         order = get_order(size);
2961
2962         if (!iommu_no_mapping(hwdev))
2963                 flags &= ~(GFP_DMA | GFP_DMA32);
2964         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2965                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2966                         flags |= GFP_DMA;
2967                 else
2968                         flags |= GFP_DMA32;
2969         }
2970
2971         vaddr = (void *)__get_free_pages(flags, order);
2972         if (!vaddr)
2973                 return NULL;
2974         memset(vaddr, 0, size);
2975
2976         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2977                                          DMA_BIDIRECTIONAL,
2978                                          hwdev->coherent_dma_mask);
2979         if (*dma_handle)
2980                 return vaddr;
2981         free_pages((unsigned long)vaddr, order);
2982         return NULL;
2983 }
2984
2985 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2986                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
2987 {
2988         int order;
2989
2990         size = PAGE_ALIGN(size);
2991         order = get_order(size);
2992
2993         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2994         free_pages((unsigned long)vaddr, order);
2995 }
2996
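/*
 * Scatterlist counterpart of intel_unmap_page().  The whole list was
 * mapped into one contiguous IOVA range at map time, so the IOVA
 * covering sglist[0].dma_address identifies the entire range to be
 * torn down.
 */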
2997 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2998                            int nelems, enum dma_data_direction dir,
2999                            struct dma_attrs *attrs)
3000 {
3001         struct pci_dev *pdev = to_pci_dev(hwdev);
3002         struct dmar_domain *domain;
3003         unsigned long start_pfn, last_pfn;
3004         struct iova *iova;
3005         struct intel_iommu *iommu;
3006
3007         if (iommu_no_mapping(hwdev))
3008                 return;
3009
3010         domain = find_domain(pdev);
3011         BUG_ON(!domain);
3012
3013         iommu = domain_get_iommu(domain);
3014
3015         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3016         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3017                       (unsigned long long)sglist[0].dma_address))
3018                 return;
3019
3020         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3021         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3022
3023         /*  clear the whole page */
3024         dma_pte_clear_range(domain, start_pfn, last_pfn);
3025
3026         /* free page tables */
3027         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3028
3029         if (intel_iommu_strict) {
3030                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3031                                       last_pfn - start_pfn + 1, 0);
3032                 /* free iova */
3033                 __free_iova(&domain->iovad, iova);
3034         } else {
3035                 add_unmap(domain, iova);
3036                 /*
3037                  * Queue up the release of the unmap to save the roughly
3038                  * 1/6th of a CPU otherwise consumed by the iotlb flush...
3039                  */
3040         }
3041 }
3042
3043 static int intel_nontranslate_map_sg(struct device *hwdev,
3044         struct scatterlist *sglist, int nelems, int dir)
3045 {
3046         int i;
3047         struct scatterlist *sg;
3048
3049         for_each_sg(sglist, sg, nelems, i) {
3050                 BUG_ON(!sg_page(sg));
3051                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3052                 sg->dma_length = sg->length;
3053         }
3054         return nelems;
3055 }
3056
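/*
 * Map a scatterlist for DMA.  The page counts of all entries are
 * summed and allocated as a single IOVA range, protection bits are
 * derived from the DMA direction (and the IOMMU's zero-length-read
 * capability), and the list is mapped with domain_sg_mapping().
 * Caching-mode hardware gets an IOTLB flush for the new mapping;
 * otherwise only the write buffer is flushed.
 */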
3057 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3058                         enum dma_data_direction dir, struct dma_attrs *attrs)
3059 {
3060         int i;
3061         struct pci_dev *pdev = to_pci_dev(hwdev);
3062         struct dmar_domain *domain;
3063         size_t size = 0;
3064         int prot = 0;
3065         struct iova *iova = NULL;
3066         int ret;
3067         struct scatterlist *sg;
3068         unsigned long start_vpfn;
3069         struct intel_iommu *iommu;
3070
3071         BUG_ON(dir == DMA_NONE);
3072         if (iommu_no_mapping(hwdev))
3073                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3074
3075         domain = get_valid_domain_for_dev(pdev);
3076         if (!domain)
3077                 return 0;
3078
3079         iommu = domain_get_iommu(domain);
3080
3081         for_each_sg(sglist, sg, nelems, i)
3082                 size += aligned_nrpages(sg->offset, sg->length);
3083
3084         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3085                                 pdev->dma_mask);
3086         if (!iova) {
3087                 sglist->dma_length = 0;
3088                 return 0;
3089         }
3090
3091         /*
3092          * Check if DMAR supports zero-length reads on write-only
3093          * mappings.
3094          */
3095         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3096                         !cap_zlr(iommu->cap))
3097                 prot |= DMA_PTE_READ;
3098         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3099                 prot |= DMA_PTE_WRITE;
3100
3101         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3102
3103         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3104         if (unlikely(ret)) {
3105                 /*  clear the page */
3106                 dma_pte_clear_range(domain, start_vpfn,
3107                                     start_vpfn + size - 1);
3108                 /* free page tables */
3109                 dma_pte_free_pagetable(domain, start_vpfn,
3110                                        start_vpfn + size - 1);
3111                 /* free iova */
3112                 __free_iova(&domain->iovad, iova);
3113                 return 0;
3114         }
3115
3116         /* it's a non-present to present mapping. Only flush if caching mode */
3117         if (cap_caching_mode(iommu->cap))
3118                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3119         else
3120                 iommu_flush_write_buffer(iommu);
3121
3122         return nelems;
3123 }
3124
3125 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3126 {
3127         return !dma_addr;
3128 }
3129
3130 struct dma_map_ops intel_dma_ops = {
3131         .alloc = intel_alloc_coherent,
3132         .free = intel_free_coherent,
3133         .map_sg = intel_map_sg,
3134         .unmap_sg = intel_unmap_sg,
3135         .map_page = intel_map_page,
3136         .unmap_page = intel_unmap_page,
3137         .mapping_error = intel_mapping_error,
3138 };
3139
3140 static inline int iommu_domain_cache_init(void)
3141 {
3142         int ret = 0;
3143
3144         iommu_domain_cache = kmem_cache_create("iommu_domain",
3145                                          sizeof(struct dmar_domain),
3146                                          0,
3147                                          SLAB_HWCACHE_ALIGN,
3148                                          NULL);
3150         if (!iommu_domain_cache) {
3151                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3152                 ret = -ENOMEM;
3153         }
3154
3155         return ret;
3156 }
3157
3158 static inline int iommu_devinfo_cache_init(void)
3159 {
3160         int ret = 0;
3161
3162         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3163                                          sizeof(struct device_domain_info),
3164                                          0,
3165                                          SLAB_HWCACHE_ALIGN,
3166                                          NULL);
3167         if (!iommu_devinfo_cache) {
3168                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3169                 ret = -ENOMEM;
3170         }
3171
3172         return ret;
3173 }
3174
3175 static inline int iommu_iova_cache_init(void)
3176 {
3177         int ret = 0;
3178
3179         iommu_iova_cache = kmem_cache_create("iommu_iova",
3180                                          sizeof(struct iova),
3181                                          0,
3182                                          SLAB_HWCACHE_ALIGN,
3183                                          NULL);
3184         if (!iommu_iova_cache) {
3185                 printk(KERN_ERR "Couldn't create iova cache\n");
3186                 ret = -ENOMEM;
3187         }
3188
3189         return ret;
3190 }
3191
3192 static int __init iommu_init_mempool(void)
3193 {
3194         int ret;
3195         ret = iommu_iova_cache_init();
3196         if (ret)
3197                 return ret;
3198
3199         ret = iommu_domain_cache_init();
3200         if (ret)
3201                 goto domain_error;
3202
3203         ret = iommu_devinfo_cache_init();
3204         if (!ret)
3205                 return ret;
3206
3207         kmem_cache_destroy(iommu_domain_cache);
3208 domain_error:
3209         kmem_cache_destroy(iommu_iova_cache);
3210
3211         return -ENOMEM;
3212 }
3213
3214 static void __init iommu_exit_mempool(void)
3215 {
3216         kmem_cache_destroy(iommu_devinfo_cache);
3217         kmem_cache_destroy(iommu_domain_cache);
3218         kmem_cache_destroy(iommu_iova_cache);
3219
3220 }
3221
3222 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3223 {
3224         struct dmar_drhd_unit *drhd;
3225         u32 vtbar;
3226         int rc;
3227
3228         /* We know that this device on this chipset has its own IOMMU.
3229          * If we find it under a different IOMMU, then the BIOS is lying
3230          * to us. Hope that the IOMMU for this device is actually
3231          * disabled, and it needs no translation...
3232          */
3233         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3234         if (rc) {
3235                 /* "can't" happen */
3236                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3237                 return;
3238         }
3239         vtbar &= 0xffff0000;
3240
3241         /* we know that this iommu should be at offset 0xa000 from vtbar */
3242         drhd = dmar_find_matched_drhd_unit(pdev);
3243         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3244                             TAINT_FIRMWARE_WORKAROUND,
3245                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3246                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3247 }
3248 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3249
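/*
 * Two passes over the DRHD units: first, ignore any unit whose device
 * scope contains no PCI devices at all.  Second, for units that cover
 * only graphics devices, either record that graphics is IOMMU-mapped
 * (when dmar_map_gfx is set) or ignore the unit and mark its devices
 * with DUMMY_DEVICE_DOMAIN_INFO so they bypass translation.
 */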
3250 static void __init init_no_remapping_devices(void)
3251 {
3252         struct dmar_drhd_unit *drhd;
3253
3254         for_each_drhd_unit(drhd) {
3255                 if (!drhd->include_all) {
3256                         int i;
3257                         for (i = 0; i < drhd->devices_cnt; i++)
3258                                 if (drhd->devices[i] != NULL)
3259                                         break;
3260                         /* ignore DMAR unit if no pci devices exist */
3261                         if (i == drhd->devices_cnt)
3262                                 drhd->ignored = 1;
3263                 }
3264         }
3265
3266         for_each_drhd_unit(drhd) {
3267                 int i;
3268                 if (drhd->ignored || drhd->include_all)
3269                         continue;
3270
3271                 for (i = 0; i < drhd->devices_cnt; i++)
3272                         if (drhd->devices[i] &&
3273                             !IS_GFX_DEVICE(drhd->devices[i]))
3274                                 break;
3275
3276                 if (i < drhd->devices_cnt)
3277                         continue;
3278
3279                 /* This IOMMU has *only* gfx devices. Either bypass it or
3280                    set the gfx_mapped flag, as appropriate */
3281                 if (dmar_map_gfx) {
3282                         intel_iommu_gfx_mapped = 1;
3283                 } else {
3284                         drhd->ignored = 1;
3285                         for (i = 0; i < drhd->devices_cnt; i++) {
3286                                 if (!drhd->devices[i])
3287                                         continue;
3288                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3289                         }
3290                 }
3291         }
3292 }
3293
3294 #ifdef CONFIG_SUSPEND
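/*
 * Reprogram the IOMMUs on resume: re-enable queued invalidation where
 * it was in use, reload the root entries, perform global context and
 * IOTLB invalidations and turn translation back on.  Protected memory
 * regions are disabled even on ignored units when the kernel was
 * forced on (tboot).
 */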
3295 static int init_iommu_hw(void)
3296 {
3297         struct dmar_drhd_unit *drhd;
3298         struct intel_iommu *iommu = NULL;
3299
3300         for_each_active_iommu(iommu, drhd)
3301                 if (iommu->qi)
3302                         dmar_reenable_qi(iommu);
3303
3304         for_each_iommu(iommu, drhd) {
3305                 if (drhd->ignored) {
3306                         /*
3307                          * we always have to disable PMRs or DMA may fail on
3308                          * this device
3309                          */
3310                         if (force_on)
3311                                 iommu_disable_protect_mem_regions(iommu);
3312                         continue;
3313                 }
3314
3315                 iommu_flush_write_buffer(iommu);
3316
3317                 iommu_set_root_entry(iommu);
3318
3319                 iommu->flush.flush_context(iommu, 0, 0, 0,
3320                                            DMA_CCMD_GLOBAL_INVL);
3321                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3322                                          DMA_TLB_GLOBAL_FLUSH);
3323                 if (iommu_enable_translation(iommu))
3324                         return 1;
3325                 iommu_disable_protect_mem_regions(iommu);
3326         }
3327
3328         return 0;
3329 }
3330
3331 static void iommu_flush_all(void)
3332 {
3333         struct dmar_drhd_unit *drhd;
3334         struct intel_iommu *iommu;
3335
3336         for_each_active_iommu(iommu, drhd) {
3337                 iommu->flush.flush_context(iommu, 0, 0, 0,
3338                                            DMA_CCMD_GLOBAL_INVL);
3339                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3340                                          DMA_TLB_GLOBAL_FLUSH);
3341         }
3342 }
3343
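/*
 * Save IOMMU state for suspend: allocate a register save area per
 * active IOMMU, do a global flush, disable translation and snapshot
 * the fault-event control/data/address registers under register_lock.
 * iommu_resume() below restores those registers after init_iommu_hw()
 * has brought the units back up.
 */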
3344 static int iommu_suspend(void)
3345 {
3346         struct dmar_drhd_unit *drhd;
3347         struct intel_iommu *iommu = NULL;
3348         unsigned long flag;
3349
3350         for_each_active_iommu(iommu, drhd) {
3351                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3352                                                  GFP_ATOMIC);
3353                 if (!iommu->iommu_state)
3354                         goto nomem;
3355         }
3356
3357         iommu_flush_all();
3358
3359         for_each_active_iommu(iommu, drhd) {
3360                 iommu_disable_translation(iommu);
3361
3362                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3363
3364                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3365                         readl(iommu->reg + DMAR_FECTL_REG);
3366                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3367                         readl(iommu->reg + DMAR_FEDATA_REG);
3368                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3369                         readl(iommu->reg + DMAR_FEADDR_REG);
3370                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3371                         readl(iommu->reg + DMAR_FEUADDR_REG);
3372
3373                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3374         }
3375         return 0;
3376
3377 nomem:
3378         for_each_active_iommu(iommu, drhd)
3379                 kfree(iommu->iommu_state);
3380
3381         return -ENOMEM;
3382 }
3383
3384 static void iommu_resume(void)
3385 {
3386         struct dmar_drhd_unit *drhd;
3387         struct intel_iommu *iommu = NULL;
3388         unsigned long flag;
3389
3390         if (init_iommu_hw()) {
3391                 if (force_on)
3392                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3393                 else
3394                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3395                 return;
3396         }
3397
3398         for_each_active_iommu(iommu, drhd) {
3399
3400                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3401
3402                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3403                         iommu->reg + DMAR_FECTL_REG);
3404                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3405                         iommu->reg + DMAR_FEDATA_REG);
3406                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3407                         iommu->reg + DMAR_FEADDR_REG);
3408                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3409                         iommu->reg + DMAR_FEUADDR_REG);
3410
3411                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3412         }
3413
3414         for_each_active_iommu(iommu, drhd)
3415                 kfree(iommu->iommu_state);
3416 }
3417
3418 static struct syscore_ops iommu_syscore_ops = {
3419         .resume         = iommu_resume,
3420         .suspend        = iommu_suspend,
3421 };
3422
3423 static void __init init_iommu_pm_ops(void)
3424 {
3425         register_syscore_ops(&iommu_syscore_ops);
3426 }
3427
3428 #else
3429 static inline void init_iommu_pm_ops(void) {}
3430 #endif  /* CONFIG_SUSPEND */
3431
3432 LIST_HEAD(dmar_rmrr_units);
3433
3434 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3435 {
3436         list_add(&rmrr->list, &dmar_rmrr_units);
3437 }
3438
3439
3440 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3441 {
3442         struct acpi_dmar_reserved_memory *rmrr;
3443         struct dmar_rmrr_unit *rmrru;
3444
3445         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3446         if (!rmrru)
3447                 return -ENOMEM;
3448
3449         rmrru->hdr = header;
3450         rmrr = (struct acpi_dmar_reserved_memory *)header;
3451         rmrru->base_address = rmrr->base_address;
3452         rmrru->end_address = rmrr->end_address;
3453
3454         dmar_register_rmrr_unit(rmrru);
3455         return 0;
3456 }
3457
3458 static int __init
3459 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3460 {
3461         struct acpi_dmar_reserved_memory *rmrr;
3462         int ret;
3463
3464         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3465         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3466                 ((void *)rmrr) + rmrr->header.length,
3467                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3468
3469         if (ret || (rmrru->devices_cnt == 0)) {
3470                 list_del(&rmrru->list);
3471                 kfree(rmrru);
3472         }
3473         return ret;
3474 }
3475
3476 static LIST_HEAD(dmar_atsr_units);
3477
3478 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3479 {
3480         struct acpi_dmar_atsr *atsr;
3481         struct dmar_atsr_unit *atsru;
3482
3483         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3484         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3485         if (!atsru)
3486                 return -ENOMEM;
3487
3488         atsru->hdr = hdr;
3489         atsru->include_all = atsr->flags & 0x1;
3490
3491         list_add(&atsru->list, &dmar_atsr_units);
3492
3493         return 0;
3494 }
3495
3496 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3497 {
3498         int rc;
3499         struct acpi_dmar_atsr *atsr;
3500
3501         if (atsru->include_all)
3502                 return 0;
3503
3504         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3505         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3506                                 (void *)atsr + atsr->header.length,
3507                                 &atsru->devices_cnt, &atsru->devices,
3508                                 atsr->segment);
3509         if (rc || !atsru->devices_cnt) {
3510                 list_del(&atsru->list);
3511                 kfree(atsru);
3512         }
3513
3514         return rc;
3515 }
3516
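/*
 * Return 1 if @dev is covered by an ATSR unit: one exists on the
 * device's PCI segment and either lists the root port above the
 * device in its scope or is flagged INCLUDE_ALL.  Virtual functions
 * are checked via their physical function.
 */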
3517 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3518 {
3519         int i;
3520         struct pci_bus *bus;
3521         struct acpi_dmar_atsr *atsr;
3522         struct dmar_atsr_unit *atsru;
3523
3524         dev = pci_physfn(dev);
3525
3526         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3527                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3528                 if (atsr->segment == pci_domain_nr(dev->bus))
3529                         goto found;
3530         }
3531
3532         return 0;
3533
3534 found:
3535         for (bus = dev->bus; bus; bus = bus->parent) {
3536                 struct pci_dev *bridge = bus->self;
3537
3538                 if (!bridge || !pci_is_pcie(bridge) ||
3539                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3540                         return 0;
3541
3542                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3543                         for (i = 0; i < atsru->devices_cnt; i++)
3544                                 if (atsru->devices[i] == bridge)
3545                                         return 1;
3546                         break;
3547                 }
3548         }
3549
3550         if (atsru->include_all)
3551                 return 1;
3552
3553         return 0;
3554 }
3555
3556 int __init dmar_parse_rmrr_atsr_dev(void)
3557 {
3558         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3559         struct dmar_atsr_unit *atsr, *atsr_n;
3560         int ret = 0;
3561
3562         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3563                 ret = rmrr_parse_dev(rmrr);
3564                 if (ret)
3565                         return ret;
3566         }
3567
3568         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3569                 ret = atsr_parse_dev(atsr);
3570                 if (ret)
3571                         return ret;
3572         }
3573
3574         return ret;
3575 }
3576
3577 /*
3578  * Here we only respond to a device being unbound from its driver.
3579  *
3580  * A newly added device is not attached to its DMAR domain here yet; that
3581  * happens when the device is first mapped to an iova.
3582  */
3583 static int device_notifier(struct notifier_block *nb,
3584                                   unsigned long action, void *data)
3585 {
3586         struct device *dev = data;
3587         struct pci_dev *pdev = to_pci_dev(dev);
3588         struct dmar_domain *domain;
3589
3590         if (iommu_no_mapping(dev))
3591                 return 0;
3592
3593         domain = find_domain(pdev);
3594         if (!domain)
3595                 return 0;
3596
3597         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3598                 domain_remove_one_dev_info(domain, pdev);
3599
3600                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3601                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3602                     list_empty(&domain->devices))
3603                         domain_exit(domain);
3604         }
3605
3606         return 0;
3607 }
3608
3609 static struct notifier_block device_nb = {
3610         .notifier_call = device_notifier,
3611 };
3612
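/*
 * Main entry point: parse the DMAR table and device scopes, set up
 * the iova/domain/devinfo caches and the reserved IOVA ranges,
 * initialize the DMAR units, then install intel_dma_ops as the DMA
 * API backend, register the suspend/resume hooks and the IOMMU API
 * ops, and hook the PCI bus notifier that tears down domains when a
 * driver is unbound.
 */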
3613 int __init intel_iommu_init(void)
3614 {
3615         int ret = 0;
3616
3617         /* VT-d is required for a TXT/tboot launch, so enforce that */
3618         force_on = tboot_force_iommu();
3619
3620         if (dmar_table_init()) {
3621                 if (force_on)
3622                         panic("tboot: Failed to initialize DMAR table\n");
3623                 return  -ENODEV;
3624         }
3625
3626         if (dmar_dev_scope_init() < 0) {
3627                 if (force_on)
3628                         panic("tboot: Failed to initialize DMAR device scope\n");
3629                 return  -ENODEV;
3630         }
3631
3632         if (no_iommu || dmar_disabled)
3633                 return -ENODEV;
3634
3635         if (iommu_init_mempool()) {
3636                 if (force_on)
3637                         panic("tboot: Failed to initialize iommu memory\n");
3638                 return  -ENODEV;
3639         }
3640
3641         if (list_empty(&dmar_rmrr_units))
3642                 printk(KERN_INFO "DMAR: No RMRR found\n");
3643
3644         if (list_empty(&dmar_atsr_units))
3645                 printk(KERN_INFO "DMAR: No ATSR found\n");
3646
3647         if (dmar_init_reserved_ranges()) {
3648                 if (force_on)
3649                         panic("tboot: Failed to reserve iommu ranges\n");
3650                 return  -ENODEV;
3651         }
3652
3653         init_no_remapping_devices();
3654
3655         ret = init_dmars();
3656         if (ret) {
3657                 if (force_on)
3658                         panic("tboot: Failed to initialize DMARs\n");
3659                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3660                 put_iova_domain(&reserved_iova_list);
3661                 iommu_exit_mempool();
3662                 return ret;
3663         }
3664         printk(KERN_INFO
3665         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3666
3667         init_timer(&unmap_timer);
3668 #ifdef CONFIG_SWIOTLB
3669         swiotlb = 0;
3670 #endif
3671         dma_ops = &intel_dma_ops;
3672
3673         init_iommu_pm_ops();
3674
3675         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3676
3677         bus_register_notifier(&pci_bus_type, &device_nb);
3678
3679         intel_iommu_enabled = 1;
3680
3681         return 0;
3682 }
3683
3684 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3685                                            struct pci_dev *pdev)
3686 {
3687         struct pci_dev *tmp, *parent;
3688
3689         if (!iommu || !pdev)
3690                 return;
3691
3692         /* dependent device detach */
3693         tmp = pci_find_upstream_pcie_bridge(pdev);
3694         /* Secondary interface's bus number and devfn 0 */
3695         if (tmp) {
3696                 parent = pdev->bus->self;
3697                 while (parent != tmp) {
3698                         iommu_detach_dev(iommu, parent->bus->number,
3699                                          parent->devfn);
3700                         parent = parent->bus->self;
3701                 }
3702                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3703                         iommu_detach_dev(iommu,
3704                                 tmp->subordinate->number, 0);
3705                 else /* this is a legacy PCI bridge */
3706                         iommu_detach_dev(iommu, tmp->bus->number,
3707                                          tmp->devfn);
3708         }
3709 }
3710
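/*
 * Detach @pdev from @domain: remove its device_domain_info, clear its
 * context entry (and those of any bridges it sits behind), and if no
 * other device behind the same IOMMU remains in the domain, drop that
 * IOMMU from the domain's bitmap; for non-VM, non-static-identity
 * domains the domain id on that IOMMU is released as well.
 */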
3711 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3712                                           struct pci_dev *pdev)
3713 {
3714         struct device_domain_info *info;
3715         struct intel_iommu *iommu;
3716         unsigned long flags;
3717         int found = 0;
3718         struct list_head *entry, *tmp;
3719
3720         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3721                                 pdev->devfn);
3722         if (!iommu)
3723                 return;
3724
3725         spin_lock_irqsave(&device_domain_lock, flags);
3726         list_for_each_safe(entry, tmp, &domain->devices) {
3727                 info = list_entry(entry, struct device_domain_info, link);
3728                 if (info->segment == pci_domain_nr(pdev->bus) &&
3729                     info->bus == pdev->bus->number &&
3730                     info->devfn == pdev->devfn) {
3731                         list_del(&info->link);
3732                         list_del(&info->global);
3733                         if (info->dev)
3734                                 info->dev->dev.archdata.iommu = NULL;
3735                         spin_unlock_irqrestore(&device_domain_lock, flags);
3736
3737                         iommu_disable_dev_iotlb(info);
3738                         iommu_detach_dev(iommu, info->bus, info->devfn);
3739                         iommu_detach_dependent_devices(iommu, pdev);
3740                         free_devinfo_mem(info);
3741
3742                         spin_lock_irqsave(&device_domain_lock, flags);
3743
3744                         if (found)
3745                                 break;
3746                         else
3747                                 continue;
3748                 }
3749
3750                 /* if there are no other devices under the same iommu
3751                  * owned by this domain, clear this iommu in iommu_bmp
3752                  * and update the iommu count and coherency
3753                  */
3754                 if (iommu == device_to_iommu(info->segment, info->bus,
3755                                             info->devfn))
3756                         found = 1;
3757         }
3758
3759         spin_unlock_irqrestore(&device_domain_lock, flags);
3760
3761         if (found == 0) {
3762                 unsigned long tmp_flags;
3763                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3764                 clear_bit(iommu->seq_id, domain->iommu_bmp);
3765                 domain->iommu_count--;
3766                 domain_update_iommu_cap(domain);
3767                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3768
3769                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3770                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3771                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3772                         clear_bit(domain->id, iommu->domain_ids);
3773                         iommu->domains[domain->id] = NULL;
3774                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3775                 }
3776         }
3777 }
3778
3779 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3780 {
3781         struct device_domain_info *info;
3782         struct intel_iommu *iommu;
3783         unsigned long flags1, flags2;
3784
3785         spin_lock_irqsave(&device_domain_lock, flags1);
3786         while (!list_empty(&domain->devices)) {
3787                 info = list_entry(domain->devices.next,
3788                         struct device_domain_info, link);
3789                 list_del(&info->link);
3790                 list_del(&info->global);
3791                 if (info->dev)
3792                         info->dev->dev.archdata.iommu = NULL;
3793
3794                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3795
3796                 iommu_disable_dev_iotlb(info);
3797                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3798                 iommu_detach_dev(iommu, info->bus, info->devfn);
3799                 iommu_detach_dependent_devices(iommu, info->dev);
3800
3801                 /* clear this iommu in iommu_bmp, update iommu count
3802                  * and capabilities
3803                  */
3804                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3805                 if (test_and_clear_bit(iommu->seq_id,
3806                                        domain->iommu_bmp)) {
3807                         domain->iommu_count--;
3808                         domain_update_iommu_cap(domain);
3809                 }
3810                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3811
3812                 free_devinfo_mem(info);
3813                 spin_lock_irqsave(&device_domain_lock, flags1);
3814         }
3815         spin_unlock_irqrestore(&device_domain_lock, flags1);
3816 }
3817
3818 /* domain id for virtual machine domains; it is never written into a context entry */
3819 static unsigned long vm_domid;
3820
3821 static struct dmar_domain *iommu_alloc_vm_domain(void)
3822 {
3823         struct dmar_domain *domain;
3824
3825         domain = alloc_domain_mem();
3826         if (!domain)
3827                 return NULL;
3828
3829         domain->id = vm_domid++;
3830         domain->nid = -1;
3831         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3832         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3833
3834         return domain;
3835 }
3836
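/*
 * Finish initializing a domain created by iommu_alloc_vm_domain():
 * set up its iova allocator and reserved ranges, derive the agaw from
 * the requested guest address width and allocate the top-level page
 * table.
 */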
3837 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3838 {
3839         int adjust_width;
3840
3841         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3842         spin_lock_init(&domain->iommu_lock);
3843
3844         domain_reserve_special_ranges(domain);
3845
3846         /* calculate AGAW */
3847         domain->gaw = guest_width;
3848         adjust_width = guestwidth_to_adjustwidth(guest_width);
3849         domain->agaw = width_to_agaw(adjust_width);
3850
3851         INIT_LIST_HEAD(&domain->devices);
3852
3853         domain->iommu_count = 0;
3854         domain->iommu_coherency = 0;
3855         domain->iommu_snooping = 0;
3856         domain->iommu_superpage = 0;
3857         domain->max_addr = 0;
3858         domain->nid = -1;
3859
3860         /* always allocate the top pgd */
3861         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3862         if (!domain->pgd)
3863                 return -ENOMEM;
3864         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3865         return 0;
3866 }
3867
3868 static void iommu_free_vm_domain(struct dmar_domain *domain)
3869 {
3870         unsigned long flags;
3871         struct dmar_drhd_unit *drhd;
3872         struct intel_iommu *iommu;
3873         unsigned long i;
3874         unsigned long ndomains;
3875
3876         for_each_drhd_unit(drhd) {
3877                 if (drhd->ignored)
3878                         continue;
3879                 iommu = drhd->iommu;
3880
3881                 ndomains = cap_ndoms(iommu->cap);
3882                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3883                         if (iommu->domains[i] == domain) {
3884                                 spin_lock_irqsave(&iommu->lock, flags);
3885                                 clear_bit(i, iommu->domain_ids);
3886                                 iommu->domains[i] = NULL;
3887                                 spin_unlock_irqrestore(&iommu->lock, flags);
3888                                 break;
3889                         }
3890                 }
3891         }
3892 }
3893
3894 static void vm_domain_exit(struct dmar_domain *domain)
3895 {
3896         /* Domain 0 is reserved, so don't process it */
3897         if (!domain)
3898                 return;
3899
3900         vm_domain_remove_all_dev_info(domain);
3901         /* destroy iovas */
3902         put_iova_domain(&domain->iovad);
3903
3904         /* clear ptes */
3905         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3906
3907         /* free page tables */
3908         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3909
3910         iommu_free_vm_domain(domain);
3911         free_domain_mem(domain);
3912 }
3913
3914 static int intel_iommu_domain_init(struct iommu_domain *domain)
3915 {
3916         struct dmar_domain *dmar_domain;
3917
3918         dmar_domain = iommu_alloc_vm_domain();
3919         if (!dmar_domain) {
3920                 printk(KERN_ERR
3921                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3922                 return -ENOMEM;
3923         }
3924         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3925                 printk(KERN_ERR
3926                         "intel_iommu_domain_init() failed\n");
3927                 vm_domain_exit(dmar_domain);
3928                 return -ENOMEM;
3929         }
3930         domain_update_iommu_cap(dmar_domain);
3931         domain->priv = dmar_domain;
3932
3933         return 0;
3934 }
3935
3936 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3937 {
3938         struct dmar_domain *dmar_domain = domain->priv;
3939
3940         domain->priv = NULL;
3941         vm_domain_exit(dmar_domain);
3942 }
3943
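/*
 * Attach a device to an IOMMU API domain.  A device that is already
 * context-mapped is first detached from its old domain.  The target
 * IOMMU's address width must cover the domain's current max_addr, and
 * if the domain's page table is deeper than this IOMMU supports, the
 * surplus top levels are stripped before the device info is added.
 */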
3944 static int intel_iommu_attach_device(struct iommu_domain *domain,
3945                                      struct device *dev)
3946 {
3947         struct dmar_domain *dmar_domain = domain->priv;
3948         struct pci_dev *pdev = to_pci_dev(dev);
3949         struct intel_iommu *iommu;
3950         int addr_width;
3951
3952         /* normally pdev is not mapped */
3953         if (unlikely(domain_context_mapped(pdev))) {
3954                 struct dmar_domain *old_domain;
3955
3956                 old_domain = find_domain(pdev);
3957                 if (old_domain) {
3958                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3959                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3960                                 domain_remove_one_dev_info(old_domain, pdev);
3961                         else
3962                                 domain_remove_dev_info(old_domain);
3963                 }
3964         }
3965
3966         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3967                                 pdev->devfn);
3968         if (!iommu)
3969                 return -ENODEV;
3970
3971         /* check if this iommu agaw is sufficient for max mapped address */
3972         addr_width = agaw_to_width(iommu->agaw);
3973         if (addr_width > cap_mgaw(iommu->cap))
3974                 addr_width = cap_mgaw(iommu->cap);
3975
3976         if (dmar_domain->max_addr > (1LL << addr_width)) {
3977                 printk(KERN_ERR "%s: iommu width (%d) is not "
3978                        "sufficient for the mapped address (%llx)\n",
3979                        __func__, addr_width, dmar_domain->max_addr);
3980                 return -EFAULT;
3981         }
3982         dmar_domain->gaw = addr_width;
3983
3984         /*
3985          * Knock out extra levels of page tables if necessary
3986          */
3987         while (iommu->agaw < dmar_domain->agaw) {
3988                 struct dma_pte *pte;
3989
3990                 pte = dmar_domain->pgd;
3991                 if (dma_pte_present(pte)) {
3992                         dmar_domain->pgd = (struct dma_pte *)
3993                                 phys_to_virt(dma_pte_addr(pte));
3994                         free_pgtable_page(pte);
3995                 }
3996                 dmar_domain->agaw--;
3997         }
3998
3999         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4000 }
4001
4002 static void intel_iommu_detach_device(struct iommu_domain *domain,
4003                                       struct device *dev)
4004 {
4005         struct dmar_domain *dmar_domain = domain->priv;
4006         struct pci_dev *pdev = to_pci_dev(dev);
4007
4008         domain_remove_one_dev_info(dmar_domain, pdev);
4009 }
4010
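/*
 * IOMMU API map callback: translate IOMMU_READ/WRITE/CACHE into
 * DMA_PTE_* bits (snooping only if the domain supports it), grow the
 * domain's max_addr after checking that it still fits within the
 * guest address width, and install the page-frame mapping.
 */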
4011 static int intel_iommu_map(struct iommu_domain *domain,
4012                            unsigned long iova, phys_addr_t hpa,
4013                            size_t size, int iommu_prot)
4014 {
4015         struct dmar_domain *dmar_domain = domain->priv;
4016         u64 max_addr;
4017         int prot = 0;
4018         int ret;
4019
4020         if (iommu_prot & IOMMU_READ)
4021                 prot |= DMA_PTE_READ;
4022         if (iommu_prot & IOMMU_WRITE)
4023                 prot |= DMA_PTE_WRITE;
4024         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4025                 prot |= DMA_PTE_SNP;
4026
4027         max_addr = iova + size;
4028         if (dmar_domain->max_addr < max_addr) {
4029                 u64 end;
4030
4031                 /* check if minimum agaw is sufficient for mapped address */
4032                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4033                 if (end < max_addr) {
4034                         printk(KERN_ERR "%s: iommu width (%d) is not "
4035                                "sufficient for the mapped address (%llx)\n",
4036                                __func__, dmar_domain->gaw, max_addr);
4037                         return -EFAULT;
4038                 }
4039                 dmar_domain->max_addr = max_addr;
4040         }
4041         /* Round up size to next multiple of PAGE_SIZE, if it and
4042            the low bits of hpa would take us onto the next page */
4043         size = aligned_nrpages(hpa, size);
4044         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4045                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4046         return ret;
4047 }
4048
4049 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4050                              unsigned long iova, size_t size)
4051 {
4052         struct dmar_domain *dmar_domain = domain->priv;
4053         int order;
4054
4055         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4056                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4057
4058         if (dmar_domain->max_addr == iova + size)
4059                 dmar_domain->max_addr = iova;
4060
4061         return PAGE_SIZE << order;
4062 }
4063
4064 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4065                                             unsigned long iova)
4066 {
4067         struct dmar_domain *dmar_domain = domain->priv;
4068         struct dma_pte *pte;
4069         u64 phys = 0;
4070
4071         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4072         if (pte)
4073                 phys = dma_pte_addr(pte);
4074
4075         return phys;
4076 }
4077
4078 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4079                                       unsigned long cap)
4080 {
4081         struct dmar_domain *dmar_domain = domain->priv;
4082
4083         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4084                 return dmar_domain->iommu_snooping;
4085         if (cap == IOMMU_CAP_INTR_REMAP)
4086                 return irq_remapping_enabled;
4087
4088         return 0;
4089 }
4090
4091 /*
4092  * Group numbers are arbitrary.  Devices with the same group number
4093  * indicate that the iommu cannot differentiate between them.  To avoid
4094  * tracking used groups we just use the seg|bus|devfn of the lowest
4095  * level at which we're able to differentiate devices.
4096  */
4097 static int intel_iommu_device_group(struct device *dev, unsigned int *groupid)
4098 {
4099         struct pci_dev *pdev = to_pci_dev(dev);
4100         struct pci_dev *bridge;
4101         union {
4102                 struct {
4103                         u8 devfn;
4104                         u8 bus;
4105                         u16 segment;
4106                 } pci;
4107                 u32 group;
4108         } id;
4109
4110         if (iommu_no_mapping(dev))
4111                 return -ENODEV;
4112
4113         id.pci.segment = pci_domain_nr(pdev->bus);
4114         id.pci.bus = pdev->bus->number;
4115         id.pci.devfn = pdev->devfn;
4116
4117         if (!device_to_iommu(id.pci.segment, id.pci.bus, id.pci.devfn))
4118                 return -ENODEV;
4119
4120         bridge = pci_find_upstream_pcie_bridge(pdev);
4121         if (bridge) {
4122                 if (pci_is_pcie(bridge)) {
4123                         id.pci.bus = bridge->subordinate->number;
4124                         id.pci.devfn = 0;
4125                 } else {
4126                         id.pci.bus = bridge->bus->number;
4127                         id.pci.devfn = bridge->devfn;
4128                 }
4129         }
4130
4131         if (!pdev->is_virtfn && iommu_group_mf)
4132                 id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0);
4133
4134         *groupid = id.group;
4135
4136         return 0;
4137 }
4138
4139 static struct iommu_ops intel_iommu_ops = {
4140         .domain_init    = intel_iommu_domain_init,
4141         .domain_destroy = intel_iommu_domain_destroy,
4142         .attach_dev     = intel_iommu_attach_device,
4143         .detach_dev     = intel_iommu_detach_device,
4144         .map            = intel_iommu_map,
4145         .unmap          = intel_iommu_unmap,
4146         .iova_to_phys   = intel_iommu_iova_to_phys,
4147         .domain_has_cap = intel_iommu_domain_has_cap,
4148         .device_group   = intel_iommu_device_group,
4149         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4150 };
4151
4152 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4153 {
4154         /*
4155          * Mobile 4 Series Chipset neglects to set RWBF capability,
4156          * but needs it:
4157          */
4158         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4159         rwbf_quirk = 1;
4160
4161         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4162         if (dev->revision == 0x07) {
4163                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4164                 dmar_map_gfx = 0;
4165         }
4166 }
4167
4168 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4169
4170 #define GGC 0x52
4171 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4172 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4173 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4174 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4175 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4176 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4177 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4178 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4179
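/*
 * Calpella/Ironlake graphics quirk: read the GGC register and, if the
 * BIOS allocated no VT-enabled GTT space, disable IOMMU translation
 * for graphics entirely; otherwise disable batched (deferred) IOTLB
 * flushing, since the graphics device has to be idle when its
 * mappings are flushed.
 */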
4180 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4181 {
4182         unsigned short ggc;
4183
4184         if (pci_read_config_word(dev, GGC, &ggc))
4185                 return;
4186
4187         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4188                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4189                 dmar_map_gfx = 0;
4190         } else if (dmar_map_gfx) {
4191                 /* we have to ensure the gfx device is idle before we flush */
4192                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4193                 intel_iommu_strict = 1;
4194        }
4195 }
4196 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4197 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4198 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4199 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4200
4201 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4202    ISOCH DMAR unit for the Azalia sound device, but not give it any
4203    TLB entries, which causes it to deadlock. Check for that.  We do
4204    this in a function called from init_dmars(), instead of in a PCI
4205    quirk, because we don't want to print the obnoxious "BIOS broken"
4206    message if VT-d is actually disabled.
4207 */
4208 static void __init check_tylersburg_isoch(void)
4209 {
4210         struct pci_dev *pdev;
4211         uint32_t vtisochctrl;
4212
4213         /* If there's no Azalia in the system anyway, forget it. */
4214         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4215         if (!pdev)
4216                 return;
4217         pci_dev_put(pdev);
4218
4219         /* System Management Registers. Might be hidden, in which case
4220            we can't do the sanity check. But that's OK, because the
4221            known-broken BIOSes _don't_ actually hide it, so far. */
4222         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4223         if (!pdev)
4224                 return;
4225
4226         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4227                 pci_dev_put(pdev);
4228                 return;
4229         }
4230
4231         pci_dev_put(pdev);
4232
4233         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4234         if (vtisochctrl & 1)
4235                 return;
4236
4237         /* Drop all bits other than the number of TLB entries */
4238         vtisochctrl &= 0x1c;
4239
4240         /* If we have the recommended number of TLB entries (16), fine. */
4241         if (vtisochctrl == 0x10)
4242                 return;
4243
4244         /* Zero TLB entries? You get to ride the short bus to school. */
4245         if (!vtisochctrl) {
4246                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4247                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4248                      dmi_get_system_info(DMI_BIOS_VENDOR),
4249                      dmi_get_system_info(DMI_BIOS_VERSION),
4250                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4251                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4252                 return;
4253         }
4254
4255         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4256                vtisochctrl);
4257 }