drivers/iommu/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 /*
82  * This bitmap is used to advertise the page sizes our hardware supports
83  * to the IOMMU core, which will then use this information to split
84  * physically contiguous memory regions it is mapping into page sizes
85  * that we support.
86  *
87  * Traditionally the IOMMU core just handed us the mappings directly,
88  * after making sure the size is an order of a 4KiB page and that the
89  * mapping has natural alignment.
90  *
91  * To retain this behavior, we currently advertise that we support
92  * all page sizes that are an order of 4KiB.
93  *
94  * If at some point we'd like to utilize the IOMMU core's new behavior,
95  * we could change this to advertise the real page sizes we support.
96  */
97 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
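/*
 * Illustrative example (not part of the driver): with the bitmap above,
 * every mapping size that is a power-of-two multiple of 4KiB tests as
 * supported, while anything smaller does not:
 *
 *	INTEL_IOMMU_PGSIZES & SZ_4K	-> non-zero (4KiB supported)
 *	INTEL_IOMMU_PGSIZES & SZ_2M	-> non-zero (2MiB supported)
 *	INTEL_IOMMU_PGSIZES & SZ_1G	-> non-zero (1GiB supported)
 *	INTEL_IOMMU_PGSIZES & 0x800	-> zero     (2KiB rejected)
 *
 * (SZ_* constants from <linux/sizes.h> are used here purely for
 * readability.)
 */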
98
99 static inline int agaw_to_level(int agaw)
100 {
101         return agaw + 2;
102 }
103
104 static inline int agaw_to_width(int agaw)
105 {
106         return 30 + agaw * LEVEL_STRIDE;
107 }
108
109 static inline int width_to_agaw(int width)
110 {
111         return (width - 30) / LEVEL_STRIDE;
112 }
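/*
 * Worked example (for illustration): an adjusted guest address width of
 * 48 bits gives width_to_agaw(48) == (48 - 30) / 9 == 2, and
 * agaw_to_level(2) == 4, i.e. a 4-level page table; agaw_to_width(2)
 * maps back to 30 + 2 * 9 == 48 bits.
 */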
113
114 static inline unsigned int level_to_offset_bits(int level)
115 {
116         return (level - 1) * LEVEL_STRIDE;
117 }
118
119 static inline int pfn_level_offset(unsigned long pfn, int level)
120 {
121         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
122 }
123
124 static inline unsigned long level_mask(int level)
125 {
126         return -1UL << level_to_offset_bits(level);
127 }
128
129 static inline unsigned long level_size(int level)
130 {
131         return 1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long align_to_level(unsigned long pfn, int level)
135 {
136         return (pfn + level_size(level) - 1) & level_mask(level);
137 }
138
139 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
140 {
141         return  1 << ((lvl - 1) * LEVEL_STRIDE);
142 }
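/*
 * Worked example (for illustration): with LEVEL_STRIDE == 9,
 * level_size(2) == 1 << 9 == 512 pages (2MiB of IOVA) and
 * level_size(3) == 1 << 18 pages (1GiB of IOVA);
 * align_to_level(0x201, 2) rounds pfn 0x201 up to 0x400, the next
 * level-2 boundary.
 */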
143
144 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
145    are never going to work. */
146 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
147 {
148         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
149 }
150
151 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
152 {
153         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155 static inline unsigned long page_to_dma_pfn(struct page *pg)
156 {
157         return mm_to_dma_pfn(page_to_pfn(pg));
158 }
159 static inline unsigned long virt_to_dma_pfn(void *p)
160 {
161         return page_to_dma_pfn(virt_to_page(p));
162 }
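/*
 * Illustrative note: on x86, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the
 * two conversions above are the identity. On a hypothetical 16KiB-page
 * configuration (PAGE_SHIFT == 14) one MM pfn would cover four VT-d
 * pfns, e.g. mm_to_dma_pfn(5) == 20.
 */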
163
164 /* global iommu list, set NULL for ignored DMAR units */
165 static struct intel_iommu **g_iommus;
166
167 static void __init check_tylersburg_isoch(void);
168 static int rwbf_quirk;
169
170 /*
171  * set to 1 to panic the kernel if VT-d can't be enabled successfully
172  * (used when the kernel is launched with TXT)
173  */
174 static int force_on = 0;
175
176 /*
177  * 0: Present
178  * 1-11: Reserved
179  * 12-63: Context Ptr (12 - (haw-1))
180  * 64-127: Reserved
181  */
182 struct root_entry {
183         u64     val;
184         u64     rsvd1;
185 };
186 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
187 static inline bool root_present(struct root_entry *root)
188 {
189         return (root->val & 1);
190 }
191 static inline void set_root_present(struct root_entry *root)
192 {
193         root->val |= 1;
194 }
195 static inline void set_root_value(struct root_entry *root, unsigned long value)
196 {
197         root->val |= value & VTD_PAGE_MASK;
198 }
199
200 static inline struct context_entry *
201 get_context_addr_from_root(struct root_entry *root)
202 {
203         return (struct context_entry *)
204                 (root_present(root)?phys_to_virt(
205                 root->val & VTD_PAGE_MASK) :
206                 NULL);
207 }
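/*
 * Illustrative summary: the root table holds one root_entry per PCI bus
 * (ROOT_ENTRY_NR == 4096 / 16 == 256), and each present entry points to
 * a 4KiB context table indexed by devfn, so a device's context entry is
 * reached as
 *
 *	get_context_addr_from_root(&iommu->root_entry[bus])[devfn]
 *
 * (see device_to_context_entry() below, which also allocates the
 * context table on demand).
 */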
208
209 /*
210  * low 64 bits:
211  * 0: present
212  * 1: fault processing disable
213  * 2-3: translation type
214  * 12-63: address space root
215  * high 64 bits:
216  * 0-2: address width
217  * 3-6: aval
218  * 8-23: domain id
219  */
220 struct context_entry {
221         u64 lo;
222         u64 hi;
223 };
224
225 static inline bool context_present(struct context_entry *context)
226 {
227         return (context->lo & 1);
228 }
229 static inline void context_set_present(struct context_entry *context)
230 {
231         context->lo |= 1;
232 }
233
234 static inline void context_set_fault_enable(struct context_entry *context)
235 {
236         context->lo &= (((u64)-1) << 2) | 1;
237 }
238
239 static inline void context_set_translation_type(struct context_entry *context,
240                                                 unsigned long value)
241 {
242         context->lo &= (((u64)-1) << 4) | 3;
243         context->lo |= (value & 3) << 2;
244 }
245
246 static inline void context_set_address_root(struct context_entry *context,
247                                             unsigned long value)
248 {
249         context->lo |= value & VTD_PAGE_MASK;
250 }
251
252 static inline void context_set_address_width(struct context_entry *context,
253                                              unsigned long value)
254 {
255         context->hi |= value & 7;
256 }
257
258 static inline void context_set_domain_id(struct context_entry *context,
259                                          unsigned long value)
260 {
261         context->hi |= (value & ((1 << 16) - 1)) << 8;
262 }
263
264 static inline void context_clear_entry(struct context_entry *context)
265 {
266         context->lo = 0;
267         context->hi = 0;
268 }
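/*
 * Illustrative example: domain_context_mapping_one() below builds a
 * context entry with these helpers roughly as follows:
 *
 *	context_set_domain_id(context, id);
 *	context_set_address_root(context, virt_to_phys(pgd));
 *	context_set_address_width(context, iommu->agaw);
 *	context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 */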
269
270 /*
271  * 0: readable
272  * 1: writable
273  * 2-6: reserved
274  * 7: super page
275  * 8-10: available
276  * 11: snoop behavior
277  * 12-63: Host physical address
278  */
279 struct dma_pte {
280         u64 val;
281 };
282
283 static inline void dma_clear_pte(struct dma_pte *pte)
284 {
285         pte->val = 0;
286 }
287
288 static inline void dma_set_pte_readable(struct dma_pte *pte)
289 {
290         pte->val |= DMA_PTE_READ;
291 }
292
293 static inline void dma_set_pte_writable(struct dma_pte *pte)
294 {
295         pte->val |= DMA_PTE_WRITE;
296 }
297
298 static inline void dma_set_pte_snp(struct dma_pte *pte)
299 {
300         pte->val |= DMA_PTE_SNP;
301 }
302
303 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
304 {
305         pte->val = (pte->val & ~3) | (prot & 3);
306 }
307
308 static inline u64 dma_pte_addr(struct dma_pte *pte)
309 {
310 #ifdef CONFIG_64BIT
311         return pte->val & VTD_PAGE_MASK;
312 #else
313         /* Must have a full atomic 64-bit read */
314         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
315 #endif
316 }
317
318 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
319 {
320         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
321 }
322
323 static inline bool dma_pte_present(struct dma_pte *pte)
324 {
325         return (pte->val & 3) != 0;
326 }
327
328 static inline bool dma_pte_superpage(struct dma_pte *pte)
329 {
330         return (pte->val & (1 << 7));
331 }
332
333 static inline int first_pte_in_page(struct dma_pte *pte)
334 {
335         return !((unsigned long)pte & ~VTD_PAGE_MASK);
336 }
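/*
 * Illustrative example: a read/write leaf PTE for host pfn 0x1234 would
 * be assembled with the helpers above as
 *
 *	struct dma_pte pte = { 0 };
 *
 *	dma_set_pte_readable(&pte);
 *	dma_set_pte_writable(&pte);
 *	dma_set_pte_pfn(&pte, 0x1234);
 *
 * leaving pte.val == 0x1234003, i.e. (0x1234 << VTD_PAGE_SHIFT) |
 * DMA_PTE_READ | DMA_PTE_WRITE.
 */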
337
338 /*
339  * This domain is a statically identity mapping domain.
340  *      1. This domain creates a static 1:1 mapping to all usable memory.
341  *      2. It maps to each iommu if successful.
342  *      3. Each iommu maps to this domain if successful.
343  */
344 static struct dmar_domain *si_domain;
345 static int hw_pass_through = 1;
346
347 /* devices under the same p2p bridge are owned in one domain */
348 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
349
350 /* domain represents a virtual machine; more than one device
351  * across iommus may be owned by one domain, e.g. a kvm guest.
352  */
353 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
354
355 /* si_domain contains multiple devices */
356 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
357
358 /* define the limit of IOMMUs supported in each domain */
359 #ifdef  CONFIG_X86
360 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
361 #else
362 # define        IOMMU_UNITS_SUPPORTED   64
363 #endif
364
365 struct dmar_domain {
366         int     id;                     /* domain id */
367         int     nid;                    /* node id */
368         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
369                                         /* bitmap of iommus this domain uses*/
370
371         struct list_head devices;       /* all devices' list */
372         struct iova_domain iovad;       /* iova's that belong to this domain */
373
374         struct dma_pte  *pgd;           /* virtual address */
375         int             gaw;            /* max guest address width */
376
377         /* adjusted guest address width, 0 is level 2 30-bit */
378         int             agaw;
379
380         int             flags;          /* flags to find out type of domain */
381
382         int             iommu_coherency;/* indicate coherency of iommu access */
383         int             iommu_snooping; /* indicate snooping control feature*/
384         int             iommu_count;    /* reference count of iommu */
385         int             iommu_superpage;/* Level of superpages supported:
386                                            0 == 4KiB (no superpages), 1 == 2MiB,
387                                            2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
388         spinlock_t      iommu_lock;     /* protect iommu set in domain */
389         u64             max_addr;       /* maximum mapped address */
390 };
391
392 /* PCI domain-device relationship */
393 struct device_domain_info {
394         struct list_head link;  /* link to domain siblings */
395         struct list_head global; /* link to global list */
396         int segment;            /* PCI domain */
397         u8 bus;                 /* PCI bus number */
398         u8 devfn;               /* PCI devfn number */
399         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
400         struct intel_iommu *iommu; /* IOMMU used by this device */
401         struct dmar_domain *domain; /* pointer to domain */
402 };
403
404 static void flush_unmaps_timeout(unsigned long data);
405
406 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
407
408 #define HIGH_WATER_MARK 250
409 struct deferred_flush_tables {
410         int next;
411         struct iova *iova[HIGH_WATER_MARK];
412         struct dmar_domain *domain[HIGH_WATER_MARK];
413 };
414
415 static struct deferred_flush_tables *deferred_flush;
416
417 /* bitmap for indexing intel_iommus */
418 static int g_num_of_iommus;
419
420 static DEFINE_SPINLOCK(async_umap_flush_lock);
421 static LIST_HEAD(unmaps_to_do);
422
423 static int timer_on;
424 static long list_size;
425
426 static void domain_remove_dev_info(struct dmar_domain *domain);
427
428 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
429 int dmar_disabled = 0;
430 #else
431 int dmar_disabled = 1;
432 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
433
434 int intel_iommu_enabled = 0;
435 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
436
437 static int dmar_map_gfx = 1;
438 static int dmar_forcedac;
439 static int intel_iommu_strict;
440 static int intel_iommu_superpage = 1;
441
442 int intel_iommu_gfx_mapped;
443 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
444
445 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
446 static DEFINE_SPINLOCK(device_domain_lock);
447 static LIST_HEAD(device_domain_list);
448
449 static struct iommu_ops intel_iommu_ops;
450
451 static int __init intel_iommu_setup(char *str)
452 {
453         if (!str)
454                 return -EINVAL;
455         while (*str) {
456                 if (!strncmp(str, "on", 2)) {
457                         dmar_disabled = 0;
458                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
459                 } else if (!strncmp(str, "off", 3)) {
460                         dmar_disabled = 1;
461                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
462                 } else if (!strncmp(str, "igfx_off", 8)) {
463                         dmar_map_gfx = 0;
464                         printk(KERN_INFO
465                                 "Intel-IOMMU: disable GFX device mapping\n");
466                 } else if (!strncmp(str, "forcedac", 8)) {
467                         printk(KERN_INFO
468                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
469                         dmar_forcedac = 1;
470                 } else if (!strncmp(str, "strict", 6)) {
471                         printk(KERN_INFO
472                                 "Intel-IOMMU: disable batched IOTLB flush\n");
473                         intel_iommu_strict = 1;
474                 } else if (!strncmp(str, "sp_off", 6)) {
475                         printk(KERN_INFO
476                                 "Intel-IOMMU: disable supported super page\n");
477                         intel_iommu_superpage = 0;
478                 }
479
480                 str += strcspn(str, ",");
481                 while (*str == ',')
482                         str++;
483         }
484         return 0;
485 }
486 __setup("intel_iommu=", intel_iommu_setup);
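/*
 * Example usage (kernel command line), matching the parser above;
 * options are comma-separated:
 *
 *	intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * super page support.
 */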
487
488 static struct kmem_cache *iommu_domain_cache;
489 static struct kmem_cache *iommu_devinfo_cache;
490 static struct kmem_cache *iommu_iova_cache;
491
492 static inline void *alloc_pgtable_page(int node)
493 {
494         struct page *page;
495         void *vaddr = NULL;
496
497         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
498         if (page)
499                 vaddr = page_address(page);
500         return vaddr;
501 }
502
503 static inline void free_pgtable_page(void *vaddr)
504 {
505         free_page((unsigned long)vaddr);
506 }
507
508 static inline void *alloc_domain_mem(void)
509 {
510         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
511 }
512
513 static void free_domain_mem(void *vaddr)
514 {
515         kmem_cache_free(iommu_domain_cache, vaddr);
516 }
517
518 static inline void * alloc_devinfo_mem(void)
519 {
520         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
521 }
522
523 static inline void free_devinfo_mem(void *vaddr)
524 {
525         kmem_cache_free(iommu_devinfo_cache, vaddr);
526 }
527
528 struct iova *alloc_iova_mem(void)
529 {
530         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
531 }
532
533 void free_iova_mem(struct iova *iova)
534 {
535         kmem_cache_free(iommu_iova_cache, iova);
536 }
537
538
539 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
540 {
541         unsigned long sagaw;
542         int agaw = -1;
543
544         sagaw = cap_sagaw(iommu->cap);
545         for (agaw = width_to_agaw(max_gaw);
546              agaw >= 0; agaw--) {
547                 if (test_bit(agaw, &sagaw))
548                         break;
549         }
550
551         return agaw;
552 }
553
554 /*
555  * Calculate max SAGAW for each iommu.
556  */
557 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
558 {
559         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
560 }
561
562 /*
563  * calculate agaw for each iommu.
564  * "SAGAW" may be different across iommus, use a default agaw, and
565  * fall back to a smaller supported agaw for iommus that don't support the default.
566  */
567 int iommu_calculate_agaw(struct intel_iommu *iommu)
568 {
569         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
570 }
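/*
 * Worked example (for illustration): with
 * DEFAULT_DOMAIN_ADDRESS_WIDTH == 48 the search starts at
 * agaw == width_to_agaw(48) == 2; if bit 2 of cap_sagaw() is set the
 * iommu supports a 4-level (48-bit) table and 2 is returned, otherwise
 * the loop falls back to agaw 1 (39-bit, 3-level) and so on, or -1 if
 * nothing matches.
 */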
571
572 /* This function only returns the single iommu in a domain */
573 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
574 {
575         int iommu_id;
576
577         /* si_domain and vm domain should not get here. */
578         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
579         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
580
581         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
582         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
583                 return NULL;
584
585         return g_iommus[iommu_id];
586 }
587
588 static void domain_update_iommu_coherency(struct dmar_domain *domain)
589 {
590         int i;
591
592         domain->iommu_coherency = 1;
593
594         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
595                 if (!ecap_coherent(g_iommus[i]->ecap)) {
596                         domain->iommu_coherency = 0;
597                         break;
598                 }
599         }
600 }
601
602 static void domain_update_iommu_snooping(struct dmar_domain *domain)
603 {
604         int i;
605
606         domain->iommu_snooping = 1;
607
608         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
609                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
610                         domain->iommu_snooping = 0;
611                         break;
612                 }
613         }
614 }
615
616 static void domain_update_iommu_superpage(struct dmar_domain *domain)
617 {
618         struct dmar_drhd_unit *drhd;
619         struct intel_iommu *iommu = NULL;
620         int mask = 0xf;
621
622         if (!intel_iommu_superpage) {
623                 domain->iommu_superpage = 0;
624                 return;
625         }
626
627         /* set iommu_superpage to the largest level supported by all iommus */
628         for_each_active_iommu(iommu, drhd) {
629                 mask &= cap_super_page_val(iommu->cap);
630                 if (!mask) {
631                         break;
632                 }
633         }
634         domain->iommu_superpage = fls(mask);
635 }
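/*
 * Illustrative example: if every active iommu reports only 2MiB
 * superpage support, mask ends up as 0x1 and fls(0x1) == 1 (2MiB); if
 * any iommu reports no superpage support at all, mask becomes 0 and
 * fls(0) == 0, i.e. plain 4KiB pages only.
 */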
636
637 /* Some capabilities may be different across iommus */
638 static void domain_update_iommu_cap(struct dmar_domain *domain)
639 {
640         domain_update_iommu_coherency(domain);
641         domain_update_iommu_snooping(domain);
642         domain_update_iommu_superpage(domain);
643 }
644
645 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
646 {
647         struct dmar_drhd_unit *drhd = NULL;
648         int i;
649
650         for_each_drhd_unit(drhd) {
651                 if (drhd->ignored)
652                         continue;
653                 if (segment != drhd->segment)
654                         continue;
655
656                 for (i = 0; i < drhd->devices_cnt; i++) {
657                         if (drhd->devices[i] &&
658                             drhd->devices[i]->bus->number == bus &&
659                             drhd->devices[i]->devfn == devfn)
660                                 return drhd->iommu;
661                         if (drhd->devices[i] &&
662                             drhd->devices[i]->subordinate &&
663                             drhd->devices[i]->subordinate->number <= bus &&
664                             drhd->devices[i]->subordinate->subordinate >= bus)
665                                 return drhd->iommu;
666                 }
667
668                 if (drhd->include_all)
669                         return drhd->iommu;
670         }
671
672         return NULL;
673 }
674
675 static void domain_flush_cache(struct dmar_domain *domain,
676                                void *addr, int size)
677 {
678         if (!domain->iommu_coherency)
679                 clflush_cache_range(addr, size);
680 }
681
682 /* Gets context entry for a given bus and devfn */
683 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
684                 u8 bus, u8 devfn)
685 {
686         struct root_entry *root;
687         struct context_entry *context;
688         unsigned long phy_addr;
689         unsigned long flags;
690
691         spin_lock_irqsave(&iommu->lock, flags);
692         root = &iommu->root_entry[bus];
693         context = get_context_addr_from_root(root);
694         if (!context) {
695                 context = (struct context_entry *)
696                                 alloc_pgtable_page(iommu->node);
697                 if (!context) {
698                         spin_unlock_irqrestore(&iommu->lock, flags);
699                         return NULL;
700                 }
701                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
702                 phy_addr = virt_to_phys((void *)context);
703                 set_root_value(root, phy_addr);
704                 set_root_present(root);
705                 __iommu_flush_cache(iommu, root, sizeof(*root));
706         }
707         spin_unlock_irqrestore(&iommu->lock, flags);
708         return &context[devfn];
709 }
710
711 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
712 {
713         struct root_entry *root;
714         struct context_entry *context;
715         int ret;
716         unsigned long flags;
717
718         spin_lock_irqsave(&iommu->lock, flags);
719         root = &iommu->root_entry[bus];
720         context = get_context_addr_from_root(root);
721         if (!context) {
722                 ret = 0;
723                 goto out;
724         }
725         ret = context_present(&context[devfn]);
726 out:
727         spin_unlock_irqrestore(&iommu->lock, flags);
728         return ret;
729 }
730
731 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
732 {
733         struct root_entry *root;
734         struct context_entry *context;
735         unsigned long flags;
736
737         spin_lock_irqsave(&iommu->lock, flags);
738         root = &iommu->root_entry[bus];
739         context = get_context_addr_from_root(root);
740         if (context) {
741                 context_clear_entry(&context[devfn]);
742                 __iommu_flush_cache(iommu, &context[devfn], \
743                         sizeof(*context));
744         }
745         spin_unlock_irqrestore(&iommu->lock, flags);
746 }
747
748 static void free_context_table(struct intel_iommu *iommu)
749 {
750         struct root_entry *root;
751         int i;
752         unsigned long flags;
753         struct context_entry *context;
754
755         spin_lock_irqsave(&iommu->lock, flags);
756         if (!iommu->root_entry) {
757                 goto out;
758         }
759         for (i = 0; i < ROOT_ENTRY_NR; i++) {
760                 root = &iommu->root_entry[i];
761                 context = get_context_addr_from_root(root);
762                 if (context)
763                         free_pgtable_page(context);
764         }
765         free_pgtable_page(iommu->root_entry);
766         iommu->root_entry = NULL;
767 out:
768         spin_unlock_irqrestore(&iommu->lock, flags);
769 }
770
771 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
772                                       unsigned long pfn, int target_level)
773 {
774         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
775         struct dma_pte *parent, *pte = NULL;
776         int level = agaw_to_level(domain->agaw);
777         int offset;
778
779         BUG_ON(!domain->pgd);
780         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
781         parent = domain->pgd;
782
783         while (level > 0) {
784                 void *tmp_page;
785
786                 offset = pfn_level_offset(pfn, level);
787                 pte = &parent[offset];
788                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
789                         break;
790                 if (level == target_level)
791                         break;
792
793                 if (!dma_pte_present(pte)) {
794                         uint64_t pteval;
795
796                         tmp_page = alloc_pgtable_page(domain->nid);
797
798                         if (!tmp_page)
799                                 return NULL;
800
801                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
802                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
803                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
804                                 /* Someone else set it while we were thinking; use theirs. */
805                                 free_pgtable_page(tmp_page);
806                         } else {
807                                 dma_pte_addr(pte);
808                                 domain_flush_cache(domain, pte, sizeof(*pte));
809                         }
810                 }
811                 parent = phys_to_virt(dma_pte_addr(pte));
812                 level--;
813         }
814
815         return pte;
816 }
817
818
819 /* return address's pte at specific level */
820 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
821                                          unsigned long pfn,
822                                          int level, int *large_page)
823 {
824         struct dma_pte *parent, *pte = NULL;
825         int total = agaw_to_level(domain->agaw);
826         int offset;
827
828         parent = domain->pgd;
829         while (level <= total) {
830                 offset = pfn_level_offset(pfn, total);
831                 pte = &parent[offset];
832                 if (level == total)
833                         return pte;
834
835                 if (!dma_pte_present(pte)) {
836                         *large_page = total;
837                         break;
838                 }
839
840                 if (pte->val & DMA_PTE_LARGE_PAGE) {
841                         *large_page = total;
842                         return pte;
843                 }
844
845                 parent = phys_to_virt(dma_pte_addr(pte));
846                 total--;
847         }
848         return NULL;
849 }
850
851 /* clear last level pte; a tlb flush should follow */
852 static int dma_pte_clear_range(struct dmar_domain *domain,
853                                 unsigned long start_pfn,
854                                 unsigned long last_pfn)
855 {
856         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
857         unsigned int large_page = 1;
858         struct dma_pte *first_pte, *pte;
859         int order;
860
861         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
862         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
863         BUG_ON(start_pfn > last_pfn);
864
865         /* we don't need lock here; nobody else touches the iova range */
866         do {
867                 large_page = 1;
868                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
869                 if (!pte) {
870                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
871                         continue;
872                 }
873                 do {
874                         dma_clear_pte(pte);
875                         start_pfn += lvl_to_nr_pages(large_page);
876                         pte++;
877                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
878
879                 domain_flush_cache(domain, first_pte,
880                                    (void *)pte - (void *)first_pte);
881
882         } while (start_pfn && start_pfn <= last_pfn);
883
884         order = (large_page - 1) * 9;
885         return order;
886 }
887
888 /* free page table pages. last level pte should already be cleared */
889 static void dma_pte_free_pagetable(struct dmar_domain *domain,
890                                    unsigned long start_pfn,
891                                    unsigned long last_pfn)
892 {
893         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
894         struct dma_pte *first_pte, *pte;
895         int total = agaw_to_level(domain->agaw);
896         int level;
897         unsigned long tmp;
898         int large_page = 2;
899
900         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
901         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
902         BUG_ON(start_pfn > last_pfn);
903
904         /* We don't need lock here; nobody else touches the iova range */
905         level = 2;
906         while (level <= total) {
907                 tmp = align_to_level(start_pfn, level);
908
909                 /* If we can't even clear one PTE at this level, we're done */
910                 if (tmp + level_size(level) - 1 > last_pfn)
911                         return;
912
913                 do {
914                         large_page = level;
915                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
916                         if (large_page > level)
917                                 level = large_page + 1;
918                         if (!pte) {
919                                 tmp = align_to_level(tmp + 1, level + 1);
920                                 continue;
921                         }
922                         do {
923                                 if (dma_pte_present(pte)) {
924                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
925                                         dma_clear_pte(pte);
926                                 }
927                                 pte++;
928                                 tmp += level_size(level);
929                         } while (!first_pte_in_page(pte) &&
930                                  tmp + level_size(level) - 1 <= last_pfn);
931
932                         domain_flush_cache(domain, first_pte,
933                                            (void *)pte - (void *)first_pte);
934                         
935                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
936                 level++;
937         }
938         /* free pgd */
939         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
940                 free_pgtable_page(domain->pgd);
941                 domain->pgd = NULL;
942         }
943 }
944
945 /* iommu handling */
946 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
947 {
948         struct root_entry *root;
949         unsigned long flags;
950
951         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
952         if (!root)
953                 return -ENOMEM;
954
955         __iommu_flush_cache(iommu, root, ROOT_SIZE);
956
957         spin_lock_irqsave(&iommu->lock, flags);
958         iommu->root_entry = root;
959         spin_unlock_irqrestore(&iommu->lock, flags);
960
961         return 0;
962 }
963
964 static void iommu_set_root_entry(struct intel_iommu *iommu)
965 {
966         void *addr;
967         u32 sts;
968         unsigned long flag;
969
970         addr = iommu->root_entry;
971
972         raw_spin_lock_irqsave(&iommu->register_lock, flag);
973         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
974
975         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
976
977         /* Make sure hardware completes it */
978         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
979                       readl, (sts & DMA_GSTS_RTPS), sts);
980
981         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
982 }
983
984 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
985 {
986         u32 val;
987         unsigned long flag;
988
989         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
990                 return;
991
992         raw_spin_lock_irqsave(&iommu->register_lock, flag);
993         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
994
995         /* Make sure hardware completes it */
996         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
997                       readl, (!(val & DMA_GSTS_WBFS)), val);
998
999         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1000 }
1001
1002 /* return value determines if we need a write buffer flush */
1003 static void __iommu_flush_context(struct intel_iommu *iommu,
1004                                   u16 did, u16 source_id, u8 function_mask,
1005                                   u64 type)
1006 {
1007         u64 val = 0;
1008         unsigned long flag;
1009
1010         switch (type) {
1011         case DMA_CCMD_GLOBAL_INVL:
1012                 val = DMA_CCMD_GLOBAL_INVL;
1013                 break;
1014         case DMA_CCMD_DOMAIN_INVL:
1015                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1016                 break;
1017         case DMA_CCMD_DEVICE_INVL:
1018                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1019                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1020                 break;
1021         default:
1022                 BUG();
1023         }
1024         val |= DMA_CCMD_ICC;
1025
1026         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1027         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1028
1029         /* Make sure hardware completes it */
1030         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1031                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1032
1033         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1034 }
1035
1036 /* return value determines if we need a write buffer flush */
1037 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1038                                 u64 addr, unsigned int size_order, u64 type)
1039 {
1040         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1041         u64 val = 0, val_iva = 0;
1042         unsigned long flag;
1043
1044         switch (type) {
1045         case DMA_TLB_GLOBAL_FLUSH:
1046                 /* global flush doesn't need to set IVA_REG */
1047                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1048                 break;
1049         case DMA_TLB_DSI_FLUSH:
1050                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1051                 break;
1052         case DMA_TLB_PSI_FLUSH:
1053                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1054                 /* Note: always flush non-leaf currently */
1055                 val_iva = size_order | addr;
1056                 break;
1057         default:
1058                 BUG();
1059         }
1060         /* Note: set drain read/write */
1061 #if 0
1062         /*
1063          * This is probably just to be extra safe; it looks like we can
1064          * ignore it without any impact.
1065          */
1066         if (cap_read_drain(iommu->cap))
1067                 val |= DMA_TLB_READ_DRAIN;
1068 #endif
1069         if (cap_write_drain(iommu->cap))
1070                 val |= DMA_TLB_WRITE_DRAIN;
1071
1072         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1073         /* Note: Only uses first TLB reg currently */
1074         if (val_iva)
1075                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1076         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1077
1078         /* Make sure hardware completes it */
1079         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1080                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1081
1082         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1083
1084         /* check IOTLB invalidation granularity */
1085         if (DMA_TLB_IAIG(val) == 0)
1086                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1087         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1088                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1089                         (unsigned long long)DMA_TLB_IIRG(type),
1090                         (unsigned long long)DMA_TLB_IAIG(val));
1091 }
1092
1093 static struct device_domain_info *iommu_support_dev_iotlb(
1094         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1095 {
1096         int found = 0;
1097         unsigned long flags;
1098         struct device_domain_info *info;
1099         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1100
1101         if (!ecap_dev_iotlb_support(iommu->ecap))
1102                 return NULL;
1103
1104         if (!iommu->qi)
1105                 return NULL;
1106
1107         spin_lock_irqsave(&device_domain_lock, flags);
1108         list_for_each_entry(info, &domain->devices, link)
1109                 if (info->bus == bus && info->devfn == devfn) {
1110                         found = 1;
1111                         break;
1112                 }
1113         spin_unlock_irqrestore(&device_domain_lock, flags);
1114
1115         if (!found || !info->dev)
1116                 return NULL;
1117
1118         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1119                 return NULL;
1120
1121         if (!dmar_find_matched_atsr_unit(info->dev))
1122                 return NULL;
1123
1124         info->iommu = iommu;
1125
1126         return info;
1127 }
1128
1129 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1130 {
1131         if (!info)
1132                 return;
1133
1134         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1135 }
1136
1137 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1138 {
1139         if (!info->dev || !pci_ats_enabled(info->dev))
1140                 return;
1141
1142         pci_disable_ats(info->dev);
1143 }
1144
1145 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1146                                   u64 addr, unsigned mask)
1147 {
1148         u16 sid, qdep;
1149         unsigned long flags;
1150         struct device_domain_info *info;
1151
1152         spin_lock_irqsave(&device_domain_lock, flags);
1153         list_for_each_entry(info, &domain->devices, link) {
1154                 if (!info->dev || !pci_ats_enabled(info->dev))
1155                         continue;
1156
1157                 sid = info->bus << 8 | info->devfn;
1158                 qdep = pci_ats_queue_depth(info->dev);
1159                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1160         }
1161         spin_unlock_irqrestore(&device_domain_lock, flags);
1162 }
1163
1164 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1165                                   unsigned long pfn, unsigned int pages, int map)
1166 {
1167         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1168         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1169
1170         BUG_ON(pages == 0);
1171
1172         /*
1173          * Fallback to domain selective flush if no PSI support or the size is
1174          * too big.
1175          * PSI requires page size to be 2 ^ x, and the base address is naturally
1176          * aligned to the size
1177          */
1178         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1179                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1180                                                 DMA_TLB_DSI_FLUSH);
1181         else
1182                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1183                                                 DMA_TLB_PSI_FLUSH);
1184
1185         /*
1186          * In caching mode, changes of pages from non-present to present require
1187          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1188          */
1189         if (!cap_caching_mode(iommu->cap) || !map)
1190                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1191 }
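/*
 * Worked example (for illustration): flushing pages == 300 gives
 * mask == ilog2(__roundup_pow_of_two(300)) == ilog2(512) == 9, i.e. a
 * 512-page (2MiB-aligned) PSI invalidation; if the iommu's
 * cap_max_amask_val() is smaller than that, the code above falls back
 * to a domain-selective flush instead.
 */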
1192
1193 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1194 {
1195         u32 pmen;
1196         unsigned long flags;
1197
1198         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1199         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1200         pmen &= ~DMA_PMEN_EPM;
1201         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1202
1203         /* wait for the protected region status bit to clear */
1204         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1205                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1206
1207         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1208 }
1209
1210 static int iommu_enable_translation(struct intel_iommu *iommu)
1211 {
1212         u32 sts;
1213         unsigned long flags;
1214
1215         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1216         iommu->gcmd |= DMA_GCMD_TE;
1217         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1218
1219         /* Make sure hardware completes it */
1220         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1221                       readl, (sts & DMA_GSTS_TES), sts);
1222
1223         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1224         return 0;
1225 }
1226
1227 static int iommu_disable_translation(struct intel_iommu *iommu)
1228 {
1229         u32 sts;
1230         unsigned long flag;
1231
1232         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1233         iommu->gcmd &= ~DMA_GCMD_TE;
1234         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1235
1236         /* Make sure hardware completes it */
1237         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1238                       readl, (!(sts & DMA_GSTS_TES)), sts);
1239
1240         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1241         return 0;
1242 }
1243
1244
1245 static int iommu_init_domains(struct intel_iommu *iommu)
1246 {
1247         unsigned long ndomains;
1248         unsigned long nlongs;
1249
1250         ndomains = cap_ndoms(iommu->cap);
1251         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1252                         ndomains);
1253         nlongs = BITS_TO_LONGS(ndomains);
1254
1255         spin_lock_init(&iommu->lock);
1256
1257         /* TBD: there might be 64K domains,
1258          * consider a different allocation scheme for future chips
1259          */
1260         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1261         if (!iommu->domain_ids) {
1262                 printk(KERN_ERR "Allocating domain id array failed\n");
1263                 return -ENOMEM;
1264         }
1265         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1266                         GFP_KERNEL);
1267         if (!iommu->domains) {
1268                 printk(KERN_ERR "Allocating domain array failed\n");
1269                 return -ENOMEM;
1270         }
1271
1272         /*
1273          * if Caching mode is set, then invalid translations are tagged
1274          * with domain id 0. Hence we need to pre-allocate it.
1275          */
1276         if (cap_caching_mode(iommu->cap))
1277                 set_bit(0, iommu->domain_ids);
1278         return 0;
1279 }
1280
1281
1282 static void domain_exit(struct dmar_domain *domain);
1283 static void vm_domain_exit(struct dmar_domain *domain);
1284
1285 void free_dmar_iommu(struct intel_iommu *iommu)
1286 {
1287         struct dmar_domain *domain;
1288         int i;
1289         unsigned long flags;
1290
1291         if ((iommu->domains) && (iommu->domain_ids)) {
1292                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1293                         domain = iommu->domains[i];
1294                         clear_bit(i, iommu->domain_ids);
1295
1296                         spin_lock_irqsave(&domain->iommu_lock, flags);
1297                         if (--domain->iommu_count == 0) {
1298                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1299                                         vm_domain_exit(domain);
1300                                 else
1301                                         domain_exit(domain);
1302                         }
1303                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1304                 }
1305         }
1306
1307         if (iommu->gcmd & DMA_GCMD_TE)
1308                 iommu_disable_translation(iommu);
1309
1310         if (iommu->irq) {
1311                 irq_set_handler_data(iommu->irq, NULL);
1312                 /* This will mask the irq */
1313                 free_irq(iommu->irq, iommu);
1314                 destroy_irq(iommu->irq);
1315         }
1316
1317         kfree(iommu->domains);
1318         kfree(iommu->domain_ids);
1319
1320         g_iommus[iommu->seq_id] = NULL;
1321
1322         /* if all iommus are freed, free g_iommus */
1323         for (i = 0; i < g_num_of_iommus; i++) {
1324                 if (g_iommus[i])
1325                         break;
1326         }
1327
1328         if (i == g_num_of_iommus)
1329                 kfree(g_iommus);
1330
1331         /* free context mapping */
1332         free_context_table(iommu);
1333 }
1334
1335 static struct dmar_domain *alloc_domain(void)
1336 {
1337         struct dmar_domain *domain;
1338
1339         domain = alloc_domain_mem();
1340         if (!domain)
1341                 return NULL;
1342
1343         domain->nid = -1;
1344         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1345         domain->flags = 0;
1346
1347         return domain;
1348 }
1349
1350 static int iommu_attach_domain(struct dmar_domain *domain,
1351                                struct intel_iommu *iommu)
1352 {
1353         int num;
1354         unsigned long ndomains;
1355         unsigned long flags;
1356
1357         ndomains = cap_ndoms(iommu->cap);
1358
1359         spin_lock_irqsave(&iommu->lock, flags);
1360
1361         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1362         if (num >= ndomains) {
1363                 spin_unlock_irqrestore(&iommu->lock, flags);
1364                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1365                 return -ENOMEM;
1366         }
1367
1368         domain->id = num;
1369         set_bit(num, iommu->domain_ids);
1370         set_bit(iommu->seq_id, domain->iommu_bmp);
1371         iommu->domains[num] = domain;
1372         spin_unlock_irqrestore(&iommu->lock, flags);
1373
1374         return 0;
1375 }
1376
1377 static void iommu_detach_domain(struct dmar_domain *domain,
1378                                 struct intel_iommu *iommu)
1379 {
1380         unsigned long flags;
1381         int num, ndomains;
1382         int found = 0;
1383
1384         spin_lock_irqsave(&iommu->lock, flags);
1385         ndomains = cap_ndoms(iommu->cap);
1386         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1387                 if (iommu->domains[num] == domain) {
1388                         found = 1;
1389                         break;
1390                 }
1391         }
1392
1393         if (found) {
1394                 clear_bit(num, iommu->domain_ids);
1395                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1396                 iommu->domains[num] = NULL;
1397         }
1398         spin_unlock_irqrestore(&iommu->lock, flags);
1399 }
1400
1401 static struct iova_domain reserved_iova_list;
1402 static struct lock_class_key reserved_rbtree_key;
1403
1404 static int dmar_init_reserved_ranges(void)
1405 {
1406         struct pci_dev *pdev = NULL;
1407         struct iova *iova;
1408         int i;
1409
1410         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1411
1412         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1413                 &reserved_rbtree_key);
1414
1415         /* IOAPIC ranges shouldn't be accessed by DMA */
1416         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1417                 IOVA_PFN(IOAPIC_RANGE_END));
1418         if (!iova) {
1419                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1420                 return -ENODEV;
1421         }
1422
1423         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1424         for_each_pci_dev(pdev) {
1425                 struct resource *r;
1426
1427                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1428                         r = &pdev->resource[i];
1429                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1430                                 continue;
1431                         iova = reserve_iova(&reserved_iova_list,
1432                                             IOVA_PFN(r->start),
1433                                             IOVA_PFN(r->end));
1434                         if (!iova) {
1435                                 printk(KERN_ERR "Reserve iova failed\n");
1436                                 return -ENODEV;
1437                         }
1438                 }
1439         }
1440         return 0;
1441 }
1442
1443 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1444 {
1445         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1446 }
1447
1448 static inline int guestwidth_to_adjustwidth(int gaw)
1449 {
1450         int agaw;
1451         int r = (gaw - 12) % 9;
1452
1453         if (r == 0)
1454                 agaw = gaw;
1455         else
1456                 agaw = gaw + 9 - r;
1457         if (agaw > 64)
1458                 agaw = 64;
1459         return agaw;
1460 }
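/*
 * Worked example (for illustration): guestwidth_to_adjustwidth(48)
 * gives r == (48 - 12) % 9 == 0, so agaw == 48;
 * guestwidth_to_adjustwidth(40) gives r == 1, so agaw == 40 + 9 - 1
 * == 48, the next width the page-table format can actually represent.
 */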
1461
1462 static int domain_init(struct dmar_domain *domain, int guest_width)
1463 {
1464         struct intel_iommu *iommu;
1465         int adjust_width, agaw;
1466         unsigned long sagaw;
1467
1468         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1469         spin_lock_init(&domain->iommu_lock);
1470
1471         domain_reserve_special_ranges(domain);
1472
1473         /* calculate AGAW */
1474         iommu = domain_get_iommu(domain);
1475         if (guest_width > cap_mgaw(iommu->cap))
1476                 guest_width = cap_mgaw(iommu->cap);
1477         domain->gaw = guest_width;
1478         adjust_width = guestwidth_to_adjustwidth(guest_width);
1479         agaw = width_to_agaw(adjust_width);
1480         sagaw = cap_sagaw(iommu->cap);
1481         if (!test_bit(agaw, &sagaw)) {
1482                 /* hardware doesn't support it, choose a bigger one */
1483                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1484                 agaw = find_next_bit(&sagaw, 5, agaw);
1485                 if (agaw >= 5)
1486                         return -ENODEV;
1487         }
1488         domain->agaw = agaw;
1489         INIT_LIST_HEAD(&domain->devices);
1490
1491         if (ecap_coherent(iommu->ecap))
1492                 domain->iommu_coherency = 1;
1493         else
1494                 domain->iommu_coherency = 0;
1495
1496         if (ecap_sc_support(iommu->ecap))
1497                 domain->iommu_snooping = 1;
1498         else
1499                 domain->iommu_snooping = 0;
1500
1501         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1502         domain->iommu_count = 1;
1503         domain->nid = iommu->node;
1504
1505         /* always allocate the top pgd */
1506         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1507         if (!domain->pgd)
1508                 return -ENOMEM;
1509         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1510         return 0;
1511 }
1512
1513 static void domain_exit(struct dmar_domain *domain)
1514 {
1515         struct dmar_drhd_unit *drhd;
1516         struct intel_iommu *iommu;
1517
1518         /* Domain 0 is reserved, so don't process it */
1519         if (!domain)
1520                 return;
1521
1522         /* Flush any lazy unmaps that may reference this domain */
1523         if (!intel_iommu_strict)
1524                 flush_unmaps_timeout(0);
1525
1526         domain_remove_dev_info(domain);
1527         /* destroy iovas */
1528         put_iova_domain(&domain->iovad);
1529
1530         /* clear ptes */
1531         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1532
1533         /* free page tables */
1534         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1535
1536         for_each_active_iommu(iommu, drhd)
1537                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1538                         iommu_detach_domain(domain, iommu);
1539
1540         free_domain_mem(domain);
1541 }
1542
1543 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1544                                  u8 bus, u8 devfn, int translation)
1545 {
1546         struct context_entry *context;
1547         unsigned long flags;
1548         struct intel_iommu *iommu;
1549         struct dma_pte *pgd;
1550         unsigned long num;
1551         unsigned long ndomains;
1552         int id;
1553         int agaw;
1554         struct device_domain_info *info = NULL;
1555
1556         pr_debug("Set context mapping for %02x:%02x.%d\n",
1557                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1558
1559         BUG_ON(!domain->pgd);
1560         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1561                translation != CONTEXT_TT_MULTI_LEVEL);
1562
1563         iommu = device_to_iommu(segment, bus, devfn);
1564         if (!iommu)
1565                 return -ENODEV;
1566
1567         context = device_to_context_entry(iommu, bus, devfn);
1568         if (!context)
1569                 return -ENOMEM;
1570         spin_lock_irqsave(&iommu->lock, flags);
1571         if (context_present(context)) {
1572                 spin_unlock_irqrestore(&iommu->lock, flags);
1573                 return 0;
1574         }
1575
1576         id = domain->id;
1577         pgd = domain->pgd;
1578
1579         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1580             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1581                 int found = 0;
1582
1583                 /* find an available domain id for this device in iommu */
1584                 ndomains = cap_ndoms(iommu->cap);
1585                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1586                         if (iommu->domains[num] == domain) {
1587                                 id = num;
1588                                 found = 1;
1589                                 break;
1590                         }
1591                 }
1592
1593                 if (found == 0) {
1594                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1595                         if (num >= ndomains) {
1596                                 spin_unlock_irqrestore(&iommu->lock, flags);
1597                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1598                                 return -EFAULT;
1599                         }
1600
1601                         set_bit(num, iommu->domain_ids);
1602                         iommu->domains[num] = domain;
1603                         id = num;
1604                 }
1605
1606                 /* Skip top levels of page tables for an
1607                  * iommu whose agaw is smaller than the domain's.
1608                  * Unnecessary for PT mode.
1609                  */
1610                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1611                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1612                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1613                                 if (!dma_pte_present(pgd)) {
1614                                         spin_unlock_irqrestore(&iommu->lock, flags);
1615                                         return -ENOMEM;
1616                                 }
1617                         }
1618                 }
1619         }
1620
1621         context_set_domain_id(context, id);
1622
1623         if (translation != CONTEXT_TT_PASS_THROUGH) {
1624                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1625                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1626                                      CONTEXT_TT_MULTI_LEVEL;
1627         }
1628         /*
1629          * In pass through mode, AW must be programmed to indicate the largest
1630          * AGAW value supported by hardware. And ASR is ignored by hardware.
1631          */
1632         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1633                 context_set_address_width(context, iommu->msagaw);
1634         else {
1635                 context_set_address_root(context, virt_to_phys(pgd));
1636                 context_set_address_width(context, iommu->agaw);
1637         }
1638
1639         context_set_translation_type(context, translation);
1640         context_set_fault_enable(context);
1641         context_set_present(context);
1642         domain_flush_cache(domain, context, sizeof(*context));
1643
1644         /*
1645          * It's a non-present to present mapping. If hardware doesn't cache
1646          * non-present entries we only need to flush the write-buffer. If it
1647          * _does_ cache non-present entries, then it does so in the special
1648          * domain #0, which we have to flush:
1649          */
1650         if (cap_caching_mode(iommu->cap)) {
1651                 iommu->flush.flush_context(iommu, 0,
1652                                            (((u16)bus) << 8) | devfn,
1653                                            DMA_CCMD_MASK_NOBIT,
1654                                            DMA_CCMD_DEVICE_INVL);
1655                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1656         } else {
1657                 iommu_flush_write_buffer(iommu);
1658         }
1659         iommu_enable_dev_iotlb(info);
1660         spin_unlock_irqrestore(&iommu->lock, flags);
1661
1662         spin_lock_irqsave(&domain->iommu_lock, flags);
1663         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1664                 domain->iommu_count++;
1665                 if (domain->iommu_count == 1)
1666                         domain->nid = iommu->node;
1667                 domain_update_iommu_cap(domain);
1668         }
1669         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1670         return 0;
1671 }
1672
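/*
 * Set up context entries for @pdev and, if it sits behind a PCIe-to-PCI
 * (or legacy PCI) bridge, for each bridge on the path up to that bridge as
 * well: devices behind such a bridge share the bridge's source-id on their
 * transactions, so the whole path must be mapped into the same domain.
 */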
1673 static int
1674 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1675                         int translation)
1676 {
1677         int ret;
1678         struct pci_dev *tmp, *parent;
1679
1680         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1681                                          pdev->bus->number, pdev->devfn,
1682                                          translation);
1683         if (ret)
1684                 return ret;
1685
1686         /* dependent device mapping */
1687         tmp = pci_find_upstream_pcie_bridge(pdev);
1688         if (!tmp)
1689                 return 0;
1690         /* Secondary interface's bus number and devfn 0 */
1691         parent = pdev->bus->self;
1692         while (parent != tmp) {
1693                 ret = domain_context_mapping_one(domain,
1694                                                  pci_domain_nr(parent->bus),
1695                                                  parent->bus->number,
1696                                                  parent->devfn, translation);
1697                 if (ret)
1698                         return ret;
1699                 parent = parent->bus->self;
1700         }
1701         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1702                 return domain_context_mapping_one(domain,
1703                                         pci_domain_nr(tmp->subordinate),
1704                                         tmp->subordinate->number, 0,
1705                                         translation);
1706         else /* this is a legacy PCI bridge */
1707                 return domain_context_mapping_one(domain,
1708                                                   pci_domain_nr(tmp->bus),
1709                                                   tmp->bus->number,
1710                                                   tmp->devfn,
1711                                                   translation);
1712 }
1713
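/*
 * Check whether context entries are already present for @pdev and for every
 * bridge between it and its upstream PCIe-to-PCI bridge, i.e. the same set
 * of requester ids that domain_context_mapping() programs.
 */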
1714 static int domain_context_mapped(struct pci_dev *pdev)
1715 {
1716         int ret;
1717         struct pci_dev *tmp, *parent;
1718         struct intel_iommu *iommu;
1719
1720         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1721                                 pdev->devfn);
1722         if (!iommu)
1723                 return -ENODEV;
1724
1725         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1726         if (!ret)
1727                 return ret;
1728         /* dependent device mapping */
1729         tmp = pci_find_upstream_pcie_bridge(pdev);
1730         if (!tmp)
1731                 return ret;
1732         /* Secondary interface's bus number and devfn 0 */
1733         parent = pdev->bus->self;
1734         while (parent != tmp) {
1735                 ret = device_context_mapped(iommu, parent->bus->number,
1736                                             parent->devfn);
1737                 if (!ret)
1738                         return ret;
1739                 parent = parent->bus->self;
1740         }
1741         if (pci_is_pcie(tmp))
1742                 return device_context_mapped(iommu, tmp->subordinate->number,
1743                                              0);
1744         else
1745                 return device_context_mapped(iommu, tmp->bus->number,
1746                                              tmp->devfn);
1747 }
1748
1749 /* Return the number of VT-d pages spanned, rounded up to whole MM pages */
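/*
 * For example, assuming 4KiB MM pages and 4KiB VT-d pages: a buffer at
 * page offset 0x800 with size 0x1000 straddles two MM pages, so this
 * returns 2 even though the length alone would fit in one page.
 */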
1750 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1751                                             size_t size)
1752 {
1753         host_addr &= ~PAGE_MASK;
1754         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1755 }
1756
1757 /* Return largest possible superpage level for a given mapping */
1758 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1759                                           unsigned long iov_pfn,
1760                                           unsigned long phy_pfn,
1761                                           unsigned long pages)
1762 {
1763         int support, level = 1;
1764         unsigned long pfnmerge;
1765
1766         support = domain->iommu_superpage;
1767
1768         /* To use a large page, the virtual *and* physical addresses
1769            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1770            of them will mean we have to use smaller pages. So just
1771            merge them and check both at once. */
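        /* For example, assuming 4KiB VT-d base pages and iommu_superpage
           == 1: a 2MiB-aligned iov_pfn/phy_pfn pair covering at least 512
           pages gives level 2 (2MiB pages); any misalignment or a shorter
           run leaves level 1 (4KiB pages). */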
1772         pfnmerge = iov_pfn | phy_pfn;
1773
1774         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1775                 pages >>= VTD_STRIDE_SHIFT;
1776                 if (!pages)
1777                         break;
1778                 pfnmerge >>= VTD_STRIDE_SHIFT;
1779                 level++;
1780                 support--;
1781         }
1782         return level;
1783 }
1784
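/*
 * Install PTEs for @nr_pages of IOVA space starting at @iov_pfn, taking the
 * physical pages either from @sg (scatterlist mapping) or from the
 * contiguous range starting at @phys_pfn.  Superpage PTEs are used whenever
 * hardware_largepage_caps() reports that the current chunk is sufficiently
 * aligned and long enough, and the CPU cache is flushed for each page of
 * PTEs written so that non-coherent IOMMUs see the updates.
 */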
1785 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1786                             struct scatterlist *sg, unsigned long phys_pfn,
1787                             unsigned long nr_pages, int prot)
1788 {
1789         struct dma_pte *first_pte = NULL, *pte = NULL;
1790         phys_addr_t uninitialized_var(pteval);
1791         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1792         unsigned long sg_res;
1793         unsigned int largepage_lvl = 0;
1794         unsigned long lvl_pages = 0;
1795
1796         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1797
1798         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1799                 return -EINVAL;
1800
1801         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1802
1803         if (sg)
1804                 sg_res = 0;
1805         else {
1806                 sg_res = nr_pages + 1;
1807                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1808         }
1809
1810         while (nr_pages > 0) {
1811                 uint64_t tmp;
1812
1813                 if (!sg_res) {
1814                         sg_res = aligned_nrpages(sg->offset, sg->length);
1815                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1816                         sg->dma_length = sg->length;
1817                         pteval = page_to_phys(sg_page(sg)) | prot;
1818                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1819                 }
1820
1821                 if (!pte) {
1822                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1823
1824                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1825                         if (!pte)
1826                                 return -ENOMEM;
1827                         /* It is a large page */
1828                         if (largepage_lvl > 1)
1829                                 pteval |= DMA_PTE_LARGE_PAGE;
1830                         else
1831                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1832
1833                 }
1834                 /* We don't need a lock here; nobody else
1835                  * touches the iova range
1836                  */
1837                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1838                 if (tmp) {
1839                         static int dumps = 5;
1840                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1841                                iov_pfn, tmp, (unsigned long long)pteval);
1842                         if (dumps) {
1843                                 dumps--;
1844                                 debug_dma_dump_mappings(NULL);
1845                         }
1846                         WARN_ON(1);
1847                 }
1848
1849                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1850
1851                 BUG_ON(nr_pages < lvl_pages);
1852                 BUG_ON(sg_res < lvl_pages);
1853
1854                 nr_pages -= lvl_pages;
1855                 iov_pfn += lvl_pages;
1856                 phys_pfn += lvl_pages;
1857                 pteval += lvl_pages * VTD_PAGE_SIZE;
1858                 sg_res -= lvl_pages;
1859
1860                 /* If the next PTE would be the first in a new page, then we
1861                    need to flush the cache on the entries we've just written.
1862                    And then we'll need to recalculate 'pte', so clear it and
1863                    let it get set again in the if (!pte) block above.
1864
1865                    If we're done (!nr_pages) we need to flush the cache too.
1866
1867                    Also if we've been setting superpages, we may need to
1868                    recalculate 'pte' and switch back to smaller pages for the
1869                    end of the mapping, if the trailing size is not enough to
1870                    use another superpage (i.e. sg_res < lvl_pages). */
1871                 pte++;
1872                 if (!nr_pages || first_pte_in_page(pte) ||
1873                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1874                         domain_flush_cache(domain, first_pte,
1875                                            (void *)pte - (void *)first_pte);
1876                         pte = NULL;
1877                 }
1878
1879                 if (!sg_res && nr_pages)
1880                         sg = sg_next(sg);
1881         }
1882         return 0;
1883 }
1884
1885 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1886                                     struct scatterlist *sg, unsigned long nr_pages,
1887                                     int prot)
1888 {
1889         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1890 }
1891
1892 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1893                                      unsigned long phys_pfn, unsigned long nr_pages,
1894                                      int prot)
1895 {
1896         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1897 }
1898
1899 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1900 {
1901         if (!iommu)
1902                 return;
1903
1904         clear_context_table(iommu, bus, devfn);
1905         iommu->flush.flush_context(iommu, 0, 0, 0,
1906                                            DMA_CCMD_GLOBAL_INVL);
1907         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1908 }
1909
1910 static inline void unlink_domain_info(struct device_domain_info *info)
1911 {
1912         assert_spin_locked(&device_domain_lock);
1913         list_del(&info->link);
1914         list_del(&info->global);
1915         if (info->dev)
1916                 info->dev->dev.archdata.iommu = NULL;
1917 }
1918
1919 static void domain_remove_dev_info(struct dmar_domain *domain)
1920 {
1921         struct device_domain_info *info;
1922         unsigned long flags;
1923         struct intel_iommu *iommu;
1924
1925         spin_lock_irqsave(&device_domain_lock, flags);
1926         while (!list_empty(&domain->devices)) {
1927                 info = list_entry(domain->devices.next,
1928                         struct device_domain_info, link);
1929                 unlink_domain_info(info);
1930                 spin_unlock_irqrestore(&device_domain_lock, flags);
1931
1932                 iommu_disable_dev_iotlb(info);
1933                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1934                 iommu_detach_dev(iommu, info->bus, info->devfn);
1935                 free_devinfo_mem(info);
1936
1937                 spin_lock_irqsave(&device_domain_lock, flags);
1938         }
1939         spin_unlock_irqrestore(&device_domain_lock, flags);
1940 }
1941
1942 /*
1943  * find_domain
1944  * Note: struct pci_dev->dev.archdata.iommu stores the per-device info
1945  */
1946 static struct dmar_domain *
1947 find_domain(struct pci_dev *pdev)
1948 {
1949         struct device_domain_info *info;
1950
1951         /* No lock here, assumes no domain exit in normal case */
1952         info = pdev->dev.archdata.iommu;
1953         if (info)
1954                 return info->domain;
1955         return NULL;
1956 }
1957
1958 /* Find or allocate an initialized domain for the device.  Devices behind
 * the same PCIe-to-PCI bridge end up sharing a single domain. */
1959 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1960 {
1961         struct dmar_domain *domain, *found = NULL;
1962         struct intel_iommu *iommu;
1963         struct dmar_drhd_unit *drhd;
1964         struct device_domain_info *info, *tmp;
1965         struct pci_dev *dev_tmp;
1966         unsigned long flags;
1967         int bus = 0, devfn = 0;
1968         int segment;
1969         int ret;
1970
1971         domain = find_domain(pdev);
1972         if (domain)
1973                 return domain;
1974
1975         segment = pci_domain_nr(pdev->bus);
1976
1977         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1978         if (dev_tmp) {
1979                 if (pci_is_pcie(dev_tmp)) {
1980                         bus = dev_tmp->subordinate->number;
1981                         devfn = 0;
1982                 } else {
1983                         bus = dev_tmp->bus->number;
1984                         devfn = dev_tmp->devfn;
1985                 }
1986                 spin_lock_irqsave(&device_domain_lock, flags);
1987                 list_for_each_entry(info, &device_domain_list, global) {
1988                         if (info->segment == segment &&
1989                             info->bus == bus && info->devfn == devfn) {
1990                                 found = info->domain;
1991                                 break;
1992                         }
1993                 }
1994                 spin_unlock_irqrestore(&device_domain_lock, flags);
1995                 /* pcie-pci bridge already has a domain, use it */
1996                 if (found) {
1997                         domain = found;
1998                         goto found_domain;
1999                 }
2000         }
2001
2002         domain = alloc_domain();
2003         if (!domain)
2004                 goto error;
2005
2006         /* Allocate new domain for the device */
2007         drhd = dmar_find_matched_drhd_unit(pdev);
2008         if (!drhd) {
2009                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2010                         pci_name(pdev));
                     free_domain_mem(domain);
2011                 return NULL;
2012         }
2013         iommu = drhd->iommu;
2014
2015         ret = iommu_attach_domain(domain, iommu);
2016         if (ret) {
2017                 free_domain_mem(domain);
2018                 goto error;
2019         }
2020
2021         if (domain_init(domain, gaw)) {
2022                 domain_exit(domain);
2023                 goto error;
2024         }
2025
2026         /* register pcie-to-pci device */
2027         if (dev_tmp) {
2028                 info = alloc_devinfo_mem();
2029                 if (!info) {
2030                         domain_exit(domain);
2031                         goto error;
2032                 }
2033                 info->segment = segment;
2034                 info->bus = bus;
2035                 info->devfn = devfn;
2036                 info->dev = NULL;
2037                 info->domain = domain;
2038                 /* This domain is shared by devices under p2p bridge */
2039                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2040
2041                 /* pcie-to-pci bridge already has a domain, use it */
2042                 found = NULL;
2043                 spin_lock_irqsave(&device_domain_lock, flags);
2044                 list_for_each_entry(tmp, &device_domain_list, global) {
2045                         if (tmp->segment == segment &&
2046                             tmp->bus == bus && tmp->devfn == devfn) {
2047                                 found = tmp->domain;
2048                                 break;
2049                         }
2050                 }
2051                 if (found) {
2052                         spin_unlock_irqrestore(&device_domain_lock, flags);
2053                         free_devinfo_mem(info);
2054                         domain_exit(domain);
2055                         domain = found;
2056                 } else {
2057                         list_add(&info->link, &domain->devices);
2058                         list_add(&info->global, &device_domain_list);
2059                         spin_unlock_irqrestore(&device_domain_lock, flags);
2060                 }
2061         }
2062
2063 found_domain:
2064         info = alloc_devinfo_mem();
2065         if (!info)
2066                 goto error;
2067         info->segment = segment;
2068         info->bus = pdev->bus->number;
2069         info->devfn = pdev->devfn;
2070         info->dev = pdev;
2071         info->domain = domain;
2072         spin_lock_irqsave(&device_domain_lock, flags);
2073         /* somebody else may have set up this device's domain already */
2074         found = find_domain(pdev);
2075         if (found != NULL) {
2076                 spin_unlock_irqrestore(&device_domain_lock, flags);
2077                 if (found != domain) {
2078                         domain_exit(domain);
2079                         domain = found;
2080                 }
2081                 free_devinfo_mem(info);
2082                 return domain;
2083         }
2084         list_add(&info->link, &domain->devices);
2085         list_add(&info->global, &device_domain_list);
2086         pdev->dev.archdata.iommu = info;
2087         spin_unlock_irqrestore(&device_domain_lock, flags);
2088         return domain;
2089 error:
2090         /* recheck it here; another thread may have set it up meanwhile */
2091         return find_domain(pdev);
2092 }
2093
2094 static int iommu_identity_mapping;
2095 #define IDENTMAP_ALL            1
2096 #define IDENTMAP_GFX            2
2097 #define IDENTMAP_AZALIA         4
2098
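/*
 * Reserve the IOVA range covering [start, end] and install a 1:1 (identity)
 * mapping for it, clearing any PTEs already present in the range first.
 */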
2099 static int iommu_domain_identity_map(struct dmar_domain *domain,
2100                                      unsigned long long start,
2101                                      unsigned long long end)
2102 {
2103         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2104         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2105
2106         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2107                           dma_to_mm_pfn(last_vpfn))) {
2108                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2109                 return -ENOMEM;
2110         }
2111
2112         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2113                  start, end, domain->id);
2114         /*
2115          * RMRR range might have overlap with physical memory range,
2116          * clear it first
2117          */
2118         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2119
2120         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2121                                   last_vpfn - first_vpfn + 1,
2122                                   DMA_PTE_READ|DMA_PTE_WRITE);
2123 }
2124
2125 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2126                                       unsigned long long start,
2127                                       unsigned long long end)
2128 {
2129         struct dmar_domain *domain;
2130         int ret;
2131
2132         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2133         if (!domain)
2134                 return -ENOMEM;
2135
2136         /* For _hardware_ passthrough, don't bother. But for software
2137            passthrough, we do it anyway -- it may indicate a memory
2138            range which is reserved in E820 and so didn't get set
2139            up in si_domain to start with */
2140         if (domain == si_domain && hw_pass_through) {
2141                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2142                        pci_name(pdev), start, end);
2143                 return 0;
2144         }
2145
2146         printk(KERN_INFO
2147                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2148                pci_name(pdev), start, end);
2149
2150         if (end < start) {
2151                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2152                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2153                         dmi_get_system_info(DMI_BIOS_VENDOR),
2154                         dmi_get_system_info(DMI_BIOS_VERSION),
2155                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2156                 ret = -EIO;
2157                 goto error;
2158         }
2159
2160         if (end >> agaw_to_width(domain->agaw)) {
2161                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2162                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2163                      agaw_to_width(domain->agaw),
2164                      dmi_get_system_info(DMI_BIOS_VENDOR),
2165                      dmi_get_system_info(DMI_BIOS_VERSION),
2166                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2167                 ret = -EIO;
2168                 goto error;
2169         }
2170
2171         ret = iommu_domain_identity_map(domain, start, end);
2172         if (ret)
2173                 goto error;
2174
2175         /* context entry init */
2176         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2177         if (ret)
2178                 goto error;
2179
2180         return 0;
2181
2182  error:
2183         domain_exit(domain);
2184         return ret;
2185 }
2186
2187 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2188         struct pci_dev *pdev)
2189 {
2190         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2191                 return 0;
2192         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2193                 rmrr->end_address);
2194 }
2195
2196 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2197 static inline void iommu_prepare_isa(void)
2198 {
2199         struct pci_dev *pdev;
2200         int ret;
2201
2202         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2203         if (!pdev)
2204                 return;
2205
2206         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2207         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2208
2209         if (ret)
2210                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2211                        "floppy might not work\n");
2212
2213 }
2214 #else
2215 static inline void iommu_prepare_isa(void)
2216 {
2217         return;
2218 }
2219 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2220
2221 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2222
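/*
 * Build the static identity (si) domain: attach it to every active IOMMU,
 * initialise its page tables, and, unless hardware pass-through is in use
 * (hw != 0), identity-map the usable RAM ranges of every online node.
 */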
2223 static int __init si_domain_init(int hw)
2224 {
2225         struct dmar_drhd_unit *drhd;
2226         struct intel_iommu *iommu;
2227         int nid, ret = 0;
2228
2229         si_domain = alloc_domain();
2230         if (!si_domain)
2231                 return -EFAULT;
2232
2233         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2234
2235         for_each_active_iommu(iommu, drhd) {
2236                 ret = iommu_attach_domain(si_domain, iommu);
2237                 if (ret) {
2238                         domain_exit(si_domain);
2239                         return -EFAULT;
2240                 }
2241         }
2242
2243         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2244                 domain_exit(si_domain);
2245                 return -EFAULT;
2246         }
2247
2248         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2249
2250         if (hw)
2251                 return 0;
2252
2253         for_each_online_node(nid) {
2254                 unsigned long start_pfn, end_pfn;
2255                 int i;
2256
2257                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2258                         ret = iommu_domain_identity_map(si_domain,
2259                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2260                         if (ret)
2261                                 return ret;
2262                 }
2263         }
2264
2265         return 0;
2266 }
2267
2268 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2269                                           struct pci_dev *pdev);
2270 static int identity_mapping(struct pci_dev *pdev)
2271 {
2272         struct device_domain_info *info;
2273
2274         if (likely(!iommu_identity_mapping))
2275                 return 0;
2276
2277         info = pdev->dev.archdata.iommu;
2278         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2279                 return (info->domain == si_domain);
2280
2281         return 0;
2282 }
2283
2284 static int domain_add_dev_info(struct dmar_domain *domain,
2285                                struct pci_dev *pdev,
2286                                int translation)
2287 {
2288         struct device_domain_info *info;
2289         unsigned long flags;
2290         int ret;
2291
2292         info = alloc_devinfo_mem();
2293         if (!info)
2294                 return -ENOMEM;
2295
2296         info->segment = pci_domain_nr(pdev->bus);
2297         info->bus = pdev->bus->number;
2298         info->devfn = pdev->devfn;
2299         info->dev = pdev;
2300         info->domain = domain;
2301
2302         spin_lock_irqsave(&device_domain_lock, flags);
2303         list_add(&info->link, &domain->devices);
2304         list_add(&info->global, &device_domain_list);
2305         pdev->dev.archdata.iommu = info;
2306         spin_unlock_irqrestore(&device_domain_lock, flags);
2307
2308         ret = domain_context_mapping(domain, pdev, translation);
2309         if (ret) {
2310                 spin_lock_irqsave(&device_domain_lock, flags);
2311                 unlink_domain_info(info);
2312                 spin_unlock_irqrestore(&device_domain_lock, flags);
2313                 free_devinfo_mem(info);
2314                 return ret;
2315         }
2316
2317         return 0;
2318 }
2319
2320 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2321 {
2322         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2323                 return 1;
2324
2325         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2326                 return 1;
2327
2328         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2329                 return 0;
2330
2331         /*
2332          * We want to start off with all devices in the 1:1 domain, and
2333          * take them out later if we find they can't access all of memory.
2334          *
2335          * However, we can't do this for PCI devices behind bridges,
2336          * because all PCI devices behind the same bridge will end up
2337          * with the same source-id on their transactions.
2338          *
2339          * Practically speaking, we can't change things around for these
2340          * devices at run-time, because we can't be sure there'll be no
2341          * DMA transactions in flight for any of their siblings.
2342          * 
2343          * So PCI devices (unless they're on the root bus) as well as
2344          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2345          * the 1:1 domain, just in _case_ one of their siblings turns out
2346          * not to be able to map all of memory.
2347          */
2348         if (!pci_is_pcie(pdev)) {
2349                 if (!pci_is_root_bus(pdev->bus))
2350                         return 0;
2351                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2352                         return 0;
2353         } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2354                 return 0;
2355
2356         /* 
2357          * At boot time, we don't yet know if devices will be 64-bit capable.
2358          * Assume that they will -- if they turn out not to be, then we can 
2359          * take them out of the 1:1 domain later.
2360          */
2361         if (!startup) {
2362                 /*
2363                  * If the device's dma_mask is less than the system's memory
2364                  * size then this is not a candidate for identity mapping.
2365                  */
2366                 u64 dma_mask = pdev->dma_mask;
2367
2368                 if (pdev->dev.coherent_dma_mask &&
2369                     pdev->dev.coherent_dma_mask < dma_mask)
2370                         dma_mask = pdev->dev.coherent_dma_mask;
2371
2372                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2373         }
2374
2375         return 1;
2376 }
2377
2378 static int __init iommu_prepare_static_identity_mapping(int hw)
2379 {
2380         struct pci_dev *pdev = NULL;
2381         int ret;
2382
2383         ret = si_domain_init(hw);
2384         if (ret)
2385                 return -EFAULT;
2386
2387         for_each_pci_dev(pdev) {
2388                 if (iommu_should_identity_map(pdev, 1)) {
2389                         ret = domain_add_dev_info(si_domain, pdev,
2390                                              hw ? CONTEXT_TT_PASS_THROUGH :
2391                                                   CONTEXT_TT_MULTI_LEVEL);
2392                         if (ret) {
2393                                 /* device not associated with an iommu */
2394                                 if (ret == -ENODEV)
2395                                         continue;
2396                                 return ret;
2397                         }
2398                         pr_info("IOMMU: %s identity mapping for device %s\n",
2399                                 hw ? "hardware" : "software", pci_name(pdev));
2400                 }
2401         }
2402
2403         return 0;
2404 }
2405
2406 static int __init init_dmars(void)
2407 {
2408         struct dmar_drhd_unit *drhd;
2409         struct dmar_rmrr_unit *rmrr;
2410         struct pci_dev *pdev;
2411         struct intel_iommu *iommu;
2412         int i, ret;
2413
2414         /*
2415          * for each drhd
2416          *    allocate root
2417          *    initialize and program root entry to not present
2418          * endfor
2419          */
2420         for_each_drhd_unit(drhd) {
2421                 /*
2422                  * lock not needed as this is only incremented in the single-
2423                  * threaded kernel __init code path; all other accesses are
2424                  * read only
2425                  */
2426                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2427                         g_num_of_iommus++;
2428                         continue;
2429                 }
2430                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2431                           IOMMU_UNITS_SUPPORTED);
2432         }
2433
2434         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2435                         GFP_KERNEL);
2436         if (!g_iommus) {
2437                 printk(KERN_ERR "Allocating global iommu array failed\n");
2438                 ret = -ENOMEM;
2439                 goto error;
2440         }
2441
2442         deferred_flush = kzalloc(g_num_of_iommus *
2443                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2444         if (!deferred_flush) {
2445                 ret = -ENOMEM;
2446                 goto error;
2447         }
2448
2449         for_each_drhd_unit(drhd) {
2450                 if (drhd->ignored)
2451                         continue;
2452
2453                 iommu = drhd->iommu;
2454                 g_iommus[iommu->seq_id] = iommu;
2455
2456                 ret = iommu_init_domains(iommu);
2457                 if (ret)
2458                         goto error;
2459
2460                 /*
2461                  * TBD:
2462                  * we could share the same root & context tables
2463                  * among all IOMMUs. Needs to be split out later.
2464                  */
2465                 ret = iommu_alloc_root_entry(iommu);
2466                 if (ret) {
2467                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2468                         goto error;
2469                 }
2470                 if (!ecap_pass_through(iommu->ecap))
2471                         hw_pass_through = 0;
2472         }
2473
2474         /*
2475          * Start from a sane iommu hardware state.
2476          */
2477         for_each_drhd_unit(drhd) {
2478                 if (drhd->ignored)
2479                         continue;
2480
2481                 iommu = drhd->iommu;
2482
2483                 /*
2484                  * If the queued invalidation is already initialized by us
2485                  * (for example, while enabling interrupt-remapping) then
2486                  * things are already rolling from a sane state.
2487                  */
2488                 if (iommu->qi)
2489                         continue;
2490
2491                 /*
2492                  * Clear any previous faults.
2493                  */
2494                 dmar_fault(-1, iommu);
2495                 /*
2496                  * Disable queued invalidation if supported and already enabled
2497                  * before OS handover.
2498                  */
2499                 dmar_disable_qi(iommu);
2500         }
2501
2502         for_each_drhd_unit(drhd) {
2503                 if (drhd->ignored)
2504                         continue;
2505
2506                 iommu = drhd->iommu;
2507
2508                 if (dmar_enable_qi(iommu)) {
2509                         /*
2510                          * Queued Invalidate not enabled, use Register Based
2511                          * Invalidate
2512                          */
2513                         iommu->flush.flush_context = __iommu_flush_context;
2514                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2515                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2516                                "invalidation\n",
2517                                 iommu->seq_id,
2518                                (unsigned long long)drhd->reg_base_addr);
2519                 } else {
2520                         iommu->flush.flush_context = qi_flush_context;
2521                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2522                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2523                                "invalidation\n",
2524                                 iommu->seq_id,
2525                                (unsigned long long)drhd->reg_base_addr);
2526                 }
2527         }
2528
2529         if (iommu_pass_through)
2530                 iommu_identity_mapping |= IDENTMAP_ALL;
2531
2532 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2533         iommu_identity_mapping |= IDENTMAP_GFX;
2534 #endif
2535
2536         check_tylersburg_isoch();
2537
2538         /*
2539          * If any form of identity mapping was requested (e.g. pass-through or
2540          * the graphics workaround above), set up the static identity (si)
2541          * domain and its context entries now; RMRR and ISA ranges are mapped below.
2542          */
2543         if (iommu_identity_mapping) {
2544                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2545                 if (ret) {
2546                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2547                         goto error;
2548                 }
2549         }
2550         /*
2551          * For each rmrr
2552          *   for each dev attached to rmrr
2553          *   do
2554          *     locate drhd for dev, alloc domain for dev
2555          *     allocate free domain
2556          *     allocate page table entries for rmrr
2557          *     if context not allocated for bus
2558          *           allocate and init context
2559          *           set present in root table for this bus
2560          *     init context with domain, translation etc
2561          *    endfor
2562          * endfor
2563          */
2564         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2565         for_each_rmrr_units(rmrr) {
2566                 for (i = 0; i < rmrr->devices_cnt; i++) {
2567                         pdev = rmrr->devices[i];
2568                         /*
2569                          * some BIOSes list non-existent devices in the
2570                          * DMAR table.
2571                          */
2572                         if (!pdev)
2573                                 continue;
2574                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2575                         if (ret)
2576                                 printk(KERN_ERR
2577                                        "IOMMU: mapping reserved region failed\n");
2578                 }
2579         }
2580
2581         iommu_prepare_isa();
2582
2583         /*
2584          * for each drhd
2585          *   enable fault log
2586          *   global invalidate context cache
2587          *   global invalidate iotlb
2588          *   enable translation
2589          */
2590         for_each_drhd_unit(drhd) {
2591                 if (drhd->ignored) {
2592                         /*
2593                          * we always have to disable PMRs or DMA may fail on
2594                          * this device
2595                          */
2596                         if (force_on)
2597                                 iommu_disable_protect_mem_regions(drhd->iommu);
2598                         continue;
2599                 }
2600                 iommu = drhd->iommu;
2601
2602                 iommu_flush_write_buffer(iommu);
2603
2604                 ret = dmar_set_interrupt(iommu);
2605                 if (ret)
2606                         goto error;
2607
2608                 iommu_set_root_entry(iommu);
2609
2610                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2611                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2612
2613                 ret = iommu_enable_translation(iommu);
2614                 if (ret)
2615                         goto error;
2616
2617                 iommu_disable_protect_mem_regions(iommu);
2618         }
2619
2620         return 0;
2621 error:
2622         for_each_drhd_unit(drhd) {
2623                 if (drhd->ignored)
2624                         continue;
2625                 iommu = drhd->iommu;
2626                 free_iommu(iommu);
2627         }
2628         kfree(g_iommus);
2629         return ret;
2630 }
2631
2632 /* This takes a number of _MM_ pages, not VTD pages */
2633 static struct iova *intel_alloc_iova(struct device *dev,
2634                                      struct dmar_domain *domain,
2635                                      unsigned long nrpages, uint64_t dma_mask)
2636 {
2637         struct pci_dev *pdev = to_pci_dev(dev);
2638         struct iova *iova = NULL;
2639
2640         /* Restrict dma_mask to the width that the iommu can handle */
2641         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2642
2643         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2644                 /*
2645                  * First try to allocate an io virtual address in
2646                  * DMA_BIT_MASK(32) and if that fails then try allocating
2647                  * from higher range
2648                  */
2649                 iova = alloc_iova(&domain->iovad, nrpages,
2650                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2651                 if (iova)
2652                         return iova;
2653         }
2654         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2655         if (unlikely(!iova)) {
2656                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2657                        nrpages, pci_name(pdev));
2658                 return NULL;
2659         }
2660
2661         return iova;
2662 }
2663
2664 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2665 {
2666         struct dmar_domain *domain;
2667         int ret;
2668
2669         domain = get_domain_for_dev(pdev,
2670                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2671         if (!domain) {
2672                 printk(KERN_ERR
2673                         "Allocating domain for %s failed\n", pci_name(pdev));
2674                 return NULL;
2675         }
2676
2677         /* make sure context mapping is ok */
2678         if (unlikely(!domain_context_mapped(pdev))) {
2679                 ret = domain_context_mapping(domain, pdev,
2680                                              CONTEXT_TT_MULTI_LEVEL);
2681                 if (ret) {
2682                         printk(KERN_ERR
2683                                 "Domain context map for %s failed\n",
2684                                 pci_name(pdev));
2685                         return NULL;
2686                 }
2687         }
2688
2689         return domain;
2690 }
2691
2692 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2693 {
2694         struct device_domain_info *info;
2695
2696         /* No lock here, assumes no domain exit in normal case */
2697         info = dev->dev.archdata.iommu;
2698         if (likely(info))
2699                 return info->domain;
2700
2701         return __get_valid_domain_for_dev(dev);
2702 }
2703
2704 static int iommu_dummy(struct pci_dev *pdev)
2705 {
2706         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2707 }
2708
2709 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2710 static int iommu_no_mapping(struct device *dev)
2711 {
2712         struct pci_dev *pdev;
2713         int found;
2714
2715         if (unlikely(dev->bus != &pci_bus_type))
2716                 return 1;
2717
2718         pdev = to_pci_dev(dev);
2719         if (iommu_dummy(pdev))
2720                 return 1;
2721
2722         if (!iommu_identity_mapping)
2723                 return 0;
2724
2725         found = identity_mapping(pdev);
2726         if (found) {
2727                 if (iommu_should_identity_map(pdev, 0))
2728                         return 1;
2729                 else {
2730                         /*
2731                          * The device cannot address all of memory (32-bit DMA only),
2732                          * so drop it from si_domain and use non-identity mapping.
2733                          */
2734                         domain_remove_one_dev_info(si_domain, pdev);
2735                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2736                                pci_name(pdev));
2737                         return 0;
2738                 }
2739         } else {
2740                 /*
2741                  * A 64-bit DMA capable device (e.g. one just detached from a VM
2742                  * domain) is put back into si_domain for identity mapping.
2743                  */
2744                 if (iommu_should_identity_map(pdev, 0)) {
2745                         int ret;
2746                         ret = domain_add_dev_info(si_domain, pdev,
2747                                                   hw_pass_through ?
2748                                                   CONTEXT_TT_PASS_THROUGH :
2749                                                   CONTEXT_TT_MULTI_LEVEL);
2750                         if (!ret) {
2751                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2752                                        pci_name(pdev));
2753                                 return 1;
2754                         }
2755                 }
2756         }
2757
2758         return 0;
2759 }
2760
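/*
 * Map @size bytes at @paddr for DMA.  Devices that bypass translation
 * (identity-mapped, dummy or non-PCI) simply get the physical address back;
 * otherwise an IOVA below @dma_mask is allocated, PTEs are installed for the
 * covering page range, and the IOTLB (in caching mode) or the write buffer
 * is flushed before the bus address is returned.
 */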
2761 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2762                                      size_t size, int dir, u64 dma_mask)
2763 {
2764         struct pci_dev *pdev = to_pci_dev(hwdev);
2765         struct dmar_domain *domain;
2766         phys_addr_t start_paddr;
2767         struct iova *iova;
2768         int prot = 0;
2769         int ret;
2770         struct intel_iommu *iommu;
2771         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2772
2773         BUG_ON(dir == DMA_NONE);
2774
2775         if (iommu_no_mapping(hwdev))
2776                 return paddr;
2777
2778         domain = get_valid_domain_for_dev(pdev);
2779         if (!domain)
2780                 return 0;
2781
2782         iommu = domain_get_iommu(domain);
2783         size = aligned_nrpages(paddr, size);
2784
2785         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2786         if (!iova)
2787                 goto error;
2788
2789         /*
2790          * Check if DMAR supports zero-length reads on write only
2791          * mappings..
2792          */
2793         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2794                         !cap_zlr(iommu->cap))
2795                 prot |= DMA_PTE_READ;
2796         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2797                 prot |= DMA_PTE_WRITE;
2798         /*
2799          * [paddr, paddr + size) might cover only part of a page, but we map
2800          * the whole page.  Note: if two parts of one page are mapped
2801          * separately, we may end up with two guest addresses mapping to the
2802          * same host paddr, but this is not a big problem
2803          */
2804         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2805                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2806         if (ret)
2807                 goto error;
2808
2809         /* it's a non-present to present mapping. Only flush if caching mode */
2810         if (cap_caching_mode(iommu->cap))
2811                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2812         else
2813                 iommu_flush_write_buffer(iommu);
2814
2815         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2816         start_paddr += paddr & ~PAGE_MASK;
2817         return start_paddr;
2818
2819 error:
2820         if (iova)
2821                 __free_iova(&domain->iovad, iova);
2822         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2823                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2824         return 0;
2825 }
2826
2827 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2828                                  unsigned long offset, size_t size,
2829                                  enum dma_data_direction dir,
2830                                  struct dma_attrs *attrs)
2831 {
2832         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2833                                   dir, to_pci_dev(dev)->dma_mask);
2834 }
2835
2836 static void flush_unmaps(void)
2837 {
2838         int i, j;
2839
2840         timer_on = 0;
2841
2842         /* just flush them all */
2843         for (i = 0; i < g_num_of_iommus; i++) {
2844                 struct intel_iommu *iommu = g_iommus[i];
2845                 if (!iommu)
2846                         continue;
2847
2848                 if (!deferred_flush[i].next)
2849                         continue;
2850
2851                 /* In caching mode, global flushes make emulation expensive */
2852                 if (!cap_caching_mode(iommu->cap))
2853                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2854                                          DMA_TLB_GLOBAL_FLUSH);
2855                 for (j = 0; j < deferred_flush[i].next; j++) {
2856                         unsigned long mask;
2857                         struct iova *iova = deferred_flush[i].iova[j];
2858                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2859
2860                         /* On real hardware multiple invalidations are expensive */
2861                         if (cap_caching_mode(iommu->cap))
2862                                 iommu_flush_iotlb_psi(iommu, domain->id,
2863                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2864                         else {
2865                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2866                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2867                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2868                         }
2869                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2870                 }
2871                 deferred_flush[i].next = 0;
2872         }
2873
2874         list_size = 0;
2875 }
2876
2877 static void flush_unmaps_timeout(unsigned long data)
2878 {
2879         unsigned long flags;
2880
2881         spin_lock_irqsave(&async_umap_flush_lock, flags);
2882         flush_unmaps();
2883         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2884 }
2885
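/*
 * Queue a just-unmapped IOVA for deferred (batched) invalidation: the entry
 * is stashed in the per-IOMMU deferred_flush table and flushed either from
 * the unmap_timer roughly 10ms later or immediately once HIGH_WATER_MARK
 * entries have accumulated.
 */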
2886 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2887 {
2888         unsigned long flags;
2889         int next, iommu_id;
2890         struct intel_iommu *iommu;
2891
2892         spin_lock_irqsave(&async_umap_flush_lock, flags);
2893         if (list_size == HIGH_WATER_MARK)
2894                 flush_unmaps();
2895
2896         iommu = domain_get_iommu(dom);
2897         iommu_id = iommu->seq_id;
2898
2899         next = deferred_flush[iommu_id].next;
2900         deferred_flush[iommu_id].domain[next] = dom;
2901         deferred_flush[iommu_id].iova[next] = iova;
2902         deferred_flush[iommu_id].next++;
2903
2904         if (!timer_on) {
2905                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2906                 timer_on = 1;
2907         }
2908         list_size++;
2909         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2910 }
2911
2912 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2913                              size_t size, enum dma_data_direction dir,
2914                              struct dma_attrs *attrs)
2915 {
2916         struct pci_dev *pdev = to_pci_dev(dev);
2917         struct dmar_domain *domain;
2918         unsigned long start_pfn, last_pfn;
2919         struct iova *iova;
2920         struct intel_iommu *iommu;
2921
2922         if (iommu_no_mapping(dev))
2923                 return;
2924
2925         domain = find_domain(pdev);
2926         BUG_ON(!domain);
2927
2928         iommu = domain_get_iommu(domain);
2929
2930         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2931         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2932                       (unsigned long long)dev_addr))
2933                 return;
2934
2935         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2936         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2937
2938         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2939                  pci_name(pdev), start_pfn, last_pfn);
2940
2941         /*  clear the whole page */
2942         dma_pte_clear_range(domain, start_pfn, last_pfn);
2943
2944         /* free page tables */
2945         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2946
2947         if (intel_iommu_strict) {
2948                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2949                                       last_pfn - start_pfn + 1, 0);
2950                 /* free iova */
2951                 __free_iova(&domain->iovad, iova);
2952         } else {
2953                 add_unmap(domain, iova);
2954                 /*
2955                  * queue up the release of the unmap to save the 1/6th of the
2956                  * cpu used up by the iotlb flush operation...
2957                  */
2958         }
2959 }
2960
2961 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2962                                   dma_addr_t *dma_handle, gfp_t flags,
2963                                   struct dma_attrs *attrs)
2964 {
2965         void *vaddr;
2966         int order;
2967
2968         size = PAGE_ALIGN(size);
2969         order = get_order(size);
2970
2971         if (!iommu_no_mapping(hwdev))
2972                 flags &= ~(GFP_DMA | GFP_DMA32);
2973         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2974                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2975                         flags |= GFP_DMA;
2976                 else
2977                         flags |= GFP_DMA32;
2978         }
2979
2980         vaddr = (void *)__get_free_pages(flags, order);
2981         if (!vaddr)
2982                 return NULL;
2983         memset(vaddr, 0, size);
2984
2985         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2986                                          DMA_BIDIRECTIONAL,
2987                                          hwdev->coherent_dma_mask);
2988         if (*dma_handle)
2989                 return vaddr;
2990         free_pages((unsigned long)vaddr, order);
2991         return NULL;
2992 }
2993
2994 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2995                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
2996 {
2997         int order;
2998
2999         size = PAGE_ALIGN(size);
3000         order = get_order(size);
3001
3002         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3003         free_pages((unsigned long)vaddr, order);
3004 }
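/*
 * Drivers never call intel_alloc_coherent()/intel_free_coherent() directly;
 * they reach them through the generic DMA API via intel_dma_ops below.  A
 * minimal sketch (the pci_dev and the PAGE_SIZE buffer are illustrative):
 *
 *	dma_addr_t dma;
 *	void *buf = dma_alloc_coherent(&pdev->dev, PAGE_SIZE, &dma, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOMEM;
 *	... hand 'dma' to the device, use 'buf' from the CPU ...
 *	dma_free_coherent(&pdev->dev, PAGE_SIZE, buf, dma);
 */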
3005
3006 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3007                            int nelems, enum dma_data_direction dir,
3008                            struct dma_attrs *attrs)
3009 {
3010         struct pci_dev *pdev = to_pci_dev(hwdev);
3011         struct dmar_domain *domain;
3012         unsigned long start_pfn, last_pfn;
3013         struct iova *iova;
3014         struct intel_iommu *iommu;
3015
3016         if (iommu_no_mapping(hwdev))
3017                 return;
3018
3019         domain = find_domain(pdev);
3020         BUG_ON(!domain);
3021
3022         iommu = domain_get_iommu(domain);
3023
3024         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3025         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3026                       (unsigned long long)sglist[0].dma_address))
3027                 return;
3028
3029         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3030         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3031
3032         /*  clear the whole page */
3033         dma_pte_clear_range(domain, start_pfn, last_pfn);
3034
3035         /* free page tables */
3036         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3037
3038         if (intel_iommu_strict) {
3039                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3040                                       last_pfn - start_pfn + 1, 0);
3041                 /* free iova */
3042                 __free_iova(&domain->iovad, iova);
3043         } else {
3044                 add_unmap(domain, iova);
3045                 /*
3046                  * queue up the release of the unmap, saving roughly 1/6th of
3047                  * the cpu time used up by the iotlb flush operation...
3048                  */
3049         }
3050 }
3051
3052 static int intel_nontranslate_map_sg(struct device *hwdev,
3053         struct scatterlist *sglist, int nelems, int dir)
3054 {
3055         int i;
3056         struct scatterlist *sg;
3057
3058         for_each_sg(sglist, sg, nelems, i) {
3059                 BUG_ON(!sg_page(sg));
3060                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3061                 sg->dma_length = sg->length;
3062         }
3063         return nelems;
3064 }
3065
3066 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3067                         enum dma_data_direction dir, struct dma_attrs *attrs)
3068 {
3069         int i;
3070         struct pci_dev *pdev = to_pci_dev(hwdev);
3071         struct dmar_domain *domain;
3072         size_t size = 0;
3073         int prot = 0;
3074         struct iova *iova = NULL;
3075         int ret;
3076         struct scatterlist *sg;
3077         unsigned long start_vpfn;
3078         struct intel_iommu *iommu;
3079
3080         BUG_ON(dir == DMA_NONE);
3081         if (iommu_no_mapping(hwdev))
3082                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3083
3084         domain = get_valid_domain_for_dev(pdev);
3085         if (!domain)
3086                 return 0;
3087
3088         iommu = domain_get_iommu(domain);
3089
3090         for_each_sg(sglist, sg, nelems, i)
3091                 size += aligned_nrpages(sg->offset, sg->length);
3092
3093         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3094                                 pdev->dma_mask);
3095         if (!iova) {
3096                 sglist->dma_length = 0;
3097                 return 0;
3098         }
3099
3100         /*
3101          * Check if DMAR supports zero-length reads on write-only
3102          * mappings.
3103          */
3104         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3105                         !cap_zlr(iommu->cap))
3106                 prot |= DMA_PTE_READ;
3107         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3108                 prot |= DMA_PTE_WRITE;
3109
3110         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3111
3112         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3113         if (unlikely(ret)) {
3114                 /*  clear the page */
3115                 dma_pte_clear_range(domain, start_vpfn,
3116                                     start_vpfn + size - 1);
3117                 /* free page tables */
3118                 dma_pte_free_pagetable(domain, start_vpfn,
3119                                        start_vpfn + size - 1);
3120                 /* free iova */
3121                 __free_iova(&domain->iovad, iova);
3122                 return 0;
3123         }
3124
3125         /* it's a non-present to present mapping. Only flush if caching mode */
3126         if (cap_caching_mode(iommu->cap))
3127                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3128         else
3129                 iommu_flush_write_buffer(iommu);
3130
3131         return nelems;
3132 }
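/*
 * Likewise, intel_map_sg()/intel_unmap_sg() are reached through the generic
 * scatterlist DMA API.  A minimal sketch (names are illustrative):
 *
 *	int mapped = dma_map_sg(&pdev->dev, sglist, nents, DMA_TO_DEVICE);
 *
 *	if (!mapped)
 *		return -EIO;
 *	... program the device using sg_dma_address()/sg_dma_len() ...
 *	dma_unmap_sg(&pdev->dev, sglist, nents, DMA_TO_DEVICE);
 */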
3133
3134 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3135 {
3136         return !dma_addr;
3137 }
3138
3139 struct dma_map_ops intel_dma_ops = {
3140         .alloc = intel_alloc_coherent,
3141         .free = intel_free_coherent,
3142         .map_sg = intel_map_sg,
3143         .unmap_sg = intel_unmap_sg,
3144         .map_page = intel_map_page,
3145         .unmap_page = intel_unmap_page,
3146         .mapping_error = intel_mapping_error,
3147 };
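/*
 * These ops become the global dma_ops in intel_iommu_init() below.  Since
 * __intel_map_single() returns 0 on failure, a DMA address of zero is the
 * error cookie that intel_mapping_error() tests for, and drivers observe it
 * through dma_mapping_error():
 *
 *	dma_addr_t dma = dma_map_page(&pdev->dev, page, 0, PAGE_SIZE,
 *				      DMA_FROM_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, dma))
 *		return -EIO;
 */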
3148
3149 static inline int iommu_domain_cache_init(void)
3150 {
3151         int ret = 0;
3152
3153         iommu_domain_cache = kmem_cache_create("iommu_domain",
3154                                          sizeof(struct dmar_domain),
3155                                          0,
3156                                          SLAB_HWCACHE_ALIGN,
3157                                          NULL);
3159         if (!iommu_domain_cache) {
3160                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3161                 ret = -ENOMEM;
3162         }
3163
3164         return ret;
3165 }
3166
3167 static inline int iommu_devinfo_cache_init(void)
3168 {
3169         int ret = 0;
3170
3171         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3172                                          sizeof(struct device_domain_info),
3173                                          0,
3174                                          SLAB_HWCACHE_ALIGN,
3175                                          NULL);
3176         if (!iommu_devinfo_cache) {
3177                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3178                 ret = -ENOMEM;
3179         }
3180
3181         return ret;
3182 }
3183
3184 static inline int iommu_iova_cache_init(void)
3185 {
3186         int ret = 0;
3187
3188         iommu_iova_cache = kmem_cache_create("iommu_iova",
3189                                          sizeof(struct iova),
3190                                          0,
3191                                          SLAB_HWCACHE_ALIGN,
3192                                          NULL);
3193         if (!iommu_iova_cache) {
3194                 printk(KERN_ERR "Couldn't create iova cache\n");
3195                 ret = -ENOMEM;
3196         }
3197
3198         return ret;
3199 }
3200
3201 static int __init iommu_init_mempool(void)
3202 {
3203         int ret;
3204         ret = iommu_iova_cache_init();
3205         if (ret)
3206                 return ret;
3207
3208         ret = iommu_domain_cache_init();
3209         if (ret)
3210                 goto domain_error;
3211
3212         ret = iommu_devinfo_cache_init();
3213         if (!ret)
3214                 return ret;
3215
3216         kmem_cache_destroy(iommu_domain_cache);
3217 domain_error:
3218         kmem_cache_destroy(iommu_iova_cache);
3219
3220         return -ENOMEM;
3221 }
3222
3223 static void __init iommu_exit_mempool(void)
3224 {
3225         kmem_cache_destroy(iommu_devinfo_cache);
3226         kmem_cache_destroy(iommu_domain_cache);
3227         kmem_cache_destroy(iommu_iova_cache);
3228
3229 }
3230
3231 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3232 {
3233         struct dmar_drhd_unit *drhd;
3234         u32 vtbar;
3235         int rc;
3236
3237         /* We know that this device on this chipset has its own IOMMU.
3238          * If we find it under a different IOMMU, then the BIOS is lying
3239          * to us. Hope that the IOMMU for this device is actually
3240          * disabled, and it needs no translation...
3241          */
3242         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3243         if (rc) {
3244                 /* "can't" happen */
3245                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3246                 return;
3247         }
3248         vtbar &= 0xffff0000;
3249
3250         /* we know that this iommu should be at offset 0xa000 from vtbar */
3251         drhd = dmar_find_matched_drhd_unit(pdev);
3252         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3253                             TAINT_FIRMWARE_WORKAROUND,
3254                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3255                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3256 }
3257 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3258
3259 static void __init init_no_remapping_devices(void)
3260 {
3261         struct dmar_drhd_unit *drhd;
3262
3263         for_each_drhd_unit(drhd) {
3264                 if (!drhd->include_all) {
3265                         int i;
3266                         for (i = 0; i < drhd->devices_cnt; i++)
3267                                 if (drhd->devices[i] != NULL)
3268                                         break;
3269                         /* ignore DMAR unit if no pci devices exist */
3270                         if (i == drhd->devices_cnt)
3271                                 drhd->ignored = 1;
3272                 }
3273         }
3274
3275         for_each_drhd_unit(drhd) {
3276                 int i;
3277                 if (drhd->ignored || drhd->include_all)
3278                         continue;
3279
3280                 for (i = 0; i < drhd->devices_cnt; i++)
3281                         if (drhd->devices[i] &&
3282                             !IS_GFX_DEVICE(drhd->devices[i]))
3283                                 break;
3284
3285                 if (i < drhd->devices_cnt)
3286                         continue;
3287
3288                 /* This IOMMU has *only* gfx devices. Either bypass it or
3289                    set the gfx_mapped flag, as appropriate */
3290                 if (dmar_map_gfx) {
3291                         intel_iommu_gfx_mapped = 1;
3292                 } else {
3293                         drhd->ignored = 1;
3294                         for (i = 0; i < drhd->devices_cnt; i++) {
3295                                 if (!drhd->devices[i])
3296                                         continue;
3297                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3298                         }
3299                 }
3300         }
3301 }
3302
3303 #ifdef CONFIG_SUSPEND
3304 static int init_iommu_hw(void)
3305 {
3306         struct dmar_drhd_unit *drhd;
3307         struct intel_iommu *iommu = NULL;
3308
3309         for_each_active_iommu(iommu, drhd)
3310                 if (iommu->qi)
3311                         dmar_reenable_qi(iommu);
3312
3313         for_each_iommu(iommu, drhd) {
3314                 if (drhd->ignored) {
3315                         /*
3316                          * we always have to disable PMRs or DMA may fail on
3317                          * this device
3318                          */
3319                         if (force_on)
3320                                 iommu_disable_protect_mem_regions(iommu);
3321                         continue;
3322                 }
3323
3324                 iommu_flush_write_buffer(iommu);
3325
3326                 iommu_set_root_entry(iommu);
3327
3328                 iommu->flush.flush_context(iommu, 0, 0, 0,
3329                                            DMA_CCMD_GLOBAL_INVL);
3330                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3331                                          DMA_TLB_GLOBAL_FLUSH);
3332                 if (iommu_enable_translation(iommu))
3333                         return 1;
3334                 iommu_disable_protect_mem_regions(iommu);
3335         }
3336
3337         return 0;
3338 }
3339
3340 static void iommu_flush_all(void)
3341 {
3342         struct dmar_drhd_unit *drhd;
3343         struct intel_iommu *iommu;
3344
3345         for_each_active_iommu(iommu, drhd) {
3346                 iommu->flush.flush_context(iommu, 0, 0, 0,
3347                                            DMA_CCMD_GLOBAL_INVL);
3348                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3349                                          DMA_TLB_GLOBAL_FLUSH);
3350         }
3351 }
3352
3353 static int iommu_suspend(void)
3354 {
3355         struct dmar_drhd_unit *drhd;
3356         struct intel_iommu *iommu = NULL;
3357         unsigned long flag;
3358
3359         for_each_active_iommu(iommu, drhd) {
3360                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3361                                                  GFP_ATOMIC);
3362                 if (!iommu->iommu_state)
3363                         goto nomem;
3364         }
3365
3366         iommu_flush_all();
3367
3368         for_each_active_iommu(iommu, drhd) {
3369                 iommu_disable_translation(iommu);
3370
3371                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3372
3373                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3374                         readl(iommu->reg + DMAR_FECTL_REG);
3375                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3376                         readl(iommu->reg + DMAR_FEDATA_REG);
3377                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3378                         readl(iommu->reg + DMAR_FEADDR_REG);
3379                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3380                         readl(iommu->reg + DMAR_FEUADDR_REG);
3381
3382                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3383         }
3384         return 0;
3385
3386 nomem:
3387         for_each_active_iommu(iommu, drhd)
3388                 kfree(iommu->iommu_state);
3389
3390         return -ENOMEM;
3391 }
3392
3393 static void iommu_resume(void)
3394 {
3395         struct dmar_drhd_unit *drhd;
3396         struct intel_iommu *iommu = NULL;
3397         unsigned long flag;
3398
3399         if (init_iommu_hw()) {
3400                 if (force_on)
3401                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3402                 else
3403                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3404                 return;
3405         }
3406
3407         for_each_active_iommu(iommu, drhd) {
3408
3409                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3410
3411                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3412                         iommu->reg + DMAR_FECTL_REG);
3413                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3414                         iommu->reg + DMAR_FEDATA_REG);
3415                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3416                         iommu->reg + DMAR_FEADDR_REG);
3417                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3418                         iommu->reg + DMAR_FEUADDR_REG);
3419
3420                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3421         }
3422
3423         for_each_active_iommu(iommu, drhd)
3424                 kfree(iommu->iommu_state);
3425 }
3426
3427 static struct syscore_ops iommu_syscore_ops = {
3428         .resume         = iommu_resume,
3429         .suspend        = iommu_suspend,
3430 };
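/*
 * These syscore ops run on one CPU with interrupts disabled, late in suspend
 * and early in resume: iommu_suspend() disables translation and saves the
 * fault-event registers (FECTL/FEDATA/FEADDR/FEUADDR) of every active iommu,
 * and iommu_resume() restores them only after init_iommu_hw() has
 * re-established the root entries and re-enabled translation.
 */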
3431
3432 static void __init init_iommu_pm_ops(void)
3433 {
3434         register_syscore_ops(&iommu_syscore_ops);
3435 }
3436
3437 #else
3438 static inline void init_iommu_pm_ops(void) {}
3439 #endif  /* CONFIG_SUSPEND */
3440
3441 LIST_HEAD(dmar_rmrr_units);
3442
3443 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3444 {
3445         list_add(&rmrr->list, &dmar_rmrr_units);
3446 }
3447
3448
3449 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3450 {
3451         struct acpi_dmar_reserved_memory *rmrr;
3452         struct dmar_rmrr_unit *rmrru;
3453
3454         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3455         if (!rmrru)
3456                 return -ENOMEM;
3457
3458         rmrru->hdr = header;
3459         rmrr = (struct acpi_dmar_reserved_memory *)header;
3460         rmrru->base_address = rmrr->base_address;
3461         rmrru->end_address = rmrr->end_address;
3462
3463         dmar_register_rmrr_unit(rmrru);
3464         return 0;
3465 }
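/*
 * Each RMRR (Reserved Memory Region Reporting structure) registered above
 * names a range of memory that the BIOS expects particular devices to keep
 * using for DMA after the OS takes over (USB controllers touching legacy
 * keyboard buffers, for example).  init_dmars(), earlier in this file,
 * identity-maps every registered RMRR for the devices in its scope so that
 * turning on translation does not break that firmware-initiated DMA.
 */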
3466
3467 static int __init
3468 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3469 {
3470         struct acpi_dmar_reserved_memory *rmrr;
3471         int ret;
3472
3473         rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3474         ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3475                 ((void *)rmrr) + rmrr->header.length,
3476                 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3477
3478         if (ret || (rmrru->devices_cnt == 0)) {
3479                 list_del(&rmrru->list);
3480                 kfree(rmrru);
3481         }
3482         return ret;
3483 }
3484
3485 static LIST_HEAD(dmar_atsr_units);
3486
3487 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3488 {
3489         struct acpi_dmar_atsr *atsr;
3490         struct dmar_atsr_unit *atsru;
3491
3492         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3493         atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3494         if (!atsru)
3495                 return -ENOMEM;
3496
3497         atsru->hdr = hdr;
3498         atsru->include_all = atsr->flags & 0x1;
3499
3500         list_add(&atsru->list, &dmar_atsr_units);
3501
3502         return 0;
3503 }
3504
3505 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3506 {
3507         int rc;
3508         struct acpi_dmar_atsr *atsr;
3509
3510         if (atsru->include_all)
3511                 return 0;
3512
3513         atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3514         rc = dmar_parse_dev_scope((void *)(atsr + 1),
3515                                 (void *)atsr + atsr->header.length,
3516                                 &atsru->devices_cnt, &atsru->devices,
3517                                 atsr->segment);
3518         if (rc || !atsru->devices_cnt) {
3519                 list_del(&atsru->list);
3520                 kfree(atsru);
3521         }
3522
3523         return rc;
3524 }
3525
3526 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3527 {
3528         int i;
3529         struct pci_bus *bus;
3530         struct acpi_dmar_atsr *atsr;
3531         struct dmar_atsr_unit *atsru;
3532
3533         dev = pci_physfn(dev);
3534
3535         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3536                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3537                 if (atsr->segment == pci_domain_nr(dev->bus))
3538                         goto found;
3539         }
3540
3541         return 0;
3542
3543 found:
3544         for (bus = dev->bus; bus; bus = bus->parent) {
3545                 struct pci_dev *bridge = bus->self;
3546
3547                 if (!bridge || !pci_is_pcie(bridge) ||
3548                     bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3549                         return 0;
3550
3551                 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3552                         for (i = 0; i < atsru->devices_cnt; i++)
3553                                 if (atsru->devices[i] == bridge)
3554                                         return 1;
3555                         break;
3556                 }
3557         }
3558
3559         if (atsru->include_all)
3560                 return 1;
3561
3562         return 0;
3563 }
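/*
 * ATSR (ATS Reporting) structures describe which PCIe root ports permit the
 * devices below them to use Address Translation Services.  The walk above
 * therefore answers "may this device be given a device-side IOTLB?": it
 * returns 1 if the device sits under a root port listed in an ATSR, or if
 * the segment has an INCLUDE_ALL ATSR, and 0 otherwise.
 */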
3564
3565 int __init dmar_parse_rmrr_atsr_dev(void)
3566 {
3567         struct dmar_rmrr_unit *rmrr, *rmrr_n;
3568         struct dmar_atsr_unit *atsr, *atsr_n;
3569         int ret = 0;
3570
3571         list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3572                 ret = rmrr_parse_dev(rmrr);
3573                 if (ret)
3574                         return ret;
3575         }
3576
3577         list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3578                 ret = atsr_parse_dev(atsr);
3579                 if (ret)
3580                         return ret;
3581         }
3582
3583         return ret;
3584 }
3585
3586 /*
3587  * Here we only respond to a device being unbound from its driver.
3588  *
3589  * A newly added device is not attached to its DMAR domain here yet; that
3590  * happens when the device is mapped to an iova.
3591  */
3592 static int device_notifier(struct notifier_block *nb,
3593                                   unsigned long action, void *data)
3594 {
3595         struct device *dev = data;
3596         struct pci_dev *pdev = to_pci_dev(dev);
3597         struct dmar_domain *domain;
3598
3599         if (iommu_no_mapping(dev))
3600                 return 0;
3601
3602         domain = find_domain(pdev);
3603         if (!domain)
3604                 return 0;
3605
3606         if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3607                 domain_remove_one_dev_info(domain, pdev);
3608
3609                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3610                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3611                     list_empty(&domain->devices))
3612                         domain_exit(domain);
3613         }
3614
3615         return 0;
3616 }
3617
3618 static struct notifier_block device_nb = {
3619         .notifier_call = device_notifier,
3620 };
3621
3622 int __init intel_iommu_init(void)
3623 {
3624         int ret = 0;
3625
3626         /* VT-d is required for a TXT/tboot launch, so enforce that */
3627         force_on = tboot_force_iommu();
3628
3629         if (dmar_table_init()) {
3630                 if (force_on)
3631                         panic("tboot: Failed to initialize DMAR table\n");
3632                 return  -ENODEV;
3633         }
3634
3635         if (dmar_dev_scope_init() < 0) {
3636                 if (force_on)
3637                         panic("tboot: Failed to initialize DMAR device scope\n");
3638                 return  -ENODEV;
3639         }
3640
3641         if (no_iommu || dmar_disabled)
3642                 return -ENODEV;
3643
3644         if (iommu_init_mempool()) {
3645                 if (force_on)
3646                         panic("tboot: Failed to initialize iommu memory\n");
3647                 return  -ENODEV;
3648         }
3649
3650         if (list_empty(&dmar_rmrr_units))
3651                 printk(KERN_INFO "DMAR: No RMRR found\n");
3652
3653         if (list_empty(&dmar_atsr_units))
3654                 printk(KERN_INFO "DMAR: No ATSR found\n");
3655
3656         if (dmar_init_reserved_ranges()) {
3657                 if (force_on)
3658                         panic("tboot: Failed to reserve iommu ranges\n");
3659                 return  -ENODEV;
3660         }
3661
3662         init_no_remapping_devices();
3663
3664         ret = init_dmars();
3665         if (ret) {
3666                 if (force_on)
3667                         panic("tboot: Failed to initialize DMARs\n");
3668                 printk(KERN_ERR "IOMMU: dmar init failed\n");
3669                 put_iova_domain(&reserved_iova_list);
3670                 iommu_exit_mempool();
3671                 return ret;
3672         }
3673         printk(KERN_INFO
3674         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3675
3676         init_timer(&unmap_timer);
3677 #ifdef CONFIG_SWIOTLB
3678         swiotlb = 0;
3679 #endif
3680         dma_ops = &intel_dma_ops;
3681
3682         init_iommu_pm_ops();
3683
3684         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3685
3686         bus_register_notifier(&pci_bus_type, &device_nb);
3687
3688         intel_iommu_enabled = 1;
3689
3690         return 0;
3691 }
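/*
 * Much of the behaviour above is tunable from the kernel command line via
 * options parsed by intel_iommu_setup() earlier in this file:
 * "intel_iommu=off" (dmar_disabled), "intel_iommu=igfx_off" (dmar_map_gfx),
 * "intel_iommu=strict" (intel_iommu_strict) and "intel_iommu=sp_off"
 * (intel_iommu_superpage); "iommu=pt" selects pass-through identity mapping.
 */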
3692
3693 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3694                                            struct pci_dev *pdev)
3695 {
3696         struct pci_dev *tmp, *parent;
3697
3698         if (!iommu || !pdev)
3699                 return;
3700
3701         /* dependent device detach */
3702         tmp = pci_find_upstream_pcie_bridge(pdev);
3703         /* Secondary interface's bus number and devfn 0 */
3704         if (tmp) {
3705                 parent = pdev->bus->self;
3706                 while (parent != tmp) {
3707                         iommu_detach_dev(iommu, parent->bus->number,
3708                                          parent->devfn);
3709                         parent = parent->bus->self;
3710                 }
3711                 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3712                         iommu_detach_dev(iommu,
3713                                 tmp->subordinate->number, 0);
3714                 else /* this is a legacy PCI bridge */
3715                         iommu_detach_dev(iommu, tmp->bus->number,
3716                                          tmp->devfn);
3717         }
3718 }
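/*
 * DMA requests from devices behind a PCIe-to-PCI(-X) bridge reach the IOMMU
 * tagged with the bridge's (or its secondary bus's) requester ID, so context
 * entries were set up for the whole upstream bridge chain when the device
 * was attached; the walk above tears those extra entries down again.
 */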
3719
3720 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3721                                           struct pci_dev *pdev)
3722 {
3723         struct device_domain_info *info;
3724         struct intel_iommu *iommu;
3725         unsigned long flags;
3726         int found = 0;
3727         struct list_head *entry, *tmp;
3728
3729         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3730                                 pdev->devfn);
3731         if (!iommu)
3732                 return;
3733
3734         spin_lock_irqsave(&device_domain_lock, flags);
3735         list_for_each_safe(entry, tmp, &domain->devices) {
3736                 info = list_entry(entry, struct device_domain_info, link);
3737                 if (info->segment == pci_domain_nr(pdev->bus) &&
3738                     info->bus == pdev->bus->number &&
3739                     info->devfn == pdev->devfn) {
3740                         unlink_domain_info(info);
3741                         spin_unlock_irqrestore(&device_domain_lock, flags);
3742
3743                         iommu_disable_dev_iotlb(info);
3744                         iommu_detach_dev(iommu, info->bus, info->devfn);
3745                         iommu_detach_dependent_devices(iommu, pdev);
3746                         free_devinfo_mem(info);
3747
3748                         spin_lock_irqsave(&device_domain_lock, flags);
3749
3750                         if (found)
3751                                 break;
3752                         else
3753                                 continue;
3754                 }
3755
3756                 /* if there are no other devices under the same iommu
3757                  * owned by this domain, clear this iommu in iommu_bmp and
3758                  * update the iommu count and coherency
3759                  */
3760                 if (iommu == device_to_iommu(info->segment, info->bus,
3761                                             info->devfn))
3762                         found = 1;
3763         }
3764
3765         spin_unlock_irqrestore(&device_domain_lock, flags);
3766
3767         if (found == 0) {
3768                 unsigned long tmp_flags;
3769                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3770                 clear_bit(iommu->seq_id, domain->iommu_bmp);
3771                 domain->iommu_count--;
3772                 domain_update_iommu_cap(domain);
3773                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3774
3775                 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3776                     !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3777                         spin_lock_irqsave(&iommu->lock, tmp_flags);
3778                         clear_bit(domain->id, iommu->domain_ids);
3779                         iommu->domains[domain->id] = NULL;
3780                         spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3781                 }
3782         }
3783 }
3784
3785 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3786 {
3787         struct device_domain_info *info;
3788         struct intel_iommu *iommu;
3789         unsigned long flags1, flags2;
3790
3791         spin_lock_irqsave(&device_domain_lock, flags1);
3792         while (!list_empty(&domain->devices)) {
3793                 info = list_entry(domain->devices.next,
3794                         struct device_domain_info, link);
3795                 unlink_domain_info(info);
3796                 spin_unlock_irqrestore(&device_domain_lock, flags1);
3797
3798                 iommu_disable_dev_iotlb(info);
3799                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3800                 iommu_detach_dev(iommu, info->bus, info->devfn);
3801                 iommu_detach_dependent_devices(iommu, info->dev);
3802
3803                 /* clear this iommu in iommu_bmp, update iommu count
3804                  * and capabilities
3805                  */
3806                 spin_lock_irqsave(&domain->iommu_lock, flags2);
3807                 if (test_and_clear_bit(iommu->seq_id,
3808                                        domain->iommu_bmp)) {
3809                         domain->iommu_count--;
3810                         domain_update_iommu_cap(domain);
3811                 }
3812                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3813
3814                 free_devinfo_mem(info);
3815                 spin_lock_irqsave(&device_domain_lock, flags1);
3816         }
3817         spin_unlock_irqrestore(&device_domain_lock, flags1);
3818 }
3819
3820 /* domain id for virtual machines; it won't be set in the context entries */
3821 static unsigned long vm_domid;
3822
3823 static struct dmar_domain *iommu_alloc_vm_domain(void)
3824 {
3825         struct dmar_domain *domain;
3826
3827         domain = alloc_domain_mem();
3828         if (!domain)
3829                 return NULL;
3830
3831         domain->id = vm_domid++;
3832         domain->nid = -1;
3833         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
3834         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3835
3836         return domain;
3837 }
3838
3839 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3840 {
3841         int adjust_width;
3842
3843         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3844         spin_lock_init(&domain->iommu_lock);
3845
3846         domain_reserve_special_ranges(domain);
3847
3848         /* calculate AGAW */
3849         domain->gaw = guest_width;
3850         adjust_width = guestwidth_to_adjustwidth(guest_width);
3851         domain->agaw = width_to_agaw(adjust_width);
3852
3853         INIT_LIST_HEAD(&domain->devices);
3854
3855         domain->iommu_count = 0;
3856         domain->iommu_coherency = 0;
3857         domain->iommu_snooping = 0;
3858         domain->iommu_superpage = 0;
3859         domain->max_addr = 0;
3860         domain->nid = -1;
3861
3862         /* always allocate the top pgd */
3863         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3864         if (!domain->pgd)
3865                 return -ENOMEM;
3866         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3867         return 0;
3868 }
3869
3870 static void iommu_free_vm_domain(struct dmar_domain *domain)
3871 {
3872         unsigned long flags;
3873         struct dmar_drhd_unit *drhd;
3874         struct intel_iommu *iommu;
3875         unsigned long i;
3876         unsigned long ndomains;
3877
3878         for_each_drhd_unit(drhd) {
3879                 if (drhd->ignored)
3880                         continue;
3881                 iommu = drhd->iommu;
3882
3883                 ndomains = cap_ndoms(iommu->cap);
3884                 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3885                         if (iommu->domains[i] == domain) {
3886                                 spin_lock_irqsave(&iommu->lock, flags);
3887                                 clear_bit(i, iommu->domain_ids);
3888                                 iommu->domains[i] = NULL;
3889                                 spin_unlock_irqrestore(&iommu->lock, flags);
3890                                 break;
3891                         }
3892                 }
3893         }
3894 }
3895
3896 static void vm_domain_exit(struct dmar_domain *domain)
3897 {
3898         /* Domain 0 is reserved, so don't process it */
3899         if (!domain)
3900                 return;
3901
3902         vm_domain_remove_all_dev_info(domain);
3903         /* destroy iovas */
3904         put_iova_domain(&domain->iovad);
3905
3906         /* clear ptes */
3907         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3908
3909         /* free page tables */
3910         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3911
3912         iommu_free_vm_domain(domain);
3913         free_domain_mem(domain);
3914 }
3915
3916 static int intel_iommu_domain_init(struct iommu_domain *domain)
3917 {
3918         struct dmar_domain *dmar_domain;
3919
3920         dmar_domain = iommu_alloc_vm_domain();
3921         if (!dmar_domain) {
3922                 printk(KERN_ERR
3923                         "intel_iommu_domain_init: dmar_domain == NULL\n");
3924                 return -ENOMEM;
3925         }
3926         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3927                 printk(KERN_ERR
3928                         "intel_iommu_domain_init() failed\n");
3929                 vm_domain_exit(dmar_domain);
3930                 return -ENOMEM;
3931         }
3932         domain_update_iommu_cap(dmar_domain);
3933         domain->priv = dmar_domain;
3934
3935         return 0;
3936 }
3937
3938 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3939 {
3940         struct dmar_domain *dmar_domain = domain->priv;
3941
3942         domain->priv = NULL;
3943         vm_domain_exit(dmar_domain);
3944 }
3945
3946 static int intel_iommu_attach_device(struct iommu_domain *domain,
3947                                      struct device *dev)
3948 {
3949         struct dmar_domain *dmar_domain = domain->priv;
3950         struct pci_dev *pdev = to_pci_dev(dev);
3951         struct intel_iommu *iommu;
3952         int addr_width;
3953
3954         /* normally pdev is not mapped */
3955         if (unlikely(domain_context_mapped(pdev))) {
3956                 struct dmar_domain *old_domain;
3957
3958                 old_domain = find_domain(pdev);
3959                 if (old_domain) {
3960                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3961                             dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3962                                 domain_remove_one_dev_info(old_domain, pdev);
3963                         else
3964                                 domain_remove_dev_info(old_domain);
3965                 }
3966         }
3967
3968         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3969                                 pdev->devfn);
3970         if (!iommu)
3971                 return -ENODEV;
3972
3973         /* check if this iommu agaw is sufficient for max mapped address */
3974         addr_width = agaw_to_width(iommu->agaw);
3975         if (addr_width > cap_mgaw(iommu->cap))
3976                 addr_width = cap_mgaw(iommu->cap);
3977
3978         if (dmar_domain->max_addr > (1LL << addr_width)) {
3979                 printk(KERN_ERR "%s: iommu width (%d) is not "
3980                        "sufficient for the mapped address (%llx)\n",
3981                        __func__, addr_width, dmar_domain->max_addr);
3982                 return -EFAULT;
3983         }
3984         dmar_domain->gaw = addr_width;
3985
3986         /*
3987          * Knock out extra levels of page tables if necessary
3988          */
3989         while (iommu->agaw < dmar_domain->agaw) {
3990                 struct dma_pte *pte;
3991
3992                 pte = dmar_domain->pgd;
3993                 if (dma_pte_present(pte)) {
3994                         dmar_domain->pgd = (struct dma_pte *)
3995                                 phys_to_virt(dma_pte_addr(pte));
3996                         free_pgtable_page(pte);
3997                 }
3998                 dmar_domain->agaw--;
3999         }
4000
4001         return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
4002 }
4003
4004 static void intel_iommu_detach_device(struct iommu_domain *domain,
4005                                       struct device *dev)
4006 {
4007         struct dmar_domain *dmar_domain = domain->priv;
4008         struct pci_dev *pdev = to_pci_dev(dev);
4009
4010         domain_remove_one_dev_info(dmar_domain, pdev);
4011 }
4012
4013 static int intel_iommu_map(struct iommu_domain *domain,
4014                            unsigned long iova, phys_addr_t hpa,
4015                            size_t size, int iommu_prot)
4016 {
4017         struct dmar_domain *dmar_domain = domain->priv;
4018         u64 max_addr;
4019         int prot = 0;
4020         int ret;
4021
4022         if (iommu_prot & IOMMU_READ)
4023                 prot |= DMA_PTE_READ;
4024         if (iommu_prot & IOMMU_WRITE)
4025                 prot |= DMA_PTE_WRITE;
4026         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4027                 prot |= DMA_PTE_SNP;
4028
4029         max_addr = iova + size;
4030         if (dmar_domain->max_addr < max_addr) {
4031                 u64 end;
4032
4033                 /* check if minimum agaw is sufficient for mapped address */
4034                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4035                 if (end < max_addr) {
4036                         printk(KERN_ERR "%s: iommu width (%d) is not "
4037                                "sufficient for the mapped address (%llx)\n",
4038                                __func__, dmar_domain->gaw, max_addr);
4039                         return -EFAULT;
4040                 }
4041                 dmar_domain->max_addr = max_addr;
4042         }
4043         /* Round up size to next multiple of PAGE_SIZE, if it and
4044            the low bits of hpa would take us onto the next page */
4045         size = aligned_nrpages(hpa, size);
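        /*
         * e.g. an hpa ending in 0xf00 with size 0x200 straddles a 4KiB
         * page boundary, so aligned_nrpages() yields 2 pages here.
         */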
4046         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4047                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4048         return ret;
4049 }
4050
4051 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4052                              unsigned long iova, size_t size)
4053 {
4054         struct dmar_domain *dmar_domain = domain->priv;
4055         int order;
4056
4057         order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
4058                             (iova + size - 1) >> VTD_PAGE_SHIFT);
4059
4060         if (dmar_domain->max_addr == iova + size)
4061                 dmar_domain->max_addr = iova;
4062
4063         return PAGE_SIZE << order;
4064 }
4065
4066 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4067                                             unsigned long iova)
4068 {
4069         struct dmar_domain *dmar_domain = domain->priv;
4070         struct dma_pte *pte;
4071         u64 phys = 0;
4072
4073         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
4074         if (pte)
4075                 phys = dma_pte_addr(pte);
4076
4077         return phys;
4078 }
4079
4080 static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
4081                                       unsigned long cap)
4082 {
4083         struct dmar_domain *dmar_domain = domain->priv;
4084
4085         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4086                 return dmar_domain->iommu_snooping;
4087         if (cap == IOMMU_CAP_INTR_REMAP)
4088                 return irq_remapping_enabled;
4089
4090         return 0;
4091 }
4092
4093 /*
4094  * Group numbers are arbitrary.  Devices with the same group number
4095  * indicate that the iommu cannot differentiate between them.  To avoid
4096  * tracking used groups we just use the seg|bus|devfn of the lowest
4097  * level at which we're able to differentiate devices.
4098  */
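/*
 * For example, on the little-endian x86 machines VT-d runs on, segment 0,
 * bus 0x03 and devfn PCI_DEVFN(2, 0) == 0x10 pack into the 32-bit group id
 * 0x00000310 through the union below.
 */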
4099 static int intel_iommu_device_group(struct device *dev, unsigned int *groupid)
4100 {
4101         struct pci_dev *pdev = to_pci_dev(dev);
4102         struct pci_dev *bridge;
4103         union {
4104                 struct {
4105                         u8 devfn;
4106                         u8 bus;
4107                         u16 segment;
4108                 } pci;
4109                 u32 group;
4110         } id;
4111
4112         if (iommu_no_mapping(dev))
4113                 return -ENODEV;
4114
4115         id.pci.segment = pci_domain_nr(pdev->bus);
4116         id.pci.bus = pdev->bus->number;
4117         id.pci.devfn = pdev->devfn;
4118
4119         if (!device_to_iommu(id.pci.segment, id.pci.bus, id.pci.devfn))
4120                 return -ENODEV;
4121
4122         bridge = pci_find_upstream_pcie_bridge(pdev);
4123         if (bridge) {
4124                 if (pci_is_pcie(bridge)) {
4125                         id.pci.bus = bridge->subordinate->number;
4126                         id.pci.devfn = 0;
4127                 } else {
4128                         id.pci.bus = bridge->bus->number;
4129                         id.pci.devfn = bridge->devfn;
4130                 }
4131         }
4132
4133         if (!pdev->is_virtfn && iommu_group_mf)
4134                 id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0);
4135
4136         *groupid = id.group;
4137
4138         return 0;
4139 }
4140
4141 static struct iommu_ops intel_iommu_ops = {
4142         .domain_init    = intel_iommu_domain_init,
4143         .domain_destroy = intel_iommu_domain_destroy,
4144         .attach_dev     = intel_iommu_attach_device,
4145         .detach_dev     = intel_iommu_detach_device,
4146         .map            = intel_iommu_map,
4147         .unmap          = intel_iommu_unmap,
4148         .iova_to_phys   = intel_iommu_iova_to_phys,
4149         .domain_has_cap = intel_iommu_domain_has_cap,
4150         .device_group   = intel_iommu_device_group,
4151         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4152 };
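/*
 * Consumers such as KVM device assignment drive these callbacks through the
 * generic IOMMU API rather than calling them directly.  A minimal sketch
 * (error handling omitted, names illustrative):
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, phys, 4096, IOMMU_READ | IOMMU_WRITE);
 *	...
 *	iommu_unmap(dom, iova, 4096);
 *	iommu_detach_device(dom, &pdev->dev);
 *	iommu_domain_free(dom);
 */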
4153
4154 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
4155 {
4156         /*
4157          * Mobile 4 Series Chipset neglects to set RWBF capability,
4158          * but needs it:
4159          */
4160         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4161         rwbf_quirk = 1;
4162
4163         /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
4164         if (dev->revision == 0x07) {
4165                 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4166                 dmar_map_gfx = 0;
4167         }
4168 }
4169
4170 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4171
4172 #define GGC 0x52
4173 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4174 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4175 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4176 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4177 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4178 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4179 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4180 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
4181
4182 static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4183 {
4184         unsigned short ggc;
4185
4186         if (pci_read_config_word(dev, GGC, &ggc))
4187                 return;
4188
4189         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4190                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4191                 dmar_map_gfx = 0;
4192         } else if (dmar_map_gfx) {
4193                 /* we have to ensure the gfx device is idle before we flush */
4194                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4195                 intel_iommu_strict = 1;
4196         }
4197 }
4198 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4199 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4200 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4201 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4202
4203 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4204    ISOCH DMAR unit for the Azalia sound device, but not give it any
4205    TLB entries, which causes it to deadlock. Check for that.  We do
4206    this in a function called from init_dmars(), instead of in a PCI
4207    quirk, because we don't want to print the obnoxious "BIOS broken"
4208    message if VT-d is actually disabled.
4209 */
4210 static void __init check_tylersburg_isoch(void)
4211 {
4212         struct pci_dev *pdev;
4213         uint32_t vtisochctrl;
4214
4215         /* If there's no Azalia in the system anyway, forget it. */
4216         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4217         if (!pdev)
4218                 return;
4219         pci_dev_put(pdev);
4220
4221         /* System Management Registers. Might be hidden, in which case
4222            we can't do the sanity check. But that's OK, because the
4223            known-broken BIOSes _don't_ actually hide it, so far. */
4224         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4225         if (!pdev)
4226                 return;
4227
4228         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4229                 pci_dev_put(pdev);
4230                 return;
4231         }
4232
4233         pci_dev_put(pdev);
4234
4235         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4236         if (vtisochctrl & 1)
4237                 return;
4238
4239         /* Drop all bits other than the number of TLB entries */
4240         vtisochctrl &= 0x1c;
4241
4242         /* If we have the recommended number of TLB entries (16), fine. */
4243         if (vtisochctrl == 0x10)
4244                 return;
4245
4246         /* Zero TLB entries? You get to ride the short bus to school. */
4247         if (!vtisochctrl) {
4248                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4249                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4250                      dmi_get_system_info(DMI_BIOS_VENDOR),
4251                      dmi_get_system_info(DMI_BIOS_VERSION),
4252                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4253                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4254                 return;
4255         }
4256
4257         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4258                vtisochctrl);
4259 }