Merge tag 'efi-next' of git://git.kernel.org/pub/scm/linux/kernel/git/mfleming/efi...
author     Ingo Molnar <mingo@kernel.org>
           Tue, 26 Nov 2013 11:23:04 +0000 (12:23 +0100)
committer  Ingo Molnar <mingo@kernel.org>
           Tue, 26 Nov 2013 11:23:04 +0000 (12:23 +0100)
Pull EFI virtual mapping changes from Matt Fleming:

  * New static EFI runtime services virtual mapping layout which is
    groundwork for kexec support on EFI. (Borislav Petkov)

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Documentation/kernel-parameters.txt
Documentation/x86/x86_64/mm.txt
arch/x86/include/asm/efi.h
arch/x86/include/asm/pgtable_types.h
arch/x86/mm/pageattr.c
arch/x86/platform/efi/efi.c
arch/x86/platform/efi/efi_32.c
arch/x86/platform/efi/efi_64.c
arch/x86/platform/efi/efi_stub_64.S
include/linux/efi.h

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 50680a59a2ff9a913e449a1d71ced9fab3004fe4..e06e99303dd3f544dff67b7b657885b749b7ae5a 100644
@@ -890,6 +890,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        edd=            [EDD]
                        Format: {"off" | "on" | "skip[mbr]"}
 
+       efi=            [EFI]
+                       Format: { "old_map" }
+                       old_map [X86-64]: switch to the old ioremap-based EFI
+                       runtime services mapping. 32-bit still uses this one by
+                       default.
+
        efi_no_storage_paranoia [EFI; X86]
                        Using this parameter you can use more than 50% of
                        your efi variable storage. Use this parameter only if
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 881582f75c9ceb14e6eaacc48f3edecef31b3e3a..c584a51add15ad1ca8033207e46bf497abb33570 100644
@@ -28,4 +28,11 @@ reference.
 Current X86-64 implementations only support 40 bits of address space,
 but we support up to 46 bits. This expands into MBZ space in the page tables.
 
+->trampoline_pgd:
+
+We map EFI runtime services in the aforementioned PGD, in a virtual
+range of 64GB (arbitrarily chosen; it can be raised if needed):
+
+0xffffffef00000000 - 0xffffffff00000000
+
 -Andi Kleen, Jul 2004
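
As a quick check of the range documented above, the window from 0xffffffef00000000
up to 0xffffffff00000000 (-4GB) is exactly 64GB. A stand-alone user-space C snippet
(not kernel code, values copied from the text) that verifies the arithmetic:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t start = 0xffffffef00000000ULL; /* bottom of the EFI runtime window */
        uint64_t end   = 0xffffffff00000000ULL; /* -4GB, top of the window */

        /* (end - start) >> 30 == 64, i.e. the window spans 64GB */
        printf("EFI runtime VA window: %llu GB\n",
               (unsigned long long)((end - start) >> 30));
        return 0;
}
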
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 65c6e6e3a5521bc73e843a5459b63ec99e02d689..89a05b0507b98dbbb2ed83169c59058f044704a7 100644
@@ -1,6 +1,24 @@
 #ifndef _ASM_X86_EFI_H
 #define _ASM_X86_EFI_H
 
+/*
+ * We map the EFI regions needed for runtime services non-contiguously,
+ * with preserved alignment on virtual addresses starting from -4G down
+ * for a total max space of 64G. This way, we provide for stable runtime
+ * services addresses across kernels so that a kexec'd kernel can still
+ * use them.
+ *
+ * This is the main reason why we're doing stable VA mappings for RT
+ * services.
+ *
+ * This flag is used in conjunction with a chicken bit called
+ * "efi=old_map" which can be used as a fallback to the old runtime
+ * services mapping method in case there's some b0rkage with a
+ * particular EFI implementation (haha, it is hard to hold up the
+ * sarcasm here...).
+ */
+#define EFI_OLD_MEMMAP         EFI_ARCH_1
+
 #ifdef CONFIG_X86_32
 
 #define EFI_LOADER_SIGNATURE   "EL32"
@@ -69,24 +87,31 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
        efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3),         \
                  (u64)(a4), (u64)(a5), (u64)(a6))
 
+#define _efi_call_virtX(x, f, ...)                                     \
+({                                                                     \
+       efi_status_t __s;                                               \
+                                                                       \
+       efi_sync_low_kernel_mappings();                                 \
+       preempt_disable();                                              \
+       __s = efi_call##x((void *)efi.systab->runtime->f, __VA_ARGS__); \
+       preempt_enable();                                               \
+       __s;                                                            \
+})
+
 #define efi_call_virt0(f)                              \
-       efi_call0((efi.systab->runtime->f))
-#define efi_call_virt1(f, a1)                                  \
-       efi_call1((efi.systab->runtime->f), (u64)(a1))
-#define efi_call_virt2(f, a1, a2)                                      \
-       efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2))
-#define efi_call_virt3(f, a1, a2, a3)                                  \
-       efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
-                 (u64)(a3))
-#define efi_call_virt4(f, a1, a2, a3, a4)                              \
-       efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
-                 (u64)(a3), (u64)(a4))
-#define efi_call_virt5(f, a1, a2, a3, a4, a5)                          \
-       efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
-                 (u64)(a3), (u64)(a4), (u64)(a5))
-#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)                      \
-       efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
-                 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
+       _efi_call_virtX(0, f)
+#define efi_call_virt1(f, a1)                          \
+       _efi_call_virtX(1, f, (u64)(a1))
+#define efi_call_virt2(f, a1, a2)                      \
+       _efi_call_virtX(2, f, (u64)(a1), (u64)(a2))
+#define efi_call_virt3(f, a1, a2, a3)                  \
+       _efi_call_virtX(3, f, (u64)(a1), (u64)(a2), (u64)(a3))
+#define efi_call_virt4(f, a1, a2, a3, a4)              \
+       _efi_call_virtX(4, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4))
+#define efi_call_virt5(f, a1, a2, a3, a4, a5)          \
+       _efi_call_virtX(5, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5))
+#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)      \
+       _efi_call_virtX(6, f, (u64)(a1), (u64)(a2), (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
 
 extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
                                 u32 type, u64 attribute);
@@ -95,12 +120,17 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
 
 extern int add_efi_memmap;
 extern unsigned long x86_efi_facility;
+extern struct efi_scratch efi_scratch;
 extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
 extern int efi_memblock_x86_reserve_range(void);
 extern void efi_call_phys_prelog(void);
 extern void efi_call_phys_epilog(void);
 extern void efi_unmap_memmap(void);
 extern void efi_memory_uc(u64 addr, unsigned long size);
+extern void __init efi_map_region(efi_memory_desc_t *md);
+extern void efi_sync_low_kernel_mappings(void);
+extern void efi_setup_page_tables(void);
+extern void __init old_map_region(efi_memory_desc_t *md);
 
 #ifdef CONFIG_EFI
 
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 0ecac257fb26cffd2bb9a6ece7b7a5976eca3c84..a83aa44bb1fb831cac7cda82a590c05ea0e47235 100644
@@ -382,7 +382,8 @@ static inline void update_page_count(int level, unsigned long pages) { }
  */
 extern pte_t *lookup_address(unsigned long address, unsigned int *level);
 extern phys_addr_t slow_virt_to_phys(void *__address);
-
+extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+                                  unsigned numpages, unsigned long page_flags);
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_X86_PGTABLE_DEFS_H */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index bb32480c2d713d57a52747f2f6e119a70286dc07..b3b19f46c0164c7169c9259a68836afbaeae1943 100644
@@ -30,6 +30,7 @@
  */
 struct cpa_data {
        unsigned long   *vaddr;
+       pgd_t           *pgd;
        pgprot_t        mask_set;
        pgprot_t        mask_clr;
        int             numpages;
@@ -322,17 +323,9 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
        return prot;
 }
 
-/*
- * Lookup the page table entry for a virtual address. Return a pointer
- * to the entry and the level of the mapping.
- *
- * Note: We return pud and pmd either when the entry is marked large
- * or when the present bit is not set. Otherwise we would return a
- * pointer to a nonexisting mapping.
- */
-pte_t *lookup_address(unsigned long address, unsigned int *level)
+static pte_t *__lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
+                                     unsigned int *level)
 {
-       pgd_t *pgd = pgd_offset_k(address);
        pud_t *pud;
        pmd_t *pmd;
 
@@ -361,8 +354,31 @@ pte_t *lookup_address(unsigned long address, unsigned int *level)
 
        return pte_offset_kernel(pmd, address);
 }
+
+/*
+ * Lookup the page table entry for a virtual address. Return a pointer
+ * to the entry and the level of the mapping.
+ *
+ * Note: We return pud and pmd either when the entry is marked large
+ * or when the present bit is not set. Otherwise we would return a
+ * pointer to a nonexisting mapping.
+ */
+pte_t *lookup_address(unsigned long address, unsigned int *level)
+{
+        return __lookup_address_in_pgd(pgd_offset_k(address), address, level);
+}
 EXPORT_SYMBOL_GPL(lookup_address);
 
+static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
+                                 unsigned int *level)
+{
+        if (cpa->pgd)
+               return __lookup_address_in_pgd(cpa->pgd + pgd_index(address),
+                                              address, level);
+
+        return lookup_address(address, level);
+}
+
 /*
  * This is necessary because __pa() does not work on some
  * kinds of memory, like vmalloc() or the alloc_remap()
@@ -437,7 +453,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
         * Check for races, another CPU might have split this page
         * up already:
         */
-       tmp = lookup_address(address, &level);
+       tmp = _lookup_address_cpa(cpa, address, &level);
        if (tmp != kpte)
                goto out_unlock;
 
@@ -543,7 +559,8 @@ out_unlock:
 }
 
 static int
-__split_large_page(pte_t *kpte, unsigned long address, struct page *base)
+__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
+                  struct page *base)
 {
        pte_t *pbase = (pte_t *)page_address(base);
        unsigned long pfn, pfninc = 1;
@@ -556,7 +573,7 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)
         * Check for races, another CPU might have split this page
         * up for us already:
         */
-       tmp = lookup_address(address, &level);
+       tmp = _lookup_address_cpa(cpa, address, &level);
        if (tmp != kpte) {
                spin_unlock(&pgd_lock);
                return 1;
@@ -632,7 +649,8 @@ __split_large_page(pte_t *kpte, unsigned long address, struct page *base)
        return 0;
 }
 
-static int split_large_page(pte_t *kpte, unsigned long address)
+static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
+                           unsigned long address)
 {
        struct page *base;
 
@@ -644,15 +662,390 @@ static int split_large_page(pte_t *kpte, unsigned long address)
        if (!base)
                return -ENOMEM;
 
-       if (__split_large_page(kpte, address, base))
+       if (__split_large_page(cpa, kpte, address, base))
                __free_page(base);
 
        return 0;
 }
 
+static bool try_to_free_pte_page(pte_t *pte)
+{
+       int i;
+
+       for (i = 0; i < PTRS_PER_PTE; i++)
+               if (!pte_none(pte[i]))
+                       return false;
+
+       free_page((unsigned long)pte);
+       return true;
+}
+
+static bool try_to_free_pmd_page(pmd_t *pmd)
+{
+       int i;
+
+       for (i = 0; i < PTRS_PER_PMD; i++)
+               if (!pmd_none(pmd[i]))
+                       return false;
+
+       free_page((unsigned long)pmd);
+       return true;
+}
+
+static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+{
+       pte_t *pte = pte_offset_kernel(pmd, start);
+
+       while (start < end) {
+               set_pte(pte, __pte(0));
+
+               start += PAGE_SIZE;
+               pte++;
+       }
+
+       if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+               pmd_clear(pmd);
+               return true;
+       }
+       return false;
+}
+
+static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+                             unsigned long start, unsigned long end)
+{
+       if (unmap_pte_range(pmd, start, end))
+               if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+                       pud_clear(pud);
+}
+
+static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+{
+       pmd_t *pmd = pmd_offset(pud, start);
+
+       /*
+        * Not on a 2MB page boundary?
+        */
+       if (start & (PMD_SIZE - 1)) {
+               unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+               unsigned long pre_end = min_t(unsigned long, end, next_page);
+
+               __unmap_pmd_range(pud, pmd, start, pre_end);
+
+               start = pre_end;
+               pmd++;
+       }
+
+       /*
+        * Try to unmap in 2M chunks.
+        */
+       while (end - start >= PMD_SIZE) {
+               if (pmd_large(*pmd))
+                       pmd_clear(pmd);
+               else
+                       __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+
+               start += PMD_SIZE;
+               pmd++;
+       }
+
+       /*
+        * 4K leftovers?
+        */
+       if (start < end)
+               return __unmap_pmd_range(pud, pmd, start, end);
+
+       /*
+        * Try again to free the PMD page if we haven't succeeded above.
+        */
+       if (!pud_none(*pud))
+               if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+                       pud_clear(pud);
+}
+
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+       pud_t *pud = pud_offset(pgd, start);
+
+       /*
+        * Not on a GB page boundary?
+        */
+       if (start & (PUD_SIZE - 1)) {
+               unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+               unsigned long pre_end   = min_t(unsigned long, end, next_page);
+
+               unmap_pmd_range(pud, start, pre_end);
+
+               start = pre_end;
+               pud++;
+       }
+
+       /*
+        * Try to unmap in 1G chunks?
+        */
+       while (end - start >= PUD_SIZE) {
+
+               if (pud_large(*pud))
+                       pud_clear(pud);
+               else
+                       unmap_pmd_range(pud, start, start + PUD_SIZE);
+
+               start += PUD_SIZE;
+               pud++;
+       }
+
+       /*
+        * 2M leftovers?
+        */
+       if (start < end)
+               unmap_pmd_range(pud, start, end);
+
+       /*
+        * No need to try to free the PUD page because we'll free it in
+        * populate_pgd's error path
+        */
+}
+
+static int alloc_pte_page(pmd_t *pmd)
+{
+       pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+       if (!pte)
+               return -1;
+
+       set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+       return 0;
+}
+
+static int alloc_pmd_page(pud_t *pud)
+{
+       pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+       if (!pmd)
+               return -1;
+
+       set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+       return 0;
+}
+
+static void populate_pte(struct cpa_data *cpa,
+                        unsigned long start, unsigned long end,
+                        unsigned num_pages, pmd_t *pmd, pgprot_t pgprot)
+{
+       pte_t *pte;
+
+       pte = pte_offset_kernel(pmd, start);
+
+       while (num_pages-- && start < end) {
+
+               /* deal with the NX bit */
+               if (!(pgprot_val(pgprot) & _PAGE_NX))
+                       cpa->pfn &= ~_PAGE_NX;
+
+               set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
+
+               start    += PAGE_SIZE;
+               cpa->pfn += PAGE_SIZE;
+               pte++;
+       }
+}
+
+static int populate_pmd(struct cpa_data *cpa,
+                       unsigned long start, unsigned long end,
+                       unsigned num_pages, pud_t *pud, pgprot_t pgprot)
+{
+       unsigned int cur_pages = 0;
+       pmd_t *pmd;
+
+       /*
+        * Not on a 2M boundary?
+        */
+       if (start & (PMD_SIZE - 1)) {
+               unsigned long pre_end = start + (num_pages << PAGE_SHIFT);
+               unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
+
+               pre_end   = min_t(unsigned long, pre_end, next_page);
+               cur_pages = (pre_end - start) >> PAGE_SHIFT;
+               cur_pages = min_t(unsigned int, num_pages, cur_pages);
+
+               /*
+                * Need a PTE page?
+                */
+               pmd = pmd_offset(pud, start);
+               if (pmd_none(*pmd))
+                       if (alloc_pte_page(pmd))
+                               return -1;
+
+               populate_pte(cpa, start, pre_end, cur_pages, pmd, pgprot);
+
+               start = pre_end;
+       }
+
+       /*
+        * We mapped them all?
+        */
+       if (num_pages == cur_pages)
+               return cur_pages;
+
+       while (end - start >= PMD_SIZE) {
+
+               /*
+                * We cannot use a 1G page so allocate a PMD page if needed.
+                */
+               if (pud_none(*pud))
+                       if (alloc_pmd_page(pud))
+                               return -1;
+
+               pmd = pmd_offset(pud, start);
+
+               set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
+
+               start     += PMD_SIZE;
+               cpa->pfn  += PMD_SIZE;
+               cur_pages += PMD_SIZE >> PAGE_SHIFT;
+       }
+
+       /*
+        * Map trailing 4K pages.
+        */
+       if (start < end) {
+               pmd = pmd_offset(pud, start);
+               if (pmd_none(*pmd))
+                       if (alloc_pte_page(pmd))
+                               return -1;
+
+               populate_pte(cpa, start, end, num_pages - cur_pages,
+                            pmd, pgprot);
+       }
+       return num_pages;
+}
+
+static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
+                       pgprot_t pgprot)
+{
+       pud_t *pud;
+       unsigned long end;
+       int cur_pages = 0;
+
+       end = start + (cpa->numpages << PAGE_SHIFT);
+
+       /*
+        * Not on a Gb page boundary? => map everything up to it with
+        * smaller pages.
+        */
+       if (start & (PUD_SIZE - 1)) {
+               unsigned long pre_end;
+               unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
+
+               pre_end   = min_t(unsigned long, end, next_page);
+               cur_pages = (pre_end - start) >> PAGE_SHIFT;
+               cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
+
+               pud = pud_offset(pgd, start);
+
+               /*
+                * Need a PMD page?
+                */
+               if (pud_none(*pud))
+                       if (alloc_pmd_page(pud))
+                               return -1;
+
+               cur_pages = populate_pmd(cpa, start, pre_end, cur_pages,
+                                        pud, pgprot);
+               if (cur_pages < 0)
+                       return cur_pages;
+
+               start = pre_end;
+       }
+
+       /* We mapped them all? */
+       if (cpa->numpages == cur_pages)
+               return cur_pages;
+
+       pud = pud_offset(pgd, start);
+
+       /*
+        * Map everything starting from the Gb boundary, possibly with 1G pages
+        */
+       while (end - start >= PUD_SIZE) {
+               set_pud(pud, __pud(cpa->pfn | _PAGE_PSE | massage_pgprot(pgprot)));
+
+               start     += PUD_SIZE;
+               cpa->pfn  += PUD_SIZE;
+               cur_pages += PUD_SIZE >> PAGE_SHIFT;
+               pud++;
+       }
+
+       /* Map trailing leftover */
+       if (start < end) {
+               int tmp;
+
+               pud = pud_offset(pgd, start);
+               if (pud_none(*pud))
+                       if (alloc_pmd_page(pud))
+                               return -1;
+
+               tmp = populate_pmd(cpa, start, end, cpa->numpages - cur_pages,
+                                  pud, pgprot);
+               if (tmp < 0)
+                       return cur_pages;
+
+               cur_pages += tmp;
+       }
+       return cur_pages;
+}
+
+/*
+ * Restrictions for kernel page table do not necessarily apply when mapping in
+ * an alternate PGD.
+ */
+static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
+{
+       pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
+       bool allocd_pgd = false;
+       pgd_t *pgd_entry;
+       pud_t *pud = NULL;      /* shut up gcc */
+       int ret;
+
+       pgd_entry = cpa->pgd + pgd_index(addr);
+
+       /*
+        * Allocate a PUD page and hand it down for mapping.
+        */
+       if (pgd_none(*pgd_entry)) {
+               pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+               if (!pud)
+                       return -1;
+
+               set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
+               allocd_pgd = true;
+       }
+
+       pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
+       pgprot_val(pgprot) |=  pgprot_val(cpa->mask_set);
+
+       ret = populate_pud(cpa, addr, pgd_entry, pgprot);
+       if (ret < 0) {
+               unmap_pud_range(pgd_entry, addr,
+                               addr + (cpa->numpages << PAGE_SHIFT));
+
+               if (allocd_pgd) {
+                       /*
+                        * If I allocated this PUD page, I can just as well
+                        * free it in this error path.
+                        */
+                       pgd_clear(pgd_entry);
+                       free_page((unsigned long)pud);
+               }
+               return ret;
+       }
+       cpa->numpages = ret;
+       return 0;
+}
+
 static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
                               int primary)
 {
+       if (cpa->pgd)
+               return populate_pgd(cpa, vaddr);
+
        /*
         * Ignore all non primary paths.
         */
@@ -697,7 +1090,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
        else
                address = *cpa->vaddr;
 repeat:
-       kpte = lookup_address(address, &level);
+       kpte = _lookup_address_cpa(cpa, address, &level);
        if (!kpte)
                return __cpa_process_fault(cpa, address, primary);
 
@@ -761,7 +1154,7 @@ repeat:
        /*
         * We have to split the large page:
         */
-       err = split_large_page(kpte, address);
+       err = split_large_page(cpa, kpte, address);
        if (!err) {
                /*
                 * Do a global flush tlb after splitting the large page
@@ -910,6 +1303,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
        int ret, cache, checkalias;
        unsigned long baddr = 0;
 
+       memset(&cpa, 0, sizeof(cpa));
+
        /*
         * Check, if we are requested to change a not supported
         * feature:
@@ -1356,6 +1751,7 @@ static int __set_pages_p(struct page *page, int numpages)
 {
        unsigned long tempaddr = (unsigned long) page_address(page);
        struct cpa_data cpa = { .vaddr = &tempaddr,
+                               .pgd = NULL,
                                .numpages = numpages,
                                .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
                                .mask_clr = __pgprot(0),
@@ -1374,6 +1770,7 @@ static int __set_pages_np(struct page *page, int numpages)
 {
        unsigned long tempaddr = (unsigned long) page_address(page);
        struct cpa_data cpa = { .vaddr = &tempaddr,
+                               .pgd = NULL,
                                .numpages = numpages,
                                .mask_set = __pgprot(0),
                                .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
@@ -1434,6 +1831,36 @@ bool kernel_page_present(struct page *page)
 
 #endif /* CONFIG_DEBUG_PAGEALLOC */
 
+int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
+                           unsigned numpages, unsigned long page_flags)
+{
+       int retval = -EINVAL;
+
+       struct cpa_data cpa = {
+               .vaddr = &address,
+               .pfn = pfn,
+               .pgd = pgd,
+               .numpages = numpages,
+               .mask_set = __pgprot(0),
+               .mask_clr = __pgprot(0),
+               .flags = 0,
+       };
+
+       if (!(__supported_pte_mask & _PAGE_NX))
+               goto out;
+
+       if (!(page_flags & _PAGE_NX))
+               cpa.mask_clr = __pgprot(_PAGE_NX);
+
+       cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
+
+       retval = __change_page_attr_set_clr(&cpa, 0);
+       __flush_tlb_all();
+
+out:
+       return retval;
+}
+
 /*
  * The testcases use internal knowledge of the implementation that shouldn't
  * be exposed to the rest of the kernel. Include these directly here.
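
The populate_*() and unmap_*() helpers added above all follow the same head/body/tail
pattern: cover a misaligned head with the next-smaller page size up to the 2M (or 1G)
boundary, cover the aligned body with large pages, then handle the leftover tail. A
small user-space model of the 2M-level arithmetic; the start address and page count
are made-up example values, not anything from the patch:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PMD_SIZE   0x200000ULL
#define PMD_MASK   (~(PMD_SIZE - 1))

int main(void)
{
        uint64_t start = 0xffffffef00003000ULL;              /* 2M-misaligned start */
        uint64_t end   = start + (0x2345ULL << PAGE_SHIFT);  /* 0x2345 4K pages */
        uint64_t head4k = 0, body2m, tail4k;

        if (start & (PMD_SIZE - 1)) {                        /* head up to the 2M line */
                uint64_t next_page = (start + PMD_SIZE) & PMD_MASK;
                uint64_t pre_end   = next_page < end ? next_page : end;

                head4k = (pre_end - start) >> PAGE_SHIFT;
                start  = pre_end;
        }

        body2m = (end - start) / PMD_SIZE;                   /* whole 2M chunks */
        start += body2m * PMD_SIZE;

        tail4k = (end - start) >> PAGE_SHIFT;                /* leftover 4K pages */

        printf("head: %llu x 4K, body: %llu x 2M, tail: %llu x 4K\n",
               (unsigned long long)head4k, (unsigned long long)body2m,
               (unsigned long long)tail4k);
        return 0;
}
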
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 92c02344a060f2dacc7997185d6fd6bc04ed225e..f8ec4dafc74e5e94011c3f092f17f4c1b3f40109 100644
@@ -12,6 +12,8 @@
  *     Bibo Mao <bibo.mao@intel.com>
  *     Chandramouli Narayanan <mouli@linux.intel.com>
  *     Huang Ying <ying.huang@intel.com>
+ * Copyright (C) 2013 SuSE Labs
+ *     Borislav Petkov <bp@suse.de> - runtime services VA mapping
  *
  * Copied from efi_32.c to eliminate the duplicated code between EFI
  * 32/64 support code. --ying 2007-10-26
@@ -51,7 +53,7 @@
 #include <asm/x86_init.h>
 #include <asm/rtc.h>
 
-#define EFI_DEBUG      1
+#define EFI_DEBUG
 
 #define EFI_MIN_RESERVE 5120
 
@@ -398,9 +400,9 @@ int __init efi_memblock_x86_reserve_range(void)
        return 0;
 }
 
-#if EFI_DEBUG
 static void __init print_efi_memmap(void)
 {
+#ifdef EFI_DEBUG
        efi_memory_desc_t *md;
        void *p;
        int i;
@@ -415,8 +417,8 @@ static void __init print_efi_memmap(void)
                        md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
                        (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
        }
-}
 #endif  /*  EFI_DEBUG  */
+}
 
 void __init efi_reserve_boot_services(void)
 {
@@ -696,10 +698,7 @@ void __init efi_init(void)
                x86_platform.set_wallclock = efi_set_rtc_mmss;
        }
 #endif
-
-#if EFI_DEBUG
        print_efi_memmap();
-#endif
 }
 
 void __init efi_late_init(void)
@@ -748,21 +747,56 @@ void efi_memory_uc(u64 addr, unsigned long size)
        set_memory_uc(addr, npages);
 }
 
+void __init old_map_region(efi_memory_desc_t *md)
+{
+       u64 start_pfn, end_pfn, end;
+       unsigned long size;
+       void *va;
+
+       start_pfn = PFN_DOWN(md->phys_addr);
+       size      = md->num_pages << PAGE_SHIFT;
+       end       = md->phys_addr + size;
+       end_pfn   = PFN_UP(end);
+
+       if (pfn_range_is_mapped(start_pfn, end_pfn)) {
+               va = __va(md->phys_addr);
+
+               if (!(md->attribute & EFI_MEMORY_WB))
+                       efi_memory_uc((u64)(unsigned long)va, size);
+       } else
+               va = efi_ioremap(md->phys_addr, size,
+                                md->type, md->attribute);
+
+       md->virt_addr = (u64) (unsigned long) va;
+       if (!va)
+               pr_err("ioremap of 0x%llX failed!\n",
+                      (unsigned long long)md->phys_addr);
+}
+
 /*
  * This function will switch the EFI runtime services to virtual mode.
- * Essentially, look through the EFI memmap and map every region that
- * has the runtime attribute bit set in its memory descriptor and update
- * that memory descriptor with the virtual address obtained from ioremap().
- * This enables the runtime services to be called without having to
+ * Essentially, we look through the EFI memmap and map every region that
+ * has the runtime attribute bit set in its memory descriptor into the
+ * ->trampoline_pgd page table using a top-down VA allocation scheme.
+ *
+ * The old method which used to update that memory descriptor with the
+ * virtual address obtained from ioremap() is still supported when the
+ * kernel is booted with efi=old_map on its command line. That same
+ * method enables the runtime services to be called without having to
  * thunk back into physical mode for every invocation.
+ *
+ * The new method does a pagetable switch in a preemption-safe manner
+ * so that we're in a different address space when calling a runtime
+ * function. For function arguments passing we do copy the PGDs of the
+ * kernel page table into ->trampoline_pgd prior to each call.
  */
 void __init efi_enter_virtual_mode(void)
 {
        efi_memory_desc_t *md, *prev_md = NULL;
-       efi_status_t status;
+       void *p, *new_memmap = NULL;
        unsigned long size;
-       u64 end, systab, start_pfn, end_pfn;
-       void *p, *va, *new_memmap = NULL;
+       efi_status_t status;
+       u64 end, systab;
        int count = 0;
 
        efi.systab = NULL;
@@ -771,7 +805,6 @@ void __init efi_enter_virtual_mode(void)
         * We don't do virtual mode, since we don't do runtime services, on
         * non-native EFI
         */
-
        if (!efi_is_native()) {
                efi_unmap_memmap();
                return;
@@ -802,6 +835,7 @@ void __init efi_enter_virtual_mode(void)
                        continue;
                }
                prev_md = md;
+
        }
 
        for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
@@ -814,36 +848,24 @@ void __init efi_enter_virtual_mode(void)
                                continue;
                }
 
+               efi_map_region(md);
+
                size = md->num_pages << EFI_PAGE_SHIFT;
                end = md->phys_addr + size;
 
-               start_pfn = PFN_DOWN(md->phys_addr);
-               end_pfn = PFN_UP(end);
-               if (pfn_range_is_mapped(start_pfn, end_pfn)) {
-                       va = __va(md->phys_addr);
-
-                       if (!(md->attribute & EFI_MEMORY_WB))
-                               efi_memory_uc((u64)(unsigned long)va, size);
-               } else
-                       va = efi_ioremap(md->phys_addr, size,
-                                        md->type, md->attribute);
-
-               md->virt_addr = (u64) (unsigned long) va;
-
-               if (!va) {
-                       pr_err("ioremap of 0x%llX failed!\n",
-                              (unsigned long long)md->phys_addr);
-                       continue;
-               }
-
                systab = (u64) (unsigned long) efi_phys.systab;
                if (md->phys_addr <= systab && systab < end) {
                        systab += md->virt_addr - md->phys_addr;
+
                        efi.systab = (efi_system_table_t *) (unsigned long) systab;
                }
+
                new_memmap = krealloc(new_memmap,
                                      (count + 1) * memmap.desc_size,
                                      GFP_KERNEL);
+               if (!new_memmap)
+                       goto err_out;
+
                memcpy(new_memmap + (count * memmap.desc_size), md,
                       memmap.desc_size);
                count++;
@@ -851,6 +873,9 @@ void __init efi_enter_virtual_mode(void)
 
        BUG_ON(!efi.systab);
 
+       efi_setup_page_tables();
+       efi_sync_low_kernel_mappings();
+
        status = phys_efi_set_virtual_address_map(
                memmap.desc_size * count,
                memmap.desc_size,
@@ -883,7 +908,8 @@ void __init efi_enter_virtual_mode(void)
        efi.query_variable_info = virt_efi_query_variable_info;
        efi.update_capsule = virt_efi_update_capsule;
        efi.query_capsule_caps = virt_efi_query_capsule_caps;
-       if (__supported_pte_mask & _PAGE_NX)
+
+       if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX))
                runtime_code_page_mkexec();
 
        kfree(new_memmap);
@@ -894,6 +920,11 @@ void __init efi_enter_virtual_mode(void)
                         EFI_VARIABLE_BOOTSERVICE_ACCESS |
                         EFI_VARIABLE_RUNTIME_ACCESS,
                         0, NULL);
+
+       return;
+
+ err_out:
+       pr_err("Error reallocating memory, EFI runtime non-functional!\n");
 }
 
 /*
@@ -1013,3 +1044,15 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size)
        return EFI_SUCCESS;
 }
 EXPORT_SYMBOL_GPL(efi_query_variable_store);
+
+static int __init parse_efi_cmdline(char *str)
+{
+       if (*str == '=')
+               str++;
+
+       if (!strncmp(str, "old_map", 7))
+               set_bit(EFI_OLD_MEMMAP, &x86_efi_facility);
+
+       return 0;
+}
+early_param("efi", parse_efi_cmdline);
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 40e446941dd7eceb587b2b4572c96279631e5f56..e94557cf54878f1c8111a9e800a484d3880192d3 100644
  * claim EFI runtime service handler exclusively and to duplicate a memory in
  * low memory space say 0 - 3G.
  */
-
 static unsigned long efi_rt_eflags;
 
+void efi_sync_low_kernel_mappings(void) {}
+void efi_setup_page_tables(void) {}
+
+void __init efi_map_region(efi_memory_desc_t *md)
+{
+       old_map_region(md);
+}
+
 void efi_call_phys_prelog(void)
 {
        struct desc_ptr gdt_descr;
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 39a0e7f1f0a3ec1c7601888d30c724a5b8660e19..bf286c386d330272fdb2e4d7cef99ddf108d489c 100644
 #include <asm/efi.h>
 #include <asm/cacheflush.h>
 #include <asm/fixmap.h>
+#include <asm/realmode.h>
 
 static pgd_t *save_pgd __initdata;
 static unsigned long efi_flags __initdata;
 
+/*
+ * We allocate runtime services regions bottom-up, starting from -4G, i.e.
+ * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
+ */
+static u64 efi_va      = -4 * (1UL << 30);
+#define EFI_VA_END     (-68 * (1UL << 30))
+
+/*
+ * Scratch space used for switching the pagetable in the EFI stub
+ */
+struct efi_scratch {
+       u64 r15;
+       u64 prev_cr3;
+       pgd_t *efi_pgt;
+       bool use_pgd;
+};
+
 static void __init early_code_mapping_set_exec(int executable)
 {
        efi_memory_desc_t *md;
@@ -65,6 +83,9 @@ void __init efi_call_phys_prelog(void)
        int pgd;
        int n_pgds;
 
+       if (!efi_enabled(EFI_OLD_MEMMAP))
+               return;
+
        early_code_mapping_set_exec(1);
        local_irq_save(efi_flags);
 
@@ -86,6 +107,10 @@ void __init efi_call_phys_epilog(void)
         */
        int pgd;
        int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
+
+       if (!efi_enabled(EFI_OLD_MEMMAP))
+               return;
+
        for (pgd = 0; pgd < n_pgds; pgd++)
                set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
        kfree(save_pgd);
@@ -94,6 +119,90 @@ void __init efi_call_phys_epilog(void)
        early_code_mapping_set_exec(0);
 }
 
+/*
+ * Add low kernel mappings for passing arguments to EFI functions.
+ */
+void efi_sync_low_kernel_mappings(void)
+{
+       unsigned num_pgds;
+       pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
+
+       if (efi_enabled(EFI_OLD_MEMMAP))
+               return;
+
+       num_pgds = pgd_index(MODULES_END - 1) - pgd_index(PAGE_OFFSET);
+
+       memcpy(pgd + pgd_index(PAGE_OFFSET),
+               init_mm.pgd + pgd_index(PAGE_OFFSET),
+               sizeof(pgd_t) * num_pgds);
+}
+
+void efi_setup_page_tables(void)
+{
+       efi_scratch.efi_pgt = (pgd_t *)(unsigned long)real_mode_header->trampoline_pgd;
+
+       if (!efi_enabled(EFI_OLD_MEMMAP))
+               efi_scratch.use_pgd = true;
+}
+
+static void __init __map_region(efi_memory_desc_t *md, u64 va)
+{
+       pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd);
+       unsigned long pf = 0, size;
+       u64 end;
+
+       if (!(md->attribute & EFI_MEMORY_WB))
+               pf |= _PAGE_PCD;
+
+       size = md->num_pages << PAGE_SHIFT;
+       end  = va + size;
+
+       if (kernel_map_pages_in_pgd(pgd, md->phys_addr, va, md->num_pages, pf))
+               pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
+                          md->phys_addr, va);
+}
+
+void __init efi_map_region(efi_memory_desc_t *md)
+{
+       unsigned long size = md->num_pages << PAGE_SHIFT;
+       u64 pa = md->phys_addr;
+
+       if (efi_enabled(EFI_OLD_MEMMAP))
+               return old_map_region(md);
+
+       /*
+        * Make sure the 1:1 mappings are present as a catch-all for b0rked
+        * firmware which doesn't update all internal pointers after switching
+        * to virtual mode and would otherwise crap on us.
+        */
+       __map_region(md, md->phys_addr);
+
+       efi_va -= size;
+
+       /* Is PA 2M-aligned? */
+       if (!(pa & (PMD_SIZE - 1))) {
+               efi_va &= PMD_MASK;
+       } else {
+               u64 pa_offset = pa & (PMD_SIZE - 1);
+               u64 prev_va = efi_va;
+
+               /* get us the same offset within this 2M page */
+               efi_va = (efi_va & PMD_MASK) + pa_offset;
+
+               if (efi_va > prev_va)
+                       efi_va -= PMD_SIZE;
+       }
+
+       if (efi_va < EFI_VA_END) {
+               pr_warn(FW_WARN "VA address range overflow!\n");
+               return;
+       }
+
+       /* Do the VA map */
+       __map_region(md, efi_va);
+       md->virt_addr = efi_va;
+}
+
 void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
                                 u32 type, u64 attribute)
 {
diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 4c07ccab8146f0bbed2850f586a68f126b2bc7eb..88073b1402988b49eb6c5e95f9f801a14d40f144 100644
        mov %rsi, %cr0;                 \
        mov (%rsp), %rsp
 
+       /* stolen from gcc */
+       .macro FLUSH_TLB_ALL
+       movq %r15, efi_scratch(%rip)
+       movq %r14, efi_scratch+8(%rip)
+       movq %cr4, %r15
+       movq %r15, %r14
+       andb $0x7f, %r14b
+       movq %r14, %cr4
+       movq %r15, %cr4
+       movq efi_scratch+8(%rip), %r14
+       movq efi_scratch(%rip), %r15
+       .endm
+
+       .macro SWITCH_PGT
+       cmpb $0, efi_scratch+24(%rip)
+       je 1f
+       movq %r15, efi_scratch(%rip)            # r15
+       # save previous CR3
+       movq %cr3, %r15
+       movq %r15, efi_scratch+8(%rip)          # prev_cr3
+       movq efi_scratch+16(%rip), %r15         # EFI pgt
+       movq %r15, %cr3
+       1:
+       .endm
+
+       .macro RESTORE_PGT
+       cmpb $0, efi_scratch+24(%rip)
+       je 2f
+       movq efi_scratch+8(%rip), %r15
+       movq %r15, %cr3
+       movq efi_scratch(%rip), %r15
+       FLUSH_TLB_ALL
+       2:
+       .endm
+
 ENTRY(efi_call0)
        SAVE_XMM
        subq $32, %rsp
+       SWITCH_PGT
        call *%rdi
+       RESTORE_PGT
        addq $32, %rsp
        RESTORE_XMM
        ret
@@ -47,7 +84,9 @@ ENTRY(efi_call1)
        SAVE_XMM
        subq $32, %rsp
        mov  %rsi, %rcx
+       SWITCH_PGT
        call *%rdi
+       RESTORE_PGT
        addq $32, %rsp
        RESTORE_XMM
        ret
@@ -57,7 +96,9 @@ ENTRY(efi_call2)
        SAVE_XMM
        subq $32, %rsp
        mov  %rsi, %rcx
+       SWITCH_PGT
        call *%rdi
+       RESTORE_PGT
        addq $32, %rsp
        RESTORE_XMM
        ret
@@ -68,7 +109,9 @@ ENTRY(efi_call3)
        subq $32, %rsp
        mov  %rcx, %r8
        mov  %rsi, %rcx
+       SWITCH_PGT
        call *%rdi
+       RESTORE_PGT
        addq $32, %rsp
        RESTORE_XMM
        ret
@@ -80,7 +123,9 @@ ENTRY(efi_call4)
        mov %r8, %r9
        mov %rcx, %r8
        mov %rsi, %rcx
+       SWITCH_PGT
        call *%rdi
+       RESTORE_PGT
        addq $32, %rsp
        RESTORE_XMM
        ret
@@ -93,7 +138,9 @@ ENTRY(efi_call5)
        mov %r8, %r9
        mov %rcx, %r8
        mov %rsi, %rcx
+       SWITCH_PGT
        call *%rdi
+       RESTORE_PGT
        addq $48, %rsp
        RESTORE_XMM
        ret
@@ -109,8 +156,15 @@ ENTRY(efi_call6)
        mov %r8, %r9
        mov %rcx, %r8
        mov %rsi, %rcx
+       SWITCH_PGT
        call *%rdi
+       RESTORE_PGT
        addq $48, %rsp
        RESTORE_XMM
        ret
 ENDPROC(efi_call6)
+
+       .data
+ENTRY(efi_scratch)
+       .fill 3,8,0
+       .byte 0
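
The hard-coded efi_scratch+8/+16/+24 offsets in SWITCH_PGT/RESTORE_PGT above correspond
to the struct efi_scratch fields declared in efi_64.c (r15, prev_cr3, efi_pgt, use_pgd),
and the .fill 3,8,0 / .byte 0 reservation matches its size. A stand-alone C11 sketch
that checks those offsets, assuming an LP64 target where pointers are 8 bytes (this is
a local model of the struct, not the kernel declaration):

#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <assert.h>

struct efi_scratch {
        uint64_t r15;        /* efi_scratch+0:  saved %r15           */
        uint64_t prev_cr3;   /* efi_scratch+8:  previous %cr3        */
        void *efi_pgt;       /* efi_scratch+16: EFI page table (pgd) */
        bool use_pgd;        /* efi_scratch+24: SWITCH_PGT flag byte */
};

int main(void)
{
        static_assert(offsetof(struct efi_scratch, prev_cr3) == 8,  "prev_cr3 offset");
        static_assert(offsetof(struct efi_scratch, efi_pgt)  == 16, "efi_pgt offset");
        static_assert(offsetof(struct efi_scratch, use_pgd)  == 24, "use_pgd offset");
        return 0;
}
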
diff --git a/include/linux/efi.h b/include/linux/efi.h
index bc5687d0f3157c9d8d39342d4e69db1734baff16..6c0ca528300c1604babea039ef4e809f4f04b74b 100644
@@ -653,6 +653,7 @@ extern int __init efi_setup_pcdp_console(char *);
 #define EFI_RUNTIME_SERVICES   3       /* Can we use runtime services? */
 #define EFI_MEMMAP             4       /* Can we use EFI memory map? */
 #define EFI_64BIT              5       /* Is the firmware 64-bit? */
+#define EFI_ARCH_1             6       /* First arch-specific bit */
 
 #ifdef CONFIG_EFI
 # ifdef CONFIG_X86
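
EFI_ARCH_1 gives each architecture one private facility bit: x86 aliases it to
EFI_OLD_MEMMAP (see the asm/efi.h hunk above), sets it from efi=old_map in
parse_efi_cmdline(), and tests it with efi_enabled(). A user-space model of that bit
handling; x86_efi_facility and efi_enabled() here are simplified stand-ins for the
kernel's set_bit()/test_bit() machinery:

#include <stdio.h>

#define EFI_RUNTIME_SERVICES   3
#define EFI_MEMMAP             4
#define EFI_64BIT              5
#define EFI_ARCH_1             6       /* first arch-specific bit */
#define EFI_OLD_MEMMAP         EFI_ARCH_1

static unsigned long x86_efi_facility;

static int efi_enabled(int feature)
{
        return (x86_efi_facility >> feature) & 1;
}

int main(void)
{
        /* what parse_efi_cmdline() does when it sees "efi=old_map" */
        x86_efi_facility |= 1UL << EFI_OLD_MEMMAP;

        printf("old_map fallback active: %d\n", efi_enabled(EFI_OLD_MEMMAP));
        return 0;
}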