dup_mnt_ns(): get rid of pointless grabbing of vfsmount_lock

[~andy/linux] / mm / hugetlb.c
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index dacf0d2256d9790c669b88867a5c277b7a423991..b49579c7f2a550462c334f97907f2fc131a65e00 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -21,6 +21,7 @@
  #include <linux/rmap.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
+#include <linux/page-isolation.h>
  
  #include <asm/page.h>
  #include <asm/pgtable.h>
@@ -33,7 +34,6 @@
  #include "internal.h"
  
  const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
  unsigned long hugepages_treat_as_movable;
  
  int hugetlb_max_hstate __read_mostly;
@@ -48,7 +48,8 @@ static unsigned long __initdata default_hstate_max_huge_pages;
  static unsigned long __initdata default_hstate_size;
  
  /*
- * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
+ * free_huge_pages, and surplus_huge_pages.
   */
  DEFINE_SPINLOCK(hugetlb_lock);
  
@@ -443,10 +444,23 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
  }
  
  /* Returns true if the VMA has associated reserve pages */
-static int vma_has_reserves(struct vm_area_struct *vma)
+static int vma_has_reserves(struct vm_area_struct *vma, long chg)
  {
-       if (vma->vm_flags & VM_NORESERVE)
-               return 0;
+       if (vma->vm_flags & VM_NORESERVE) {
+               /*
+                * This address is already reserved by another process
+                * (chg == 0), so we should decrement the reserved count.
+                * Otherwise the reserve count would remain after the inode is
+                * released, because the allocated page goes into the page
+                * cache and is treated as coming from the reserved pool at
+                * release time.  We currently have no better way to handle
+                * this situation, so work around it here.
+                */
+               if (vma->vm_flags & VM_MAYSHARE && chg == 0)
+                       return 1;
+               else
+                       return 0;
+       }
  
         /* Shared mappings always use reserves */
         if (vma->vm_flags & VM_MAYSHARE)
@@ -508,9 +522,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
  {
         struct page *page;
  
-       if (list_empty(&h->hugepage_freelists[nid]))
+       list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
+               if (!is_migrate_isolate_page(page))
+                       break;
+       /*
+        * If no non-isolated free hugepage is found on the list,
+        * the allocation fails.
+        */
+       if (&h->hugepage_freelists[nid] == &page->lru)
                 return NULL;
-       page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
         list_move(&page->lru, &h->hugepage_activelist);
         set_page_refcounted(page);
         h->free_huge_pages--;
@@ -518,9 +538,19 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
         return page;
  }
  
+/* Movability of hugepages depends on migration support. */
+static inline gfp_t htlb_alloc_mask(struct hstate *h)
+{
+       if (hugepages_treat_as_movable || hugepage_migration_support(h))
+               return GFP_HIGHUSER_MOVABLE;
+       else
+               return GFP_HIGHUSER;
+}
+
  static struct page *dequeue_huge_page_vma(struct hstate *h,
                                 struct vm_area_struct *vma,
-                               unsigned long address, int avoid_reserve)
+                               unsigned long address, int avoid_reserve,
+                               long chg)
  {
         struct page *page = NULL;
         struct mempolicy *mpol;
@@ -535,7 +565,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
          * have no page reserves. This check ensures that reservations are
          * not "stolen". The child may still get SIGKILLed
          */
-       if (!vma_has_reserves(vma) &&
+       if (!vma_has_reserves(vma, chg) &&
                         h->free_huge_pages - h->resv_huge_pages == 0)
                 goto err;
  
@@ -546,15 +576,20 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
  retry_cpuset:
         cpuset_mems_cookie = get_mems_allowed();
         zonelist = huge_zonelist(vma, address,
-                                       htlb_alloc_mask, &mpol, &nodemask);
+                                       htlb_alloc_mask(h), &mpol, &nodemask);
  
         for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                                 MAX_NR_ZONES - 1, nodemask) {
-               if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
+               if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) {
                         page = dequeue_huge_page_node(h, zone_to_nid(zone));
                         if (page) {
-                               if (!avoid_reserve && vma_has_reserves(vma))
-                                       h->resv_huge_pages--;
+                               if (avoid_reserve)
+                                       break;
+                               if (!vma_has_reserves(vma, chg))
+                                       break;
+
+                               SetPagePrivate(page);
+                               h->resv_huge_pages--;
                                 break;
                         }
                 }
@@ -611,15 +646,20 @@ static void free_huge_page(struct page *page)
         int nid = page_to_nid(page);
         struct hugepage_subpool *spool =
                 (struct hugepage_subpool *)page_private(page);
+       bool restore_reserve;
  
         set_page_private(page, 0);
         page->mapping = NULL;
         BUG_ON(page_count(page));
         BUG_ON(page_mapcount(page));
+       restore_reserve = PagePrivate(page);
  
         spin_lock(&hugetlb_lock);
         hugetlb_cgroup_uncharge_page(hstate_index(h),
                                      pages_per_huge_page(h), page);
+       if (restore_reserve)
+               h->resv_huge_pages++;
+
         if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
                 /* remove the page from active list */
                 list_del(&page->lru);
@@ -706,7 +746,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
                 return NULL;
  
         page = alloc_pages_exact_node(nid,
-               htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+               htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
                                                 __GFP_REPEAT|__GFP_NOWARN,
                 huge_page_order(h));
         if (page) {
@@ -853,6 +893,44 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
         return ret;
  }
  
+/*
+ * Dissolve a given free hugepage into free buddy pages. This function does
+ * nothing for in-use (including surplus) hugepages.
+ */
+static void dissolve_free_huge_page(struct page *page)
+{
+       spin_lock(&hugetlb_lock);
+       if (PageHuge(page) && !page_count(page)) {
+               struct hstate *h = page_hstate(page);
+               int nid = page_to_nid(page);
+               list_del(&page->lru);
+               h->free_huge_pages--;
+               h->free_huge_pages_node[nid]--;
+               update_and_free_page(h, page);
+       }
+       spin_unlock(&hugetlb_lock);
+}
+
+/*
+ * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
+ * make specified memory blocks removable from the system.
+ * Note that start_pfn should be aligned with the (minimum) hugepage size.
+ */
+void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+{
+       unsigned int order = 8 * sizeof(void *);
+       unsigned long pfn;
+       struct hstate *h;
+
+       /* Set scan step to minimum hugepage size */
+       for_each_hstate(h)
+               if (order > huge_page_order(h))
+                       order = huge_page_order(h);
+       VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
+       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
+               dissolve_free_huge_page(pfn_to_page(pfn));
+}
+
  static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
  {
         struct page *page;
@@ -895,12 +973,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
         spin_unlock(&hugetlb_lock);
  
         if (nid == NUMA_NO_NODE)
-               page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
+               page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
                                    __GFP_REPEAT|__GFP_NOWARN,
                                    huge_page_order(h));
         else
                 page = alloc_pages_exact_node(nid,
-                       htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
+                       htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
                         __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
  
         if (page && arch_prepare_hugepage(page)) {
@@ -937,10 +1015,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
   */
  struct page *alloc_huge_page_node(struct hstate *h, int nid)
  {
-       struct page *page;
+       struct page *page = NULL;
  
         spin_lock(&hugetlb_lock);
-       page = dequeue_huge_page_node(h, nid);
+       if (h->free_huge_pages - h->resv_huge_pages > 0)
+               page = dequeue_huge_page_node(h, nid);
         spin_unlock(&hugetlb_lock);
  
         if (!page)
@@ -1096,9 +1175,9 @@ static long vma_needs_reservation(struct hstate *h,
         } else  {
                 long err;
                 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
-               struct resv_map *reservations = vma_resv_map(vma);
+               struct resv_map *resv = vma_resv_map(vma);
  
-               err = region_chg(&reservations->regions, idx, idx + 1);
+               err = region_chg(&resv->regions, idx, idx + 1);
                 if (err < 0)
                         return err;
                 return 0;
@@ -1116,10 +1195,10 @@ static void vma_commit_reservation(struct hstate *h,
  
         } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
                 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
-               struct resv_map *reservations = vma_resv_map(vma);
+               struct resv_map *resv = vma_resv_map(vma);
  
                 /* Mark this page used in the map. */
-               region_add(&reservations->regions, idx, idx + 1);
+               region_add(&resv->regions, idx, idx + 1);
         }
  }
  
@@ -1145,17 +1224,18 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         chg = vma_needs_reservation(h, vma, addr);
         if (chg < 0)
                 return ERR_PTR(-ENOMEM);
-       if (chg)
-               if (hugepage_subpool_get_pages(spool, chg))
+       if (chg || avoid_reserve)
+               if (hugepage_subpool_get_pages(spool, 1))
                         return ERR_PTR(-ENOSPC);
  
         ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
         if (ret) {
-               hugepage_subpool_put_pages(spool, chg);
+               if (chg || avoid_reserve)
+                       hugepage_subpool_put_pages(spool, 1);
                 return ERR_PTR(-ENOSPC);
         }
         spin_lock(&hugetlb_lock);
-       page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
+       page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
         if (!page) {
                 spin_unlock(&hugetlb_lock);
                 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
@@ -1163,7 +1243,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
                         hugetlb_cgroup_uncharge_cgroup(idx,
                                                        pages_per_huge_page(h),
                                                        h_cg);
-                       hugepage_subpool_put_pages(spool, chg);
+                       if (chg || avoid_reserve)
+                               hugepage_subpool_put_pages(spool, 1);
                         return ERR_PTR(-ENOSPC);
                 }
                 spin_lock(&hugetlb_lock);
@@ -1179,6 +1260,20 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         return page;
  }
  
+/*
+ * Wrapper around alloc_huge_page() that returns the page if the allocation
+ * succeeds and NULL otherwise. It is called from new_vma_page(), which does
+ * not expect an ERR_PTR() value to be returned.
+ */
+struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
+                               unsigned long addr, int avoid_reserve)
+{
+       struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
+       if (IS_ERR(page))
+               page = NULL;
+       return page;
+}
+
  int __weak alloc_bootmem_huge_page(struct hstate *h)
  {
         struct huge_bootmem_page *m;
@@ -2030,18 +2125,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
  }
  #endif /* CONFIG_NUMA */
  
-int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
-                       void __user *buffer,
-                       size_t *length, loff_t *ppos)
-{
-       proc_dointvec(table, write, buffer, length, ppos);
-       if (hugepages_treat_as_movable)
-               htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
-       else
-               htlb_alloc_mask = GFP_HIGHUSER;
-       return 0;
-}
-
  int hugetlb_overcommit_handler(struct ctl_table *table, int write,
                         void __user *buffer,
                         size_t *length, loff_t *ppos)
@@ -2169,7 +2252,7 @@ out:
  
  static void hugetlb_vm_op_open(struct vm_area_struct *vma)
  {
-       struct resv_map *reservations = vma_resv_map(vma);
+       struct resv_map *resv = vma_resv_map(vma);
  
         /*
          * This new VMA should share its siblings reservation map if present.
@@ -2179,34 +2262,34 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
          * after this open call completes.  It is therefore safe to take a
          * new reference here without additional locking.
          */
-       if (reservations)
-               kref_get(&reservations->refs);
+       if (resv)
+               kref_get(&resv->refs);
  }
  
  static void resv_map_put(struct vm_area_struct *vma)
  {
-       struct resv_map *reservations = vma_resv_map(vma);
+       struct resv_map *resv = vma_resv_map(vma);
  
-       if (!reservations)
+       if (!resv)
                 return;
-       kref_put(&reservations->refs, resv_map_release);
+       kref_put(&resv->refs, resv_map_release);
  }
  
  static void hugetlb_vm_op_close(struct vm_area_struct *vma)
  {
         struct hstate *h = hstate_vma(vma);
-       struct resv_map *reservations = vma_resv_map(vma);
+       struct resv_map *resv = vma_resv_map(vma);
         struct hugepage_subpool *spool = subpool_vma(vma);
         unsigned long reserve;
         unsigned long start;
         unsigned long end;
  
-       if (reservations) {
+       if (resv) {
                 start = vma_hugecache_offset(h, vma, vma->vm_start);
                 end = vma_hugecache_offset(h, vma, vma->vm_end);
  
                 reserve = (end - start) -
-                       region_count(&reservations->regions, start, end);
+                       region_count(&resv->regions, start, end);
  
                 resv_map_put(vma);
  
@@ -2543,8 +2626,7 @@ retry_avoidcopy:
          * at the time of fork() could consume its reserves on COW instead
          * of the full address range.
          */
-       if (!(vma->vm_flags & VM_MAYSHARE) &&
-                       is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+       if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
                         old_page != pagecache_page)
                 outside_reserve = 1;
  
@@ -2616,6 +2698,8 @@ retry_avoidcopy:
         spin_lock(&mm->page_table_lock);
         ptep = huge_pte_offset(mm, address & huge_page_mask(h));
         if (likely(pte_same(huge_ptep_get(ptep), pte))) {
+               ClearPagePrivate(new_page);
+
                 /* Break COW */
                 huge_ptep_clear_flush(vma, address, ptep);
                 set_huge_pte_at(mm, address, ptep,
@@ -2627,10 +2711,11 @@ retry_avoidcopy:
         }
         spin_unlock(&mm->page_table_lock);
         mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-       /* Caller expects lock to be held */
-       spin_lock(&mm->page_table_lock);
         page_cache_release(new_page);
         page_cache_release(old_page);
+
+       /* Caller expects lock to be held */
+       spin_lock(&mm->page_table_lock);
         return 0;
  }
  
@@ -2726,6 +2811,7 @@ retry:
                                         goto retry;
                                 goto out;
                         }
+                       ClearPagePrivate(page);
  
                         spin_lock(&inode->i_lock);
                         inode->i_blocks += blocks_per_huge_page(h);
@@ -2772,8 +2858,10 @@ retry:
         if (!huge_pte_none(huge_ptep_get(ptep)))
                 goto backout;
  
-       if (anon_rmap)
+       if (anon_rmap) {
+               ClearPagePrivate(page);
                 hugepage_add_new_anon_rmap(page, vma, address);
+       }
         else
                 page_dup_rmap(page);
         new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
@@ -3390,3 +3478,45 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
         return ret;
  }
  #endif
+
+bool isolate_huge_page(struct page *page, struct list_head *list)
+{
+       VM_BUG_ON(!PageHead(page));
+       if (!get_page_unless_zero(page))
+               return false;
+       spin_lock(&hugetlb_lock);
+       list_move_tail(&page->lru, list);
+       spin_unlock(&hugetlb_lock);
+       return true;
+}
+
+void putback_active_hugepage(struct page *page)
+{
+       VM_BUG_ON(!PageHead(page));
+       spin_lock(&hugetlb_lock);
+       list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
+       spin_unlock(&hugetlb_lock);
+       put_page(page);
+}
+
+bool is_hugepage_active(struct page *page)
+{
+       VM_BUG_ON(!PageHuge(page));
+       /*
+        * This function can be called for a tail page because the caller,
+        * scan_movable_pages, scans through a given pfn-range which typically
+        * covers one memory block. In systems using gigantic hugepages (1GB
+        * for x86_64), a hugepage is larger than a memory block, and we don't
+        * support migrating such large hugepages for now, so return false
+        * when called for tail pages.
+        */
+       if (PageTail(page))
+               return false;
+       /*
+        * The refcount of a hwpoisoned hugepage is 1, but such a page is not
+        * active, so we should return false for it.
+        */
+       if (unlikely(PageHWPoison(page)))
+               return false;
+       return page_count(page) > 0;
+}