Linux 3.14: mm/page_alloc.c

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5248fe070aa4e9f94b4be087aa8957e16cf16c1f..3bac76ae4b30ec8a62042bdff9644e87ee181d3c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -205,7 +205,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
 };
 
 int min_free_kbytes = 1024;
-int user_min_free_kbytes;
+int user_min_free_kbytes = -1;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
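Note: the sentinel moves from 0 to -1 so the rest of the kernel can tell "the admin never wrote min_free_kbytes" apart from an explicit value (the sysctl handler later in this patch is what records the user's value). A minimal sketch of how a consumer might use the sentinel; the surrounding names are illustrative, not taken from this patch:

	/* Illustrative only: -1 now means "no explicit sysctl write yet". */
	if (user_min_free_kbytes == -1)
		min_free_kbytes = recommended_min;	/* hypothetical auto-tuned value */
	else
		pr_info("leaving user-set min_free_kbytes alone\n");
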
@@ -295,7 +295,7 @@ static inline int bad_range(struct zone *zone, struct page *page)
 }
 #endif
 
-static void bad_page(struct page *page)
+static void bad_page(struct page *page, char *reason, unsigned long bad_flags)
 {
        static unsigned long resume;
        static unsigned long nr_shown;
@@ -329,7 +329,7 @@ static void bad_page(struct page *page)
 
        printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
                current->comm, page_to_pfn(page));
-       dump_page(page);
+       dump_page_badflags(page, reason, bad_flags);
 
        print_modules();
        dump_stack();
@@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order)
        __SetPageHead(page);
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
-               __SetPageTail(p);
                set_page_count(p, 0);
                p->first_page = page;
+               /* Make sure p->first_page is always valid for PageTail() */
+               smp_wmb();
+               __SetPageTail(p);
        }
 }
 
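Note: moving __SetPageTail() after the p->first_page store and adding smp_wmb() is a publish pattern: initialise the field, issue a write barrier, and only then set the flag that readers test before trusting the field. Below is a standalone userspace analogue using C11 fences; the kernel's real reader side (the PageTail() users) is not part of this hunk, so the read half is only illustrative:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct tpage {
		struct tpage *first_page;	/* payload published by the writer */
		atomic_bool tail;		/* "PageTail" analogue */
	};

	/* Writer: fill in the field, then fence, then set the flag (cf. smp_wmb()). */
	static void publish_tail(struct tpage *p, struct tpage *head)
	{
		p->first_page = head;
		atomic_thread_fence(memory_order_release);
		atomic_store_explicit(&p->tail, true, memory_order_relaxed);
	}

	/* Reader: only dereference first_page after observing the flag. */
	static struct tpage *tail_head(struct tpage *p)
	{
		if (!atomic_load_explicit(&p->tail, memory_order_relaxed))
			return NULL;
		atomic_thread_fence(memory_order_acquire);	/* read-side pairing */
		return p->first_page;
	}
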
@@ -383,7 +385,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
        int bad = 0;
 
        if (unlikely(compound_order(page) != order)) {
-               bad_page(page);
+               bad_page(page, "wrong compound order", 0);
                bad++;
        }
 
@@ -392,8 +394,11 @@ static int destroy_compound_page(struct page *page, unsigned long order)
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
 
-               if (unlikely(!PageTail(p) || (p->first_page != page))) {
-                       bad_page(page);
+               if (unlikely(!PageTail(p))) {
+                       bad_page(page, "PageTail not set", 0);
+                       bad++;
+               } else if (unlikely(p->first_page != page)) {
+                       bad_page(page, "first_page not consistent", 0);
                        bad++;
                }
                __ClearPageTail(p);
@@ -506,12 +511,12 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
                return 0;
 
        if (page_is_guard(buddy) && page_order(buddy) == order) {
-               VM_BUG_ON(page_count(buddy) != 0);
+               VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
                return 1;
        }
 
        if (PageBuddy(buddy) && page_order(buddy) == order) {
-               VM_BUG_ON(page_count(buddy) != 0);
+               VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
                return 1;
        }
        return 0;
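Note: every VM_BUG_ON() that has a specific page in hand becomes VM_BUG_ON_PAGE(), so a tripped assertion also dumps the offending page rather than just the condition. Roughly paraphrased (the real macro lives in include/linux/mmdebug.h and may differ in detail), it behaves like the sketch below, built on the two-argument dump_page() added at the end of this patch:

	/* Hedged paraphrase, not the exact upstream definition. */
	#define VM_BUG_ON_PAGE(cond, page)					\
		do {								\
			if (unlikely(cond)) {					\
				dump_page(page, "VM_BUG_ON_PAGE(" #cond ")");	\
				BUG();						\
			}							\
		} while (0)
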
@@ -561,8 +566,8 @@ static inline void __free_one_page(struct page *page,
 
        page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
 
-       VM_BUG_ON(page_idx & ((1 << order) - 1));
-       VM_BUG_ON(bad_range(zone, page));
+       VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+       VM_BUG_ON_PAGE(bad_range(zone, page), page);
 
        while (order < MAX_ORDER-1) {
                buddy_idx = __find_buddy_index(page_idx, order);
@@ -618,12 +623,23 @@ out:
 
 static inline int free_pages_check(struct page *page)
 {
-       if (unlikely(page_mapcount(page) |
-               (page->mapping != NULL)  |
-               (atomic_read(&page->_count) != 0) |
-               (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
-               (mem_cgroup_bad_page_check(page)))) {
-               bad_page(page);
+       char *bad_reason = NULL;
+       unsigned long bad_flags = 0;
+
+       if (unlikely(page_mapcount(page)))
+               bad_reason = "nonzero mapcount";
+       if (unlikely(page->mapping != NULL))
+               bad_reason = "non-NULL mapping";
+       if (unlikely(atomic_read(&page->_count) != 0))
+               bad_reason = "nonzero _count";
+       if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
+               bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
+               bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
+       }
+       if (unlikely(mem_cgroup_bad_page_check(page)))
+               bad_reason = "cgroup check failed";
+       if (unlikely(bad_reason)) {
+               bad_page(page, bad_reason, bad_flags);
                return 1;
        }
        page_cpupid_reset_last(page);
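Note: rather than OR-ing all five conditions into one branch and printing a generic "bad page" report, each check now records a human-readable reason (and, for the flags test, the offending mask) before a single bad_page() call; if several checks fail, the last reason assigned wins, which is acceptable for a diagnostic. The same pattern is repeated for check_new_page() below. A standalone sketch of the idea, with invented names:

	#include <stdio.h>

	struct obj { int refs; void *owner; unsigned long flags; };
	#define MUST_BE_CLEAR 0x3UL	/* illustrative "check at free" mask */

	static int check_on_free(const struct obj *o)
	{
		const char *reason = NULL;
		unsigned long bad = 0;

		if (o->refs != 0)
			reason = "nonzero refcount";
		if (o->owner != NULL)
			reason = "non-NULL owner";
		if (o->flags & MUST_BE_CLEAR) {
			reason = "forbidden flags set";
			bad = o->flags & MUST_BE_CLEAR;
		}
		if (reason) {
			printf("bad object: %s (offending bits %#lx)\n", reason, bad);
			return 1;	/* caller refuses to hand the object out */
		}
		return 0;
	}
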
@@ -813,7 +829,7 @@ static inline void expand(struct zone *zone, struct page *page,
                area--;
                high--;
                size >>= 1;
-               VM_BUG_ON(bad_range(zone, &page[size]));
+               VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
                if (high < debug_guardpage_minorder()) {
@@ -843,12 +859,23 @@ static inline void expand(struct zone *zone, struct page *page,
  */
 static inline int check_new_page(struct page *page)
 {
-       if (unlikely(page_mapcount(page) |
-               (page->mapping != NULL)  |
-               (atomic_read(&page->_count) != 0)  |
-               (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
-               (mem_cgroup_bad_page_check(page)))) {
-               bad_page(page);
+       char *bad_reason = NULL;
+       unsigned long bad_flags = 0;
+
+       if (unlikely(page_mapcount(page)))
+               bad_reason = "nonzero mapcount";
+       if (unlikely(page->mapping != NULL))
+               bad_reason = "non-NULL mapping";
+       if (unlikely(atomic_read(&page->_count) != 0))
+               bad_reason = "nonzero _count";
+       if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
+               bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
+               bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
+       }
+       if (unlikely(mem_cgroup_bad_page_check(page)))
+               bad_reason = "cgroup check failed";
+       if (unlikely(bad_reason)) {
+               bad_page(page, bad_reason, bad_flags);
                return 1;
        }
        return 0;
@@ -955,7 +982,7 @@ int move_freepages(struct zone *zone,
 
        for (page = start_page; page <= end_page;) {
                /* Make sure we are not inadvertently changing nodes */
-               VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
+               VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
 
                if (!pfn_valid_within(page_to_pfn(page))) {
                        page++;
@@ -1211,6 +1238,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
        }
        local_irq_restore(flags);
 }
+static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+{
+       return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
+}
+#else
+static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+{
+       return false;
+}
 #endif
 
 /*
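Note: the helper deliberately tests all bits of GFP_THISNODE rather than just __GFP_THISNODE. On NUMA builds of this era GFP_THISNODE is a composite mask (roughly __GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN), so (gfp_mask & GFP_THISNODE) == GFP_THISNODE matches only callers that asked for the whole opportunistic this-node policy, and the !NUMA stub simply answers "never". A standalone illustration of why the full-mask comparison matters; the flag values are invented:

	#include <stdio.h>

	#define X_THISNODE 0x1u
	#define X_NORETRY  0x2u
	#define X_NOWARN   0x4u
	#define X_POLICY   (X_THISNODE | X_NORETRY | X_NOWARN)	/* composite, like GFP_THISNODE */

	static int is_policy(unsigned int mask)
	{
		return (mask & X_POLICY) == X_POLICY;	/* all bits must be present */
	}

	int main(void)
	{
		printf("%d\n", is_policy(X_THISNODE));		/* 0: one bit is not the policy */
		printf("%d\n", is_policy(X_POLICY | 0x80u));	/* 1: extra bits do not matter  */
		return 0;
	}
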
@@ -1404,8 +1440,8 @@ void split_page(struct page *page, unsigned int order)
 {
        int i;
 
-       VM_BUG_ON(PageCompound(page));
-       VM_BUG_ON(!page_count(page));
+       VM_BUG_ON_PAGE(PageCompound(page), page);
+       VM_BUG_ON_PAGE(!page_count(page), page);
 
 #ifdef CONFIG_KMEMCHECK
        /*
@@ -1547,12 +1583,18 @@ again:
                                          get_pageblock_migratetype(page));
        }
 
-       __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+       /*
+        * NOTE: GFP_THISNODE allocations do not partake in the kswapd
+        * aging protocol, so they can't be fair.
+        */
+       if (!gfp_thisnode_allocation(gfp_flags))
+               __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+
        __count_zone_vm_events(PGALLOC, zone, 1 << order);
        zone_statistics(preferred_zone, zone, gfp_flags);
        local_irq_restore(flags);
 
-       VM_BUG_ON(bad_range(zone, page));
+       VM_BUG_ON_PAGE(bad_range(zone, page), page);
        if (prep_new_page(page, order, gfp_flags))
                goto again;
        return page;
@@ -1919,8 +1961,12 @@ zonelist_scan:
                 * ultimately fall back to remote zones that do not
                 * partake in the fairness round-robin cycle of this
                 * zonelist.
+                *
+                * NOTE: GFP_THISNODE allocations do not partake in
+                * the kswapd aging protocol, so they can't be fair.
                 */
-               if (alloc_flags & ALLOC_WMARK_LOW) {
+               if ((alloc_flags & ALLOC_WMARK_LOW) &&
+                   !gfp_thisnode_allocation(gfp_mask)) {
                        if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
                                continue;
                        if (!zone_local(preferred_zone, zone))
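Note: both GFP_THISNODE exemptions serve the same fairness scheme. Under ALLOC_WMARK_LOW each local zone pays for allocations out of a NR_ALLOC_BATCH budget and is skipped once the budget is spent, so allocations round-robin across local zones and age them evenly; GFP_THISNODE callers are exempt because, as the new comments say, they do not partake in the kswapd aging protocol and so cannot be treated fairly. A toy, self-contained model of the consume-and-skip half (the batch reset, done elsewhere by the allocator and kswapd, is not modelled):

	#include <stdio.h>

	struct tzone { const char *name; long batch; };	/* batch ~ NR_ALLOC_BATCH */

	static struct tzone *pick(struct tzone *z, int n, long pages)
	{
		for (int i = 0; i < n; i++) {
			if (z[i].batch <= 0)
				continue;	/* exhausted: be fair, try the next zone */
			z[i].batch -= pages;
			return &z[i];
		}
		return NULL;			/* all drained: batches must be reset */
	}

	int main(void)
	{
		struct tzone zones[] = { { "Normal", 2 }, { "DMA32", 2 } };

		for (int i = 0; i < 5; i++) {
			struct tzone *z = pick(zones, 2, 1);
			printf("alloc %d -> %s\n", i, z ? z->name : "(none left)");
		}
		return 0;
	}
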
@@ -2071,13 +2117,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
            debug_guardpage_minorder() > 0)
                return;
 
-       /*
-        * Walking all memory to count page types is very expensive and should
-        * be inhibited in non-blockable contexts.
-        */
-       if (!(gfp_mask & __GFP_WAIT))
-               filter |= SHOW_MEM_FILTER_PAGE_COUNT;
-
        /*
         * This documents exceptions given to allocations in certain
         * contexts that are allowed to allocate outside current's set
@@ -2242,10 +2281,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                                preferred_zone, migratetype);
                if (page) {
                        preferred_zone->compact_blockskip_flush = false;
-                       preferred_zone->compact_considered = 0;
-                       preferred_zone->compact_defer_shift = 0;
-                       if (order >= preferred_zone->compact_order_failed)
-                               preferred_zone->compact_order_failed = order + 1;
+                       compaction_defer_reset(preferred_zone, order, true);
                        count_vm_event(COMPACTSUCCESS);
                        return page;
                }
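Note: the four deleted lines are folded into compaction_defer_reset(). Reconstructed purely from what is removed here, the helper presumably clears the compaction deferral counters and bumps compact_order_failed past the order that just succeeded; the real definition lives with the compaction code and may do more (for example tracing). A sketch against a stubbed zone:

	/* Stub with just the fields the deleted lines touched. */
	struct zone_stub {
		unsigned int compact_considered;
		unsigned int compact_defer_shift;
		int compact_order_failed;
	};

	/* Illustrative reconstruction, not the upstream helper itself. */
	static inline void compaction_defer_reset_sketch(struct zone_stub *zone,
							 int order, _Bool alloc_success)
	{
		if (alloc_success) {
			zone->compact_considered = 0;
			zone->compact_defer_shift = 0;
		}
		if (order >= zone->compact_order_failed)
			zone->compact_order_failed = order + 1;
	}
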
@@ -2486,8 +2522,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         * allowed per node queues are empty and that nodes are
         * over allocated.
         */
-       if (IS_ENABLED(CONFIG_NUMA) &&
-                       (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+       if (gfp_thisnode_allocation(gfp_mask))
                goto nopage;
 
 restart:
@@ -2535,8 +2570,15 @@ rebalance:
        }
 
        /* Atomic allocations - we can't balance anything */
-       if (!wait)
+       if (!wait) {
+               /*
+                * All existing users of the deprecated __GFP_NOFAIL are
+                * blockable, so warn of any new users that actually allow this
+                * type of allocation to fail.
+                */
+               WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
                goto nopage;
+       }
 
        /* Avoid recursion of direct reclaim */
        if (current->flags & PF_MEMALLOC)
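Note: the warning targets a contradictory request. __GFP_NOFAIL promises the allocation cannot fail, but without __GFP_WAIT the slowpath has nowhere to go and bails out right here, returning NULL anyway. A caller that would now trip the warning looks roughly like this (illustrative, not taken from the patch):

	/* GFP_ATOMIC does not include __GFP_WAIT, so combining it with __GFP_NOFAIL
	 * is exactly the case the new WARN_ON_ONCE() flags; the allocation can
	 * still return NULL and the caller must cope. */
	struct page *page = alloc_pages(GFP_ATOMIC | __GFP_NOFAIL, 0);
	if (!page)
		return -ENOMEM;		/* hypothetical error path */
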
@@ -3901,6 +3943,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
        struct page *page;
        unsigned long block_migratetype;
        int reserve;
+       int old_reserve;
 
        /*
         * Get the start pfn, end pfn and the number of blocks to reserve
@@ -3922,6 +3965,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
         * future allocation of hugepages at runtime.
         */
        reserve = min(2, reserve);
+       old_reserve = zone->nr_migrate_reserve_block;
+
+       /* When memory hot-add, we almost always need to do nothing */
+       if (reserve == old_reserve)
+               return;
+       zone->nr_migrate_reserve_block = reserve;
 
        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
                if (!pfn_valid(pfn))
@@ -3959,6 +4008,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
                                reserve--;
                                continue;
                        }
+               } else if (!old_reserve) {
+                       /*
+                        * At boot time we don't need to scan the whole zone
+                        * for turning off MIGRATE_RESERVE.
+                        */
+                       break;
                }
 
                /*
@@ -4209,7 +4264,6 @@ static noinline __init_refok
 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
        int i;
-       struct pglist_data *pgdat = zone->zone_pgdat;
        size_t alloc_size;
 
        /*
@@ -4225,7 +4279,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 
        if (!slab_is_available()) {
                zone->wait_table = (wait_queue_head_t *)
-                       alloc_bootmem_node_nopanic(pgdat, alloc_size);
+                       memblock_virt_alloc_node_nopanic(
+                               alloc_size, zone->zone_pgdat->node_id);
        } else {
                /*
                 * This case means that a zone whose size was 0 gets new memory
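Note: all of the bootmem-to-memblock conversions in this patch reshuffle their arguments the same way: the old *_bootmem_node* helpers took the pglist_data pointer first and the size second, while the memblock variants take the size first and a plain node id. Side by side, as taken from the hunks themselves:

	/* before */
	ptr = alloc_bootmem_node_nopanic(pgdat, size);
	/* after */
	ptr = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
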
@@ -4345,13 +4400,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
 #endif
 
 /**
- * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
- * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
  *
  * If an architecture guarantees that all ranges registered with
  * add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling free_bootmem() manually.
+ * this function may be used instead of calling memblock_free_early_nid()
+ * manually.
  */
 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
 {
@@ -4363,9 +4419,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
                end_pfn = min(end_pfn, max_low_pfn);
 
                if (start_pfn < end_pfn)
-                       free_bootmem_node(NODE_DATA(this_nid),
-                                         PFN_PHYS(start_pfn),
-                                         (end_pfn - start_pfn) << PAGE_SHIFT);
+                       memblock_free_early_nid(PFN_PHYS(start_pfn),
+                                       (end_pfn - start_pfn) << PAGE_SHIFT,
+                                       this_nid);
        }
 }
 
@@ -4636,8 +4692,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,
        unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
        zone->pageblock_flags = NULL;
        if (usemapsize)
-               zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
-                                                                  usemapsize);
+               zone->pageblock_flags =
+                       memblock_virt_alloc_node_nopanic(usemapsize,
+                                                        pgdat->node_id);
 }
 #else
 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -4831,7 +4888,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                size =  (end - start) * sizeof(struct page);
                map = alloc_remap(pgdat->node_id, size);
                if (!map)
-                       map = alloc_bootmem_node_nopanic(pgdat, size);
+                       map = memblock_virt_alloc_node_nopanic(size,
+                                                              pgdat->node_id);
                pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
        }
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -5012,9 +5070,33 @@ static void __init find_zone_movable_pfns_for_nodes(void)
        nodemask_t saved_node_state = node_states[N_MEMORY];
        unsigned long totalpages = early_calculate_totalpages();
        int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+       struct memblock_type *type = &memblock.memory;
+
+       /* Need to find movable_zone earlier when movable_node is specified. */
+       find_usable_zone_for_movable();
 
        /*
-        * If movablecore was specified, calculate what size of
+        * If movable_node is specified, ignore kernelcore and movablecore
+        * options.
+        */
+       if (movable_node_is_enabled()) {
+               for (i = 0; i < type->cnt; i++) {
+                       if (!memblock_is_hotpluggable(&type->regions[i]))
+                               continue;
+
+                       nid = type->regions[i].nid;
+
+                       usable_startpfn = PFN_DOWN(type->regions[i].base);
+                       zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+                               min(usable_startpfn, zone_movable_pfn[nid]) :
+                               usable_startpfn;
+               }
+
+               goto out2;
+       }
+
+       /*
+        * If movablecore=nn[KMG] was specified, calculate what size of
         * kernelcore that corresponds so that memory usable for
         * any allocation type is evenly spread. If both kernelcore
         * and movablecore are specified, then the value of kernelcore
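Note: with movable_node enabled, kernelcore and movablecore are ignored and ZONE_MOVABLE on each node starts at the lowest PFN of any hotpluggable memblock region on that node, so that hotpluggable memory stays inside the movable zone. The zone_movable_pfn[nid] fold uses 0 as "not decided yet", hence the ternary; a self-contained rehearsal of just that fold, with invented region data:

	#include <stdio.h>

	struct reg { int nid; unsigned long start_pfn; int hotpluggable; };

	int main(void)
	{
		struct reg regions[] = {
			{ 0, 0x100000, 0 },	/* not hotpluggable: ignored */
			{ 0, 0x200000, 1 },
			{ 1, 0x400000, 1 },
			{ 1, 0x300000, 1 },	/* lower start wins for node 1 */
		};
		unsigned long zone_movable_pfn[2] = { 0, 0 };	/* 0 = not set yet */

		for (unsigned int i = 0; i < sizeof(regions) / sizeof(regions[0]); i++) {
			struct reg *r = &regions[i];

			if (!r->hotpluggable)
				continue;
			zone_movable_pfn[r->nid] = zone_movable_pfn[r->nid] ?
				(r->start_pfn < zone_movable_pfn[r->nid] ?
					r->start_pfn : zone_movable_pfn[r->nid]) :
				r->start_pfn;
		}
		for (int nid = 0; nid < 2; nid++)
			printf("node %d: ZONE_MOVABLE from pfn %#lx\n",
			       nid, zone_movable_pfn[nid]);
		return 0;
	}
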
@@ -5040,7 +5122,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
                goto out;
 
        /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
-       find_usable_zone_for_movable();
        usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
 
 restart:
@@ -5131,6 +5212,7 @@ restart:
        if (usable_nodes && required_kernelcore > usable_nodes)
                goto restart;
 
+out2:
        /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
        for (nid = 0; nid < MAX_NUMNODES; nid++)
                zone_movable_pfn[nid] =
@@ -5692,7 +5774,12 @@ module_init(init_per_zone_wmark_min)
 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
 {
-       proc_dointvec(table, write, buffer, length, ppos);
+       int rc;
+
+       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+       if (rc)
+               return rc;
+
        if (write) {
                user_min_free_kbytes = min_free_kbytes;
                setup_per_zone_wmarks();
@@ -5857,7 +5944,7 @@ void *__init alloc_large_system_hash(const char *tablename,
        do {
                size = bucketsize << log2qty;
                if (flags & HASH_EARLY)
-                       table = alloc_bootmem_nopanic(size);
+                       table = memblock_virt_alloc_nopanic(size, 0);
                else if (hashdist)
                        table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
                else {
@@ -5959,7 +6046,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
        pfn = page_to_pfn(page);
        bitmap = get_pageblock_bitmap(zone, pfn);
        bitidx = pfn_to_bitidx(zone, pfn);
-       VM_BUG_ON(!zone_spans_pfn(zone, pfn));
+       VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
 
        for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
                if (flags & value)
@@ -6457,12 +6544,24 @@ static void dump_page_flags(unsigned long flags)
        printk(")\n");
 }
 
-void dump_page(struct page *page)
+void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
 {
        printk(KERN_ALERT
               "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
                page, atomic_read(&page->_count), page_mapcount(page),
                page->mapping, page->index);
        dump_page_flags(page->flags);
+       if (reason)
+               pr_alert("page dumped because: %s\n", reason);
+       if (page->flags & badflags) {
+               pr_alert("bad because of flags:\n");
+               dump_page_flags(page->flags & badflags);
+       }
        mem_cgroup_print_bad_page(page);
 }
+
+void dump_page(struct page *page, char *reason)
+{
+       dump_page_badflags(page, reason, 0);
+}
+EXPORT_SYMBOL_GPL(dump_page);
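
Note: after this patch dump_page() keeps its old output but takes a reason string (NULL is allowed), and dump_page_badflags() additionally decodes which bits of a caller-supplied "suspicious" flag mask are actually set on the page, which is what bad_page() above now uses. Typical call sites would look like this; the reason strings are illustrative:

	/* Plain dump plus a human-readable reason (or NULL for none). */
	dump_page(page, "migration failure");

	/* Dump that also highlights the offending bits out of a given mask. */
	dump_page_badflags(page, "PAGE_FLAGS_CHECK_AT_FREE flag(s) set",
			   PAGE_FLAGS_CHECK_AT_FREE);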