Linux 3.14: mm/page_alloc.c

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5248fe070aa4e9f94b4be087aa8957e16cf16c1f..3bac76ae4b30ec8a62042bdff9644e87ee181d3c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -205,7 +205,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
 };
 
 int min_free_kbytes = 1024;
-int user_min_free_kbytes;
+int user_min_free_kbytes = -1;
 
 static unsigned long __meminitdata nr_kernel_pages;
 static unsigned long __meminitdata nr_all_pages;
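Note: the sentinel moves from 0 to -1 so the rest of the kernel can tell "the admin never wrote min_free_kbytes" apart from an explicit value (the sysctl handler later in this patch is what records the user's value). A minimal sketch of how a consumer might use the sentinel; the surrounding names are illustrative, not taken from this patch:

	/* Illustrative only: -1 now means "no explicit sysctl write yet". */
	if (user_min_free_kbytes == -1)
		min_free_kbytes = recommended_min;	/* hypothetical auto-tuned value */
	else
		pr_info("leaving user-set min_free_kbytes alone\n");
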
@@ -295,7 +295,7 @@ static inline int bad_range(struct zone *zone, struct page *page)
 }
 #endif
 
-static void bad_page(struct page *page)
+static void bad_page(struct page *page, char *reason, unsigned long bad_flags)
 {
        static unsigned long resume;
        static unsigned long nr_shown;
@@ -329,7 +329,7 @@ static void bad_page(struct page *page)
 
        printk(KERN_ALERT "BUG: Bad page state in process %s  pfn:%05lx\n",
                current->comm, page_to_pfn(page));
-       dump_page(page);
+       dump_page_badflags(page, reason, bad_flags);
 
        print_modules();
        dump_stack();
@@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order)
        __SetPageHead(page);
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
-               __SetPageTail(p);
                set_page_count(p, 0);
                p->first_page = page;
+               /* Make sure p->first_page is always valid for PageTail() */
+               smp_wmb();
+               __SetPageTail(p);
        }
 }
 
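Note: moving __SetPageTail() after the p->first_page store and adding smp_wmb() is a publish pattern: initialise the field, issue a write barrier, and only then set the flag that readers test before trusting the field. Below is a standalone userspace analogue using C11 fences; the kernel's real reader side (the PageTail() users) is not part of this hunk, so the read half is only illustrative:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stddef.h>

	struct tpage {
		struct tpage *first_page;	/* payload published by the writer */
		atomic_bool tail;		/* "PageTail" analogue */
	};

	/* Writer: fill in the field, then fence, then set the flag (cf. smp_wmb()). */
	static void publish_tail(struct tpage *p, struct tpage *head)
	{
		p->first_page = head;
		atomic_thread_fence(memory_order_release);
		atomic_store_explicit(&p->tail, true, memory_order_relaxed);
	}

	/* Reader: only dereference first_page after observing the flag. */
	static struct tpage *tail_head(struct tpage *p)
	{
		if (!atomic_load_explicit(&p->tail, memory_order_relaxed))
			return NULL;
		atomic_thread_fence(memory_order_acquire);	/* read-side pairing */
		return p->first_page;
	}
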
@@ -383,7 +385,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
        int bad = 0;
 
        if (unlikely(compound_order(page) != order)) {
-               bad_page(page);
+               bad_page(page, "wrong compound order", 0);
                bad++;
        }
 
@@ -392,8 +394,11 @@ static int destroy_compound_page(struct page *page, unsigned long order)
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
 
-               if (unlikely(!PageTail(p) || (p->first_page != page))) {
-                       bad_page(page);
+               if (unlikely(!PageTail(p))) {
+                       bad_page(page, "PageTail not set", 0);
+                       bad++;
+               } else if (unlikely(p->first_page != page)) {
+                       bad_page(page, "first_page not consistent", 0);
                        bad++;
                }
                __ClearPageTail(p);
@@ -506,12 +511,12 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
                return 0;
 
        if (page_is_guard(buddy) && page_order(buddy) == order) {
-               VM_BUG_ON(page_count(buddy) != 0);
+               VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
                return 1;
        }
 
        if (PageBuddy(buddy) && page_order(buddy) == order) {
-               VM_BUG_ON(page_count(buddy) != 0);
+               VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
                return 1;
        }
        return 0;
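Note: every VM_BUG_ON() that has a specific page in hand becomes VM_BUG_ON_PAGE(), so a tripped assertion also dumps the offending page rather than just the condition. Roughly paraphrased (the real macro lives in include/linux/mmdebug.h and may differ in detail), it behaves like the sketch below, built on the two-argument dump_page() added at the end of this patch:

	/* Hedged paraphrase, not the exact upstream definition. */
	#define VM_BUG_ON_PAGE(cond, page)					\
		do {								\
			if (unlikely(cond)) {					\
				dump_page(page, "VM_BUG_ON_PAGE(" #cond ")");	\
				BUG();						\
			}							\
		} while (0)
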
@@ -561,8 +566,8 @@ static inline void __free_one_page(struct page *page,
 
        page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
 
-       VM_BUG_ON(page_idx & ((1 << order) - 1));
-       VM_BUG_ON(bad_range(zone, page));
+       VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
+       VM_BUG_ON_PAGE(bad_range(zone, page), page);
 
        while (order < MAX_ORDER-1) {
                buddy_idx = __find_buddy_index(page_idx, order);
@@ -618,12 +623,23 @@ out:
 
 static inline int free_pages_check(struct page *page)
 {
-       if (unlikely(page_mapcount(page) |
-               (page->mapping != NULL)  |
-               (atomic_read(&page->_count) != 0) |
-               (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
-               (mem_cgroup_bad_page_check(page)))) {
-               bad_page(page);
+       char *bad_reason = NULL;
+       unsigned long bad_flags = 0;
+
+       if (unlikely(page_mapcount(page)))
+               bad_reason = "nonzero mapcount";
+       if (unlikely(page->mapping != NULL))
+               bad_reason = "non-NULL mapping";
+       if (unlikely(atomic_read(&page->_count) != 0))
+               bad_reason = "nonzero _count";
+       if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
+               bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
+               bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
+       }
+       if (unlikely(mem_cgroup_bad_page_check(page)))
+               bad_reason = "cgroup check failed";
+       if (unlikely(bad_reason)) {
+               bad_page(page, bad_reason, bad_flags);
                return 1;
        }
        page_cpupid_reset_last(page);
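Note: rather than OR-ing all five conditions into one branch and printing a generic "bad page" report, each check now records a human-readable reason (and, for the flags test, the offending mask) before a single bad_page() call; if several checks fail, the last reason assigned wins, which is acceptable for a diagnostic. The same pattern is repeated for check_new_page() below. A standalone sketch of the idea, with invented names:

	#include <stdio.h>

	struct obj { int refs; void *owner; unsigned long flags; };
	#define MUST_BE_CLEAR 0x3UL	/* illustrative "check at free" mask */

	static int check_on_free(const struct obj *o)
	{
		const char *reason = NULL;
		unsigned long bad = 0;

		if (o->refs != 0)
			reason = "nonzero refcount";
		if (o->owner != NULL)
			reason = "non-NULL owner";
		if (o->flags & MUST_BE_CLEAR) {
			reason = "forbidden flags set";
			bad = o->flags & MUST_BE_CLEAR;
		}
		if (reason) {
			printf("bad object: %s (offending bits %#lx)\n", reason, bad);
			return 1;	/* caller refuses to hand the object out */
		}
		return 0;
	}
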
@@ -813,7 +829,7 @@ static inline void expand(struct zone *zone, struct page *page,
                area--;
                high--;
                size >>= 1;
-               VM_BUG_ON(bad_range(zone, &page[size]));
+               VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
                if (high < debug_guardpage_minorder()) {
@@ -843,12 +859,23 @@ static inline void expand(struct zone *zone, struct page *page,
  */
 static inline int check_new_page(struct page *page)
 {
-       if (unlikely(page_mapcount(page) |
-               (page->mapping != NULL)  |
-               (atomic_read(&page->_count) != 0)  |
-               (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
-               (mem_cgroup_bad_page_check(page)))) {
-               bad_page(page);
+       char *bad_reason = NULL;
+       unsigned long bad_flags = 0;
+
+       if (unlikely(page_mapcount(page)))
+               bad_reason = "nonzero mapcount";
+       if (unlikely(page->mapping != NULL))
+               bad_reason = "non-NULL mapping";
+       if (unlikely(atomic_read(&page->_count) != 0))
+               bad_reason = "nonzero _count";
+       if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
+               bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
+               bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
+       }
+       if (unlikely(mem_cgroup_bad_page_check(page)))
+               bad_reason = "cgroup check failed";
+       if (unlikely(bad_reason)) {
+               bad_page(page, bad_reason, bad_flags);
                return 1;
        }
        return 0;
@@ -955,7 +982,7 @@ int move_freepages(struct zone *zone,
 
        for (page = start_page; page <= end_page;) {
                /* Make sure we are not inadvertently changing nodes */
-               VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
+               VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
 
                if (!pfn_valid_within(page_to_pfn(page))) {
                        page++;
@@ -1211,6 +1238,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
        }
        local_irq_restore(flags);
 }
+static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+{
+       return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
+}
+#else
+static bool gfp_thisnode_allocation(gfp_t gfp_mask)
+{
+       return false;
+}
 #endif
 
 /*
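Note: the helper deliberately tests all bits of GFP_THISNODE rather than just __GFP_THISNODE. On NUMA builds of this era GFP_THISNODE is a composite mask (roughly __GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN), so (gfp_mask & GFP_THISNODE) == GFP_THISNODE matches only callers that asked for the whole opportunistic this-node policy, and the !NUMA stub simply answers "never". A standalone illustration of why the full-mask comparison matters; the flag values are invented:

	#include <stdio.h>

	#define X_THISNODE 0x1u
	#define X_NORETRY  0x2u
	#define X_NOWARN   0x4u
	#define X_POLICY   (X_THISNODE | X_NORETRY | X_NOWARN)	/* composite, like GFP_THISNODE */

	static int is_policy(unsigned int mask)
	{
		return (mask & X_POLICY) == X_POLICY;	/* all bits must be present */
	}

	int main(void)
	{
		printf("%d\n", is_policy(X_THISNODE));		/* 0: one bit is not the policy */
		printf("%d\n", is_policy(X_POLICY | 0x80u));	/* 1: extra bits do not matter  */
		return 0;
	}
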
@@ -1404,8 +1440,8 @@ void split_page(struct page *page, unsigned int order)
 {
        int i;
 
-       VM_BUG_ON(PageCompound(page));
-       VM_BUG_ON(!page_count(page));
+       VM_BUG_ON_PAGE(PageCompound(page), page);
+       VM_BUG_ON_PAGE(!page_count(page), page);
 
 #ifdef CONFIG_KMEMCHECK
        /*
@@ -1547,12 +1583,18 @@ again:
                                          get_pageblock_migratetype(page));
        }
 
-       __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+       /*
+        * NOTE: GFP_THISNODE allocations do not partake in the kswapd
+        * aging protocol, so they can't be fair.
+        */
+       if (!gfp_thisnode_allocation(gfp_flags))
+               __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+
        __count_zone_vm_events(PGALLOC, zone, 1 << order);
        zone_statistics(preferred_zone, zone, gfp_flags);
        local_irq_restore(flags);
 
-       VM_BUG_ON(bad_range(zone, page));
+       VM_BUG_ON_PAGE(bad_range(zone, page), page);
        if (prep_new_page(page, order, gfp_flags))
                goto again;
        return page;
@@ -1919,8 +1961,12 @@ zonelist_scan:
                 * ultimately fall back to remote zones that do not
                 * partake in the fairness round-robin cycle of this
                 * zonelist.
+                *
+                * NOTE: GFP_THISNODE allocations do not partake in
+                * the kswapd aging protocol, so they can't be fair.
                 */
-               if (alloc_flags & ALLOC_WMARK_LOW) {
+               if ((alloc_flags & ALLOC_WMARK_LOW) &&
+                   !gfp_thisnode_allocation(gfp_mask)) {
                        if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
                                continue;
                        if (!zone_local(preferred_zone, zone))
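Note: both GFP_THISNODE exemptions serve the same fairness scheme. Under ALLOC_WMARK_LOW each local zone pays for allocations out of a NR_ALLOC_BATCH budget and is skipped once the budget is spent, so allocations round-robin across local zones and age them evenly; GFP_THISNODE callers are exempt because, as the new comments say, they do not partake in the kswapd aging protocol and so cannot be treated fairly. A toy, self-contained model of the consume-and-skip half (the batch reset, done elsewhere by the allocator and kswapd, is not modelled):

	#include <stdio.h>

	struct tzone { const char *name; long batch; };	/* batch ~ NR_ALLOC_BATCH */

	static struct tzone *pick(struct tzone *z, int n, long pages)
	{
		for (int i = 0; i < n; i++) {
			if (z[i].batch <= 0)
				continue;	/* exhausted: be fair, try the next zone */
			z[i].batch -= pages;
			return &z[i];
		}
		return NULL;			/* all drained: batches must be reset */
	}

	int main(void)
	{
		struct tzone zones[] = { { "Normal", 2 }, { "DMA32", 2 } };

		for (int i = 0; i < 5; i++) {
			struct tzone *z = pick(zones, 2, 1);
			printf("alloc %d -> %s\n", i, z ? z->name : "(none left)");
		}
		return 0;
	}
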
@@ -2071,13 +2117,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
            debug_guardpage_minorder() > 0)
                return;
 
-       /*
-        * Walking all memory to count page types is very expensive and should
-        * be inhibited in non-blockable contexts.
-        */
-       if (!(gfp_mask & __GFP_WAIT))
-               filter |= SHOW_MEM_FILTER_PAGE_COUNT;
-
        /*
         * This documents exceptions given to allocations in certain
         * contexts that are allowed to allocate outside current's set
@@ -2242,10 +2281,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                                preferred_zone, migratetype);
                if (page) {
                        preferred_zone->compact_blockskip_flush = false;
-                       preferred_zone->compact_considered = 0;
-                       preferred_zone->compact_defer_shift = 0;
-                       if (order >= preferred_zone->compact_order_failed)
-                               preferred_zone->compact_order_failed = order + 1;
+                       compaction_defer_reset(preferred_zone, order, true);
                        count_vm_event(COMPACTSUCCESS);
                        return page;
                }
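Note: the four deleted lines are folded into compaction_defer_reset(). Reconstructed purely from what is removed here, the helper presumably clears the compaction deferral counters and bumps compact_order_failed past the order that just succeeded; the real definition lives with the compaction code and may do more (for example tracing). A sketch against a stubbed zone:

	/* Stub with just the fields the deleted lines touched. */
	struct zone_stub {
		unsigned int compact_considered;
		unsigned int compact_defer_shift;
		int compact_order_failed;
	};

	/* Illustrative reconstruction, not the upstream helper itself. */
	static inline void compaction_defer_reset_sketch(struct zone_stub *zone,
							 int order, _Bool alloc_success)
	{
		if (alloc_success) {
			zone->compact_considered = 0;
			zone->compact_defer_shift = 0;
		}
		if (order >= zone->compact_order_failed)
			zone->compact_order_failed = order + 1;
	}
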
@@ -2486,8 +2522,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         * allowed per node queues are empty and that nodes are
         * over allocated.
         */
-       if (IS_ENABLED(CONFIG_NUMA) &&
-                       (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+       if (gfp_thisnode_allocation(gfp_mask))
                goto nopage;
 
 restart:
@@ -2535,8 +2570,15 @@ rebalance:
        }
 
        /* Atomic allocations - we can't balance anything */
-       if (!wait)
+       if (!wait) {
+               /*
+                * All existing users of the deprecated __GFP_NOFAIL are
+                * blockable, so warn of any new users that actually allow this
+                * type of allocation to fail.
+                */
+               WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
                goto nopage;
+       }
 
        /* Avoid recursion of direct reclaim */
        if (current->flags & PF_MEMALLOC)
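Note: the warning targets a contradictory request. __GFP_NOFAIL promises the allocation cannot fail, but without __GFP_WAIT the slowpath has nowhere to go and bails out right here, returning NULL anyway. A caller that would now trip the warning looks roughly like this (illustrative, not taken from the patch):

	/* GFP_ATOMIC does not include __GFP_WAIT, so combining it with __GFP_NOFAIL
	 * is exactly the case the new WARN_ON_ONCE() flags; the allocation can
	 * still return NULL and the caller must cope. */
	struct page *page = alloc_pages(GFP_ATOMIC | __GFP_NOFAIL, 0);
	if (!page)
		return -ENOMEM;		/* hypothetical error path */
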
@@ -3901,6 +3943,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
        struct page *page;
        unsigned long block_migratetype;
        int reserve;
+       int old_reserve;
 
        /*
         * Get the start pfn, end pfn and the number of blocks to reserve
@@ -3922,6 +3965,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
         * future allocation of hugepages at runtime.
         */
        reserve = min(2, reserve);
+       old_reserve = zone->nr_migrate_reserve_block;
+
+       /* When memory hot-add, we almost always need to do nothing */
+       if (reserve == old_reserve)
+               return;
+       zone->nr_migrate_reserve_block = reserve;
 
        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
                if (!pfn_valid(pfn))
@@ -3959,6 +4008,12 @@ static void setup_zone_migrate_reserve(struct zone *zone)
                                reserve--;
                                continue;
                        }
+               } else if (!old_reserve) {
+                       /*
+                        * At boot time we don't need to scan the whole zone
+                        * for turning off MIGRATE_RESERVE.
+                        */
+                       break;
                }
 
                /*
@@ -4209,7 +4264,6 @@ static noinline __init_refok
 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
        int i;
-       struct pglist_data *pgdat = zone->zone_pgdat;
        size_t alloc_size;
 
        /*
@@ -4225,7 +4279,8 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 
        if (!slab_is_available()) {
                zone->wait_table = (wait_queue_head_t *)
-                       alloc_bootmem_node_nopanic(pgdat, alloc_size);
+                       memblock_virt_alloc_node_nopanic(
+                               alloc_size, zone->zone_pgdat->node_id);
        } else {
                /*
                 * This case means that a zone whose size was 0 gets new memory
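Note: all of the bootmem-to-memblock conversions in this patch reshuffle their arguments the same way: the old *_bootmem_node* helpers took the pglist_data pointer first and the size second, while the memblock variants take the size first and a plain node id. Side by side, as taken from the hunks themselves:

	/* before */
	ptr = alloc_bootmem_node_nopanic(pgdat, size);
	/* after */
	ptr = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
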
@@ -4345,13 +4400,14 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
 #endif
 
 /**
- * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
+ * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
- * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
+ * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
  *
  * If an architecture guarantees that all ranges registered with
  * add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling free_bootmem() manually.
+ * this function may be used instead of calling memblock_free_early_nid()
+ * manually.
  */
 void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
 {
@@ -4363,9 +4419,9 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
                end_pfn = min(end_pfn, max_low_pfn);
 
                if (start_pfn < end_pfn)
-                       free_bootmem_node(NODE_DATA(this_nid),
-                                         PFN_PHYS(start_pfn),
-                                         (end_pfn - start_pfn) << PAGE_SHIFT);
+                       memblock_free_early_nid(PFN_PHYS(start_pfn),
+                                       (end_pfn - start_pfn) << PAGE_SHIFT,
+                                       this_nid);
        }
 }
 
@@ -4636,8 +4692,9 @@ static void __init setup_usemap(struct pglist_data *pgdat,
        unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
        zone->pageblock_flags = NULL;
        if (usemapsize)
-               zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
-                                                                  usemapsize);
+               zone->pageblock_flags =
+                       memblock_virt_alloc_node_nopanic(usemapsize,
+                                                        pgdat->node_id);
 }
 #else
 static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -4831,7 +4888,8 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                size =  (end - start) * sizeof(struct page);
                map = alloc_remap(pgdat->node_id, size);
                if (!map)
-                       map = alloc_bootmem_node_nopanic(pgdat, size);
+                       map = memblock_virt_alloc_node_nopanic(size,
+                                                              pgdat->node_id);
                pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
        }
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -5012,9 +5070,33 @@ static void __init find_zone_movable_pfns_for_nodes(void)
        nodemask_t saved_node_state = node_states[N_MEMORY];
        unsigned long totalpages = early_calculate_totalpages();
        int usable_nodes = nodes_weight(node_states[N_MEMORY]);
+       struct memblock_type *type = &memblock.memory;
+
+       /* Need to find movable_zone earlier when movable_node is specified. */
+       find_usable_zone_for_movable();
 
        /*
-        * If movablecore was specified, calculate what size of
+        * If movable_node is specified, ignore kernelcore and movablecore
+        * options.
+        */
+       if (movable_node_is_enabled()) {
+               for (i = 0; i < type->cnt; i++) {
+                       if (!memblock_is_hotpluggable(&type->regions[i]))
+                               continue;
+
+                       nid = type->regions[i].nid;
+
+                       usable_startpfn = PFN_DOWN(type->regions[i].base);
+                       zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
+                               min(usable_startpfn, zone_movable_pfn[nid]) :
+                               usable_startpfn;
+               }
+
+               goto out2;
+       }
+
+       /*
+        * If movablecore=nn[KMG] was specified, calculate what size of
         * kernelcore that corresponds so that memory usable for
         * any allocation type is evenly spread. If both kernelcore
         * and movablecore are specified, then the value of kernelcore
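Note: with movable_node enabled, kernelcore and movablecore are ignored and ZONE_MOVABLE on each node starts at the lowest PFN of any hotpluggable memblock region on that node, so that hotpluggable memory stays inside the movable zone. The zone_movable_pfn[nid] fold uses 0 as "not decided yet", hence the ternary; a self-contained rehearsal of just that fold, with invented region data:

	#include <stdio.h>

	struct reg { int nid; unsigned long start_pfn; int hotpluggable; };

	int main(void)
	{
		struct reg regions[] = {
			{ 0, 0x100000, 0 },	/* not hotpluggable: ignored */
			{ 0, 0x200000, 1 },
			{ 1, 0x400000, 1 },
			{ 1, 0x300000, 1 },	/* lower start wins for node 1 */
		};
		unsigned long zone_movable_pfn[2] = { 0, 0 };	/* 0 = not set yet */

		for (unsigned int i = 0; i < sizeof(regions) / sizeof(regions[0]); i++) {
			struct reg *r = &regions[i];

			if (!r->hotpluggable)
				continue;
			zone_movable_pfn[r->nid] = zone_movable_pfn[r->nid] ?
				(r->start_pfn < zone_movable_pfn[r->nid] ?
					r->start_pfn : zone_movable_pfn[r->nid]) :
				r->start_pfn;
		}
		for (int nid = 0; nid < 2; nid++)
			printf("node %d: ZONE_MOVABLE from pfn %#lx\n",
			       nid, zone_movable_pfn[nid]);
		return 0;
	}
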
@@ -5040,7 +5122,6 @@ static void __init find_zone_movable_pfns_for_nodes(void)
                goto out;
 
        /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
-       find_usable_zone_for_movable();
        usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
 
 restart:
@@ -5131,6 +5212,7 @@ restart:
        if (usable_nodes && required_kernelcore > usable_nodes)
                goto restart;
 
+out2:
        /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
        for (nid = 0; nid < MAX_NUMNODES; nid++)
                zone_movable_pfn[nid] =
@@ -5692,7 +5774,12 @@ module_init(init_per_zone_wmark_min)
 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
 {
-       proc_dointvec(table, write, buffer, length, ppos);
+       int rc;
+
+       rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+       if (rc)
+               return rc;
+
        if (write) {
                user_min_free_kbytes = min_free_kbytes;
                setup_per_zone_wmarks();
@@ -5857,7 +5944,7 @@ void *__init alloc_large_system_hash(const char *tablename,
        do {
                size = bucketsize << log2qty;
                if (flags & HASH_EARLY)
-                       table = alloc_bootmem_nopanic(size);
+                       table = memblock_virt_alloc_nopanic(size, 0);
                else if (hashdist)
                        table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
                else {
@@ -5959,7 +6046,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
        pfn = page_to_pfn(page);
        bitmap = get_pageblock_bitmap(zone, pfn);
        bitidx = pfn_to_bitidx(zone, pfn);
-       VM_BUG_ON(!zone_spans_pfn(zone, pfn));
+       VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
 
        for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
                if (flags & value)
@@ -6457,12 +6544,24 @@ static void dump_page_flags(unsigned long flags)
        printk(")\n");
 }
 
-void dump_page(struct page *page)
+void dump_page_badflags(struct page *page, char *reason, unsigned long badflags)
 {
        printk(KERN_ALERT
               "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
                page, atomic_read(&page->_count), page_mapcount(page),
                page->mapping, page->index);
        dump_page_flags(page->flags);
+       if (reason)
+               pr_alert("page dumped because: %s\n", reason);
+       if (page->flags & badflags) {
+               pr_alert("bad because of flags:\n");
+               dump_page_flags(page->flags & badflags);
+       }
        mem_cgroup_print_bad_page(page);
 }
+
+void dump_page(struct page *page, char *reason)
+{
+       dump_page_badflags(page, reason, 0);
+}
+EXPORT_SYMBOL_GPL(dump_page);
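
Note: after this patch dump_page() keeps its old output but takes a reason string (NULL is allowed), and dump_page_badflags() additionally decodes which bits of a caller-supplied "suspicious" flag mask are actually set on the page, which is what bad_page() above now uses. Typical call sites would look like this; the reason strings are illustrative:

	/* Plain dump plus a human-readable reason (or NULL for none). */
	dump_page(page, "migration failure");

	/* Dump that also highlights the offending bits out of a given mask. */
	dump_page_badflags(page, "PAGE_FLAGS_CHECK_AT_FREE flag(s) set",
			   PAGE_FLAGS_CHECK_AT_FREE);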