X-Git-Url: http://pileus.org/git/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=0ee638f76ebe584cd40d7541bac886b96b03162e;hb=5c8096439600de88a7ceded70cc86b817d4a548e;hp=c2b59dbda196dcaaffcd44f367143e3e83a15c0f;hpb=fa1586a7e43760f0e25e72b2e3f97ee18b2be967;p=~andy%2Flinux

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2b59dbda19..0ee638f76eb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -488,8 +489,10 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
  * (c) a page and its buddy have the same order &&
  * (d) a page and its buddy are in the same zone.
  *
- * For recording whether a page is in the buddy system, we set ->_mapcount -2.
- * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount
+ * PAGE_BUDDY_MAPCOUNT_VALUE.
+ * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
+ * serialized by zone->lock.
  *
  * For recording page's order, we use page_private(page).
  */
@@ -527,8 +530,9 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
  * as necessary, plus some accounting needed to play nicely with other
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with _mapcount -2. Page's
- * order is recorded in page_private(page) field.
+ * free pages of length of (1 << order) and marked with _mapcount
+ * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
+ * field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other. That is, if we allocate a small block, and both were
  * free, the remainder of the region must be split into blocks.
@@ -647,7 +651,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	int to_free = count;
 
 	spin_lock(&zone->lock);
-	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
 	while (to_free) {
@@ -696,7 +699,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 				int migratetype)
 {
 	spin_lock(&zone->lock);
-	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
 	__free_one_page(page, zone, order, migratetype);
@@ -721,7 +723,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 		return false;
 
 	if (!PageHighMem(page)) {
-		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<managed_pages += 1 << order;
+	page_zone(page)->managed_pages += nr_pages;
 	set_page_refcounted(page);
 	__free_pages(page, order);
 }
@@ -885,7 +888,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 						int migratetype)
 {
 	unsigned int current_order;
-	struct free_area * area;
+	struct free_area *area;
 	struct page *page;
 
 	/* Find a page of the appropriate size in the preferred list */
@@ -1007,14 +1010,60 @@ static void change_pageblock_range(struct page *pageblock_page,
 	}
 }
 
+/*
+ * If breaking a large block of pages, move all free pages to the preferred
+ * allocation list. If falling back for a reclaimable kernel allocation, be
+ * more aggressive about taking ownership of free pages.
+ *
+ * On the other hand, never change migration type of MIGRATE_CMA pageblocks
+ * nor move CMA pages to different free lists. We don't want unmovable pages
+ * to be allocated from MIGRATE_CMA areas.
+ *
+ * Returns the new migratetype of the pageblock (or the same old migratetype
+ * if it was unchanged).
+ */
+static int try_to_steal_freepages(struct zone *zone, struct page *page,
+				  int start_type, int fallback_type)
+{
+	int current_order = page_order(page);
+
+	if (is_migrate_cma(fallback_type))
+		return fallback_type;
+
+	/* Take ownership for orders >= pageblock_order */
+	if (current_order >= pageblock_order) {
+		change_pageblock_range(page, current_order, start_type);
+		return start_type;
+	}
+
+	if (current_order >= pageblock_order / 2 ||
+	    start_type == MIGRATE_RECLAIMABLE ||
+	    page_group_by_mobility_disabled) {
+		int pages;
+
+		pages = move_freepages_block(zone, page, start_type);
+
+		/* Claim the whole block if over half of it is free */
+		if (pages >= (1 << (pageblock_order-1)) ||
+				page_group_by_mobility_disabled) {
+
+			set_pageblock_migratetype(page, start_type);
+			return start_type;
+		}
+
+	}
+
+	return fallback_type;
+}
+
 /* Remove an element from the buddy allocator from the fallback list */
 static inline struct page *
 __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 {
-	struct free_area * area;
+	struct free_area *area;
 	int current_order;
 	struct page *page;
-	int migratetype, i;
+	int migratetype, new_type, i;
 
 	/* Find the largest possible block of pages in the other list */
 	for (current_order = MAX_ORDER-1; current_order >= order;
@@ -1034,51 +1083,29 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
 					struct page, lru);
 			area->nr_free--;
 
-			/*
-			 * If breaking a large block of pages, move all free
-			 * pages to the preferred allocation list. If falling
-			 * back for a reclaimable kernel allocation, be more
-			 * aggressive about taking ownership of free pages
-			 *
-			 * On the other hand, never change migration
-			 * type of MIGRATE_CMA pageblocks nor move CMA
-			 * pages on different free lists. We don't
-			 * want unmovable pages to be allocated from
-			 * MIGRATE_CMA areas.
-			 */
-			if (!is_migrate_cma(migratetype) &&
-			    (current_order >= pageblock_order / 2 ||
-			     start_migratetype == MIGRATE_RECLAIMABLE ||
-			     page_group_by_mobility_disabled)) {
-				int pages;
-				pages = move_freepages_block(zone, page,
-							start_migratetype);
-
-				/* Claim the whole block if over half of it is free */
-				if (pages >= (1 << (pageblock_order-1)) ||
-						page_group_by_mobility_disabled)
-					set_pageblock_migratetype(page,
-							start_migratetype);
-
-				migratetype = start_migratetype;
-			}
+			new_type = try_to_steal_freepages(zone, page,
+							  start_migratetype,
+							  migratetype);
 
 			/* Remove the page from the freelists */
 			list_del(&page->lru);
 			rmv_page_order(page);
 
-			/* Take ownership for orders >= pageblock_order */
-			if (current_order >= pageblock_order &&
-			    !is_migrate_cma(migratetype))
-				change_pageblock_range(page, current_order,
-							start_migratetype);
-
+			/*
+			 * Borrow the excess buddy pages as well, irrespective
+			 * of whether we stole freepages, or took ownership of
+			 * the pageblock or not.
+			 *
+			 * Exception: When borrowing from MIGRATE_CMA, release
+			 * the excess buddy pages to CMA itself.
+			 */
 			expand(zone, page, order, current_order, area,
 			       is_migrate_cma(migratetype) ?
 					migratetype : start_migratetype);
 
-			trace_mm_page_alloc_extfrag(page, order, current_order,
-						start_migratetype, migratetype);
+			trace_mm_page_alloc_extfrag(page, order,
+				current_order, start_migratetype, migratetype,
+				new_type == start_migratetype);
 
 			return page;
 		}
@@ -1281,7 +1308,7 @@ void mark_free_pages(struct zone *zone)
 	int order, t;
 	struct list_head *curr;
 
-	if (!zone->spanned_pages)
+	if (zone_is_empty(zone))
 		return;
 
 	spin_lock_irqsave(&zone->lock, flags);
@@ -1526,6 +1553,7 @@ again:
 					  get_pageblock_migratetype(page));
 	}
 
+	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
 	local_irq_restore(flags);
@@ -1792,6 +1820,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
 	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
 }
 
+static bool zone_local(struct zone *local_zone, struct zone *zone)
+{
+	return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE;
+}
+
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
 	return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
@@ -1829,6 +1862,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
 {
 }
 
+static bool zone_local(struct zone *local_zone, struct zone *zone)
+{
+	return true;
+}
+
 static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
 	return true;
@@ -1860,16 +1898,41 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 zonelist_scan:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
-	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
 	 */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						high_zoneidx, nodemask) {
+		unsigned long mark;
+
 		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 				continue;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				continue;
+		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
+		if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
+			goto try_this_zone;
+		/*
+		 * Distribute pages in proportion to the individual
+		 * zone size to ensure fair page aging. The zone a
+		 * page was allocated in should have no effect on the
+		 * time the page has in memory before being reclaimed.
+		 *
+		 * When zone_reclaim_mode is enabled, try to stay in
+		 * local zones in the fastpath. If that fails, the
+		 * slowpath is entered, which will do another pass
+		 * starting with the local zones, but ultimately fall
+		 * back to remote zones that do not partake in the
+		 * fairness round-robin cycle of this zonelist.
+		 */
+		if (alloc_flags & ALLOC_WMARK_LOW) {
+			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+				continue;
+			if (zone_reclaim_mode &&
+			    !zone_local(preferred_zone, zone))
+				continue;
+		}
 		/*
 		 * When allocating a page cache page for writing, we
 		 * want to get it from a zone that is within its dirty
@@ -1900,16 +1963,11 @@ zonelist_scan:
 		    (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
 			goto this_zone_full;
 
-		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
-		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
-			unsigned long mark;
+		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+		if (!zone_watermark_ok(zone, order, mark,
+				       classzone_idx, alloc_flags)) {
 			int ret;
 
-			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
-			if (zone_watermark_ok(zone, order, mark,
-					classzone_idx, alloc_flags))
-				goto try_this_zone;
-
 			if (IS_ENABLED(CONFIG_NUMA) &&
 					!did_zlc_setup && nr_online_nodes > 1) {
 				/*
@@ -2321,16 +2379,30 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
-static inline
-void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
-						enum zone_type high_zoneidx,
-						enum zone_type classzone_idx)
+static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
+			     struct zonelist *zonelist,
+			     enum zone_type high_zoneidx,
+			     struct zone *preferred_zone)
 {
 	struct zoneref *z;
 	struct zone *zone;
 
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		wakeup_kswapd(zone, order, classzone_idx);
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+		if (!(gfp_mask & __GFP_NO_KSWAPD))
+			wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+		/*
+		 * Only reset the batches of zones that were actually
+		 * considered in the fast path, we don't want to
+		 * thrash fairness information for zones that are not
+		 * actually part of this zonelist's round-robin cycle.
+		 */
+		if (zone_reclaim_mode && !zone_local(preferred_zone, zone))
+			continue;
+		mod_zone_page_state(zone, NR_ALLOC_BATCH,
+				    high_wmark_pages(zone) -
+				    low_wmark_pages(zone) -
+				    zone_page_state(zone, NR_ALLOC_BATCH));
+	}
 }
 
 static inline int
@@ -2426,9 +2498,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	if (!(gfp_mask & __GFP_NO_KSWAPD))
-		wake_all_kswapd(order, zonelist, high_zoneidx,
-						zone_idx(preferred_zone));
+	prepare_slowpath(gfp_mask, order, zonelist,
+			 high_zoneidx, preferred_zone);
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -3095,7 +3166,7 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
 			zone->pages_scanned,
-			(zone->all_unreclaimable ? "yes" : "no")
+			(!zone_reclaimable(zone) ? "yes" : "no")
 			);
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
@@ -3104,7 +3175,7 @@ void show_free_areas(unsigned int filter)
 	}
 
 	for_each_populated_zone(zone) {
- 		unsigned long nr[MAX_ORDER], flags, order, total = 0;
+		unsigned long nr[MAX_ORDER], flags, order, total = 0;
 		unsigned char types[MAX_ORDER];
 
 		if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -3416,11 +3487,11 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
 static int default_zonelist_order(void)
 {
 	int nid, zone_type;
-	unsigned long low_kmem_size,total_size;
+	unsigned long low_kmem_size, total_size;
 	struct zone *z;
 	int average_size;
 	/*
-         * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
+	 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
 	 * If they are really small and used heavily, the system can fall
 	 * into OOM very easily.
 	 * This function detect ZONE_DMA/DMA32 size and configures zone order.
@@ -3452,9 +3523,9 @@ static int default_zonelist_order(void)
 		return ZONELIST_ORDER_NODE;
 	/*
 	 * look into each node's config.
-  	 * If there is a node whose DMA/DMA32 memory is very big area on
- 	 * local memory, NODE_ORDER may be suitable.
-         */
+	 * If there is a node whose DMA/DMA32 memory is very big area on
+	 * local memory, NODE_ORDER may be suitable.
+	 */
 	average_size = total_size /
 			(nodes_weight(node_states[N_MEMORY]) + 1);
 	for_each_online_node(nid) {
@@ -4180,7 +4251,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 	if (!zone->wait_table)
 		return -ENOMEM;
 
-	for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
+	for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
 		init_waitqueue_head(zone->wait_table + i);
 
 	return 0;
@@ -4237,7 +4308,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
 int __meminit __early_pfn_to_nid(unsigned long pfn)
 {
 	unsigned long start_pfn, end_pfn;
-	int i, nid;
+	int nid;
 	/*
 	 * NOTE: The following SMP-unsafe globals are only used early in boot
 	 * when the kernel is running single-threaded.
@@ -4248,15 +4319,14 @@ int __meminit __early_pfn_to_nid(unsigned long pfn)
 	if (last_start_pfn <= pfn && pfn < last_end_pfn)
 		return last_nid;
 
-	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
-		if (start_pfn <= pfn && pfn < end_pfn) {
-			last_start_pfn = start_pfn;
-			last_end_pfn = end_pfn;
-			last_nid = nid;
-			return nid;
-		}
-	/* This is a memory hole */
-	return -1;
+	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
+	if (nid != -1) {
+		last_start_pfn = start_pfn;
+		last_end_pfn = end_pfn;
+		last_nid = nid;
+	}
+
+	return nid;
 }
 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 
@@ -4586,7 +4656,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-void __init set_pageblock_order(void)
+void __paginginit set_pageblock_order(void)
 {
 	unsigned int order;
 
@@ -4614,7 +4684,7 @@ void __init set_pageblock_order(void)
  * include/linux/pageblock-flags.h for the values of pageblock_order based on
  * the kernel config
  */
-void __init set_pageblock_order(void)
+void __paginginit set_pageblock_order(void)
 {
 }
 
@@ -4728,8 +4798,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		spin_lock_init(&zone->lru_lock);
 		zone_seqlock_init(zone);
 		zone->zone_pgdat = pgdat;
-
 		zone_pcp_init(zone);
+
+		/* For bootup, initialized properly in watermark setup */
+		mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
+
 		lruvec_init(&zone->lruvec);
 		if (!size)
 			continue;
@@ -4930,7 +5003,7 @@ static unsigned long __init early_calculate_totalpages(void)
 		if (pages)
 			node_set_state(nid, N_MEMORY);
 	}
-  	return totalpages;
+	return totalpages;
 }
 
 /*
@@ -5047,7 +5120,7 @@ restart:
 			/*
 			 * Some kernelcore has been met, update counts and
 			 * break if the kernelcore for this node has been
-			 * satisified
+			 * satisfied
 			 */
 			required_kernelcore -= min(required_kernelcore,
 								size_pages);
@@ -5061,7 +5134,7 @@ restart:
 	 * If there is still required_kernelcore, we do another pass with one
 	 * less node in the count. This will push zone_movable_pfn[nid] further
 	 * along on the nodes that still have memory until kernelcore is
-	 * satisified
+	 * satisfied
 	 */
 	usable_nodes--;
 	if (usable_nodes && required_kernelcore > usable_nodes)
@@ -5286,8 +5359,10 @@ void __init mem_init_print_info(const char *str)
 	 * 3) .rodata.* may be embedded into .text or .data sections.
 	 */
#define adj_init_size(start, end, size, pos, adj) \
-	if (start <= pos && pos < end && size > adj) \
-		size -= adj;
+	do { \
+		if (start <= pos && pos < end && size > adj) \
+			size -= adj; \
+	} while (0)
 
 	adj_init_size(__init_begin, __init_end, init_data_size,
 		     _sinittext, init_code_size);
@@ -5361,7 +5436,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
 		 * This is only okay since the processor is dead and cannot
 		 * race with what we are doing.
 		 */
-		refresh_cpu_vm_stats(cpu);
+		cpu_vm_stats_fold(cpu);
 	}
 	return NOTIFY_OK;
 }
@@ -5498,6 +5573,11 @@ static void __setup_per_zone_wmarks(void)
 		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
 		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
 
+		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
+				      high_wmark_pages(zone) -
+				      low_wmark_pages(zone) -
+				      zone_page_state(zone, NR_ALLOC_BATCH));
+
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
@@ -5570,7 +5650,7 @@ static void __meminit setup_per_zone_inactive_ratio(void)
 * we want it large (64MB max). But it is not linear, because network
 * bandwidth does not increase linearly with machine size. We use
 *
- *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 
+ *	min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
 *	min_free_kbytes = sqrt(lowmem_kbytes * 16)
 *
 * which yields
@@ -5614,11 +5694,11 @@ int __meminit init_per_zone_wmark_min(void)
 module_init(init_per_zone_wmark_min)
 
 /*
- * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
+ * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
  *	that we can call two helper functions whenever min_free_kbytes
  *	changes.
 */
-int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec(table, write, buffer, length, ppos);
@@ -5682,8 +5762,8 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
 
 /*
 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
- * can have before it gets flushed back to buddy allocator.
+ * cpu. It is the fraction of total pages in each zone that a hot per cpu
+ * pagelist can have before it gets flushed back to buddy allocator.
 */
int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
	void __user *buffer, size_t *length, loff_t *ppos)
@@ -5745,9 +5825,10 @@ void *__init alloc_large_system_hash(const char *tablename,
 	if (!numentries) {
 		/* round applicable memory size up to nearest megabyte */
 		numentries = nr_kernel_pages;
-		numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
-		numentries >>= 20 - PAGE_SHIFT;
-		numentries <<= 20 - PAGE_SHIFT;
+
+		/* It isn't necessary when PAGE_SIZE >= 1MB */
+		if (PAGE_SHIFT < 20)
+			numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
 
 		/* limit to 1 bucket per 2^scale bytes of low memory */
 		if (scale > PAGE_SHIFT)
@@ -5900,7 +5981,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 * This function checks whether pageblock includes unmovable pages or not.
 * If @count is not zero, it is okay to include less @count unmovable pages
 *
- * PageLRU check wihtout isolation or lru_lock could race so that
+ * PageLRU check without isolation or lru_lock could race so that
 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
 * expect this function should be exact.
 */
@@ -5928,6 +6009,17 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 			continue;
 
 		page = pfn_to_page(check);
+
+		/*
+		 * Hugepages are not in LRU lists, but they're movable.
+		 * We need not scan over tail pages bacause we don't
+		 * handle each tail page individually in migration.
+		 */
+		if (PageHuge(page)) {
+			iter = round_up(iter + 1, 1<