[PATCH] page_alloc.c: buddy handling cleanup

[~andy/linux] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index e197818a7cf660d30e9e7e934871047db7c456c2..123c605867404ecfc00ef0d19a8a65d852ed0be0 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,9 +49,9 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
  EXPORT_SYMBOL(node_online_map);
  nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
  EXPORT_SYMBOL(node_possible_map);
-struct pglist_data *pgdat_list __read_mostly;
  unsigned long totalram_pages __read_mostly;
  unsigned long totalhigh_pages __read_mostly;
+unsigned long totalreserve_pages __read_mostly;
  long nr_swap_pages;
  int percpu_pagelist_fraction;
  
@@ -152,7 +152,8 @@ static void bad_page(struct page *page)
                         1 << PG_reclaim |
                         1 << PG_slab    |
                         1 << PG_swapcache |
-                       1 << PG_writeback );
+                       1 << PG_writeback |
+                       1 << PG_buddy );
         set_page_count(page, 0);
         reset_page_mapcount(page);
         page->mapping = NULL;
@@ -212,23 +213,39 @@ static void destroy_compound_page(struct page *page, unsigned long order)
         }
  }
  
+static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+{
+       int i;
+
+       BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+       /*
+        * The loop below zeroes via clear_highpage(), which will use KM_USER0,
+        * so zeroing a __GFP_HIGHMEM page from hard or soft irq context is a bug.
+        */
+       BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
+       for (i = 0; i < (1 << order); i++)
+               clear_highpage(page + i);
+}
+
  /*
   * function for dealing with page's order in buddy system.
   * zone->lock is already acquired when we use these.
   * So, we don't need atomic page->flags operations here.
   */
-static inline unsigned long page_order(struct page *page) {
+static inline unsigned long page_order(struct page *page)
+{
         return page_private(page);
  }
  
-static inline void set_page_order(struct page *page, int order) {
+static inline void set_page_order(struct page *page, int order)
+{
         set_page_private(page, order);
-       __SetPagePrivate(page);
+       __SetPageBuddy(page);
  }
  
  static inline void rmv_page_order(struct page *page)
  {
-       __ClearPagePrivate(page);
+       __ClearPageBuddy(page);
         set_page_private(page, 0);
  }
  
@@ -267,11 +284,13 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
   * This function checks whether a page is free && is the buddy
   * we can do coalesce a page and its buddy if
   * (a) the buddy is not in a hole &&
- * (b) the buddy is free &&
- * (c) the buddy is on the buddy system &&
- * (d) a page and its buddy have the same order.
- * for recording page's order, we use page_private(page) and PG_private.
+ * (b) the buddy is in the buddy system &&
+ * (c) a page and its buddy have the same order.
   *
+ * For recording whether a page is in the buddy system, we use PG_buddy.
+ * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ *
+ * For recording page's order, we use page_private(page).
   */
  static inline int page_is_buddy(struct page *page, int order)
  {
@@ -280,11 +299,11 @@ static inline int page_is_buddy(struct page *page, int order)
                 return 0;
  #endif
  
-       if (PagePrivate(page)           &&
-           (page_order(page) == order) &&
-            page_count(page) == 0)
-               return 1;
-       return 0;
+       if (PageBuddy(page) && page_order(page) == order) {
+               BUG_ON(page_count(page) != 0);
+               return 1;
+       }
+       return 0;
  }
  
  /*
@@ -300,7 +319,7 @@ static inline int page_is_buddy(struct page *page, int order)
   * as necessary, plus some accounting needed to play nicely with other
   * parts of the VM system.
   * At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_Private.Page's
+ * free pages of length of (1 << order) and marked with PG_buddy. Page's
   * order is recorded in page_private(page) field.
   * So when we are allocating or freeing one, we can derive the state of the
   * other.  That is, if we allocate a small block, and both were   
@@ -363,7 +382,8 @@ static inline int free_pages_check(struct page *page)
                         1 << PG_slab    |
                         1 << PG_swapcache |
                         1 << PG_writeback |
-                       1 << PG_reserved ))))
+                       1 << PG_reserved |
+                       1 << PG_buddy ))))
                 bad_page(page);
         if (PageDirty(page))
                 __ClearPageDirty(page);
@@ -442,7 +462,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
         if (order == 0) {
                 __ClearPageReserved(page);
                 set_page_count(page, 0);
-               set_page_refs(page, 0);
+               set_page_refcounted(page);
                 __free_page(page);
         } else {
                 int loop;
@@ -457,7 +477,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
                         set_page_count(p, 0);
                 }
  
-               set_page_refs(page, order);
+               set_page_refcounted(page);
                 __free_pages(page, order);
         }
  }
@@ -496,7 +516,7 @@ static inline void expand(struct zone *zone, struct page *page,
  /*
   * This page is about to be returned from the page allocator
   */
-static int prep_new_page(struct page *page, int order)
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
  {
         if (unlikely(page_mapcount(page) |
                 (page->mapping != NULL)  |
@@ -511,7 +531,8 @@ static int prep_new_page(struct page *page, int order)
                         1 << PG_slab    |
                         1 << PG_swapcache |
                         1 << PG_writeback |
-                       1 << PG_reserved ))))
+                       1 << PG_reserved |
+                       1 << PG_buddy ))))
                 bad_page(page);
  
         /*
@@ -525,8 +546,15 @@ static int prep_new_page(struct page *page, int order)
                         1 << PG_referenced | 1 << PG_arch_1 |
                         1 << PG_checked | 1 << PG_mappedtodisk);
         set_page_private(page, 0);
-       set_page_refs(page, order);
+       set_page_refcounted(page);
         kernel_map_pages(page, 1 << order, 1);
+
+       if (gfp_flags & __GFP_ZERO)
+               prep_zero_page(page, order, gfp_flags);
+
+       if (order && (gfp_flags & __GFP_COMP))
+               prep_compound_page(page, order);
+
         return 0;
  }
  
@@ -582,13 +610,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
  /*
   * Called from the slab reaper to drain pagesets on a particular node that
   * belong to the currently executing processor.
+ * Note that this function must be called with the thread pinned to
+ * a single processor.
   */
  void drain_node_pages(int nodeid)
  {
         int i, z;
         unsigned long flags;
  
-       local_irq_save(flags);
         for (z = 0; z < MAX_NR_ZONES; z++) {
                 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
                 struct per_cpu_pageset *pset;
@@ -598,11 +627,14 @@ void drain_node_pages(int nodeid)
                         struct per_cpu_pages *pcp;
  
                         pcp = &pset->pcp[i];
-                       free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-                       pcp->count = 0;
+                       if (pcp->count) {
+                               local_irq_save(flags);
+                               free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+                               pcp->count = 0;
+                               local_irq_restore(flags);
+                       }
                 }
         }
-       local_irq_restore(flags);
  }
  #endif
  
@@ -732,15 +764,6 @@ void fastcall free_cold_page(struct page *page)
         free_hot_cold_page(page, 1);
  }
  
-static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
-{
-       int i;
-
-       BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
-       for(i = 0; i < (1 << order); i++)
-               clear_highpage(page + i);
-}
-
  /*
   * split_page takes a non-compound higher-order page, and splits it into
   * n (1<<order) sub-pages: page[0..n]
@@ -755,10 +778,8 @@ void split_page(struct page *page, unsigned int order)
  
         BUG_ON(PageCompound(page));
         BUG_ON(!page_count(page));
-       for (i = 1; i < (1 << order); i++) {
-               BUG_ON(page_count(page + i));
-               set_page_count(page + i, 1);
-       }
+       for (i = 1; i < (1 << order); i++)
+               set_page_refcounted(page + i);
  }
  
  /*
@@ -804,14 +825,8 @@ again:
         put_cpu();
  
         BUG_ON(bad_range(zone, page));
-       if (prep_new_page(page, order))
+       if (prep_new_page(page, order, gfp_flags))
                 goto again;
-
-       if (gfp_flags & __GFP_ZERO)
-               prep_zero_page(page, order, gfp_flags);
-
-       if (order && (gfp_flags & __GFP_COMP))
-               prep_compound_page(page, order);
         return page;
  
  failed:
@@ -935,7 +950,8 @@ restart:
                 goto got_pg;
  
         do {
-               wakeup_kswapd(*z, order);
+               if (cpuset_zone_allowed(*z, gfp_mask))
+                       wakeup_kswapd(*z, order);
         } while (*(++z));
  
         /*
@@ -1192,7 +1208,7 @@ unsigned int nr_free_highpages (void)
         pg_data_t *pgdat;
         unsigned int pages = 0;
  
-       for_each_pgdat(pgdat)
+       for_each_online_pgdat(pgdat)
                 pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
  
         return pages;
@@ -1334,7 +1350,7 @@ void get_zone_counts(unsigned long *active,
         *active = 0;
         *inactive = 0;
         *free = 0;
-       for_each_pgdat(pgdat) {
+       for_each_online_pgdat(pgdat) {
                 unsigned long l, m, n;
                 __get_zone_counts(&l, &m, &n, pgdat);
                 *active += l;
@@ -1771,7 +1787,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                         continue;
                 page = pfn_to_page(pfn);
                 set_page_links(page, zone, nid, pfn);
-               set_page_count(page, 1);
+               init_page_count(page);
                 reset_page_mapcount(page);
                 SetPageReserved(page);
                 INIT_LIST_HEAD(&page->lru);
@@ -2020,8 +2036,9 @@ static __meminit void zone_pcp_init(struct zone *zone)
                 setup_pageset(zone_pcp(zone,cpu), batch);
  #endif
         }
-       printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-               zone->name, zone->present_pages, batch);
+       if (zone->present_pages)
+               printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+                       zone->name, zone->present_pages, batch);
  }
  
  static __meminit void init_currently_empty_zone(struct zone *zone,
@@ -2032,7 +2049,6 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
         zone_wait_table_init(zone, size);
         pgdat->nr_zones = zone_idx(zone) + 1;
  
-       zone->zone_mem_map = pfn_to_page(zone_start_pfn);
         zone->zone_start_pfn = zone_start_pfn;
  
         memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
@@ -2160,8 +2176,9 @@ static void *frag_start(struct seq_file *m, loff_t *pos)
  {
         pg_data_t *pgdat;
         loff_t node = *pos;
-
-       for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
+       for (pgdat = first_online_pgdat();
+            pgdat && node;
+            pgdat = next_online_pgdat(pgdat))
                 --node;
  
         return pgdat;
@@ -2172,7 +2189,7 @@ static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
         pg_data_t *pgdat = (pg_data_t *)arg;
  
         (*pos)++;
-       return pgdat->pgdat_next;
+       return next_online_pgdat(pgdat);
  }
  
  static void frag_stop(struct seq_file *m, void *arg)
@@ -2462,6 +2479,38 @@ void __init page_alloc_init(void)
         hotcpu_notifier(page_alloc_cpu_notify, 0);
  }
  
+/*
+ * calculate_totalreserve_pages - recompute totalreserve_pages; called when
+ *     sysctl_lower_zone_reserve_ratio or min_free_kbytes changes.
+ */
+static void calculate_totalreserve_pages(void)
+{
+       struct pglist_data *pgdat;
+       unsigned long reserve_pages = 0;
+       int i, j;
+
+       for_each_online_pgdat(pgdat) {
+               for (i = 0; i < MAX_NR_ZONES; i++) {
+                       struct zone *zone = pgdat->node_zones + i;
+                       unsigned long max = 0;
+
+                       /* Largest lowmem_reserve[] entry at index i or above */
+                       for (j = i; j < MAX_NR_ZONES; j++) {
+                               if (zone->lowmem_reserve[j] > max)
+                                       max = zone->lowmem_reserve[j];
+                       }
+
+                       /* we treat pages_high as reserved pages. */
+                       max += zone->pages_high;
+                       /* A zone's share is capped at its present_pages */
+                       if (max > zone->present_pages)
+                               max = zone->present_pages;
+                       reserve_pages += max;
+               }
+       }
+       totalreserve_pages = reserve_pages;
+}
+
  /*
   * setup_per_zone_lowmem_reserve - called whenever
   *     sysctl_lower_zone_reserve_ratio changes.  Ensures that each zone
@@ -2473,7 +2522,7 @@ static void setup_per_zone_lowmem_reserve(void)
         struct pglist_data *pgdat;
         int j, idx;
  
-       for_each_pgdat(pgdat) {
+       for_each_online_pgdat(pgdat) {
                 for (j = 0; j < MAX_NR_ZONES; j++) {
                         struct zone *zone = pgdat->node_zones + j;
                         unsigned long present_pages = zone->present_pages;
@@ -2493,6 +2542,9 @@ static void setup_per_zone_lowmem_reserve(void)
                         }
                 }
         }
+
+       /* update totalreserve_pages */
+       calculate_totalreserve_pages();
  }
  
  /*
@@ -2547,6 +2599,9 @@ void setup_per_zone_pages_min(void)
                 zone->pages_high  = zone->pages_min + tmp / 2;
                 spin_unlock_irqrestore(&zone->lru_lock, flags);
         }
+
+       /* update totalreserve_pages */
+       calculate_totalreserve_pages();
  }
  
  /*
@@ -2692,8 +2747,7 @@ void *__init alloc_large_system_hash(const char *tablename,
                 else
                         numentries <<= (PAGE_SHIFT - scale);
         }
-       /* rounded up to nearest power of 2 in size */
-       numentries = 1UL << (long_log2(numentries) + 1);
+       numentries = roundup_pow_of_two(numentries);
  
         /* limit allocation size to 1/16 total memory by default */
         if (max == 0) {
@@ -2736,3 +2790,44 @@ void *__init alloc_large_system_hash(const char *tablename,
  
         return table;
  }
+
+#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
+/*
+ * Out-of-line pfn <-> page translation, one pair per memory model.
+ * (see asm-generic/memory_model.h)
+ */
+#if defined(CONFIG_FLATMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+       return mem_map + (pfn - ARCH_PFN_OFFSET);
+}
+unsigned long page_to_pfn(struct page *page)
+{
+       return (page - mem_map) + ARCH_PFN_OFFSET;
+}
+#elif defined(CONFIG_DISCONTIGMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+       int nid = arch_pfn_to_nid(pfn);
+       return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
+}
+unsigned long page_to_pfn(struct page *page)
+{
+       struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+       return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
+}
+#elif defined(CONFIG_SPARSEMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+       return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
+}
+
+unsigned long page_to_pfn(struct page *page)
+{
+       long section_id = page_to_section(page);
+       return page - __section_mem_map_addr(__nr_to_section(section_id));
+}
+#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
+EXPORT_SYMBOL(pfn_to_page);
+EXPORT_SYMBOL(page_to_pfn);
+#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */