X-Git-Url: http://pileus.org/git/?a=blobdiff_plain;f=mm%2Fhugetlb.c;h=0b7656e804d126cf0fcfa04a8b427396af86deb1;hb=9645083ca5ef365b7b750cf219bb20b61bb925f8;hp=583db19481454d2cb04eed66b34710d99d378cc1;hpb=4ef91848043679b272a1a5b8e2879acf696ba9e2;p=~andy%2Flinux diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 583db194814..0b7656e804d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -33,7 +34,6 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; @@ -48,7 +48,8 @@ static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; /* - * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages + * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, + * free_huge_pages, and surplus_huge_pages. */ DEFINE_SPINLOCK(hugetlb_lock); @@ -521,9 +522,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (list_empty(&h->hugepage_freelists[nid])) + list_for_each_entry(page, &h->hugepage_freelists[nid], lru) + if (!is_migrate_isolate_page(page)) + break; + /* + * if 'non-isolated free hugepage' not found on the list, + * the allocation fails. + */ + if (&h->hugepage_freelists[nid] == &page->lru) return NULL; - page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); h->free_huge_pages--; @@ -531,6 +538,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) return page; } +/* Movability of hugepages depends on migration support. */ +static inline gfp_t htlb_alloc_mask(struct hstate *h) +{ + if (hugepages_treat_as_movable || hugepage_migration_support(h)) + return GFP_HIGHUSER_MOVABLE; + else + return GFP_HIGHUSER; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, @@ -560,11 +576,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, retry_cpuset: cpuset_mems_cookie = get_mems_allowed(); zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); + htlb_alloc_mask(h), &mpol, &nodemask); for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { if (avoid_reserve) @@ -572,6 +588,7 @@ retry_cpuset: if (!vma_has_reserves(vma, chg)) break; + SetPagePrivate(page); h->resv_huge_pages--; break; } @@ -629,15 +646,21 @@ static void free_huge_page(struct page *page) int nid = page_to_nid(page); struct hugepage_subpool *spool = (struct hugepage_subpool *)page_private(page); + bool restore_reserve; set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); BUG_ON(page_mapcount(page)); + restore_reserve = PagePrivate(page); + ClearPagePrivate(page); spin_lock(&hugetlb_lock); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); + if (restore_reserve) + h->resv_huge_pages++; + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { /* remove the page from active list */ list_del(&page->lru); @@ -673,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); __SetPageHead(page); + __ClearPageReserved(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); + /* + * For gigantic hugepages allocated through bootmem at + * boot, it's safer to be consistent with the not-gigantic + * hugepages and clear the PG_reserved bit from all tail pages + * too. Otherwse drivers using get_user_pages() to access tail + * pages may get the reference counting wrong if they see + * PG_reserved set on a tail page (despite the head page not + * having PG_reserved set). Enforcing this consistency between + * head and tail pages allows drivers to optimize away a check + * on the head page when they need know if put_page() is needed + * after get_user_pages(). + */ + __ClearPageReserved(p); set_page_count(p, 0); p->first_page = page; } @@ -724,7 +761,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return NULL; page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page) { @@ -871,6 +908,44 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, return ret; } +/* + * Dissolve a given free hugepage into free buddy pages. This function does + * nothing for in-use (including surplus) hugepages. + */ +static void dissolve_free_huge_page(struct page *page) +{ + spin_lock(&hugetlb_lock); + if (PageHuge(page) && !page_count(page)) { + struct hstate *h = page_hstate(page); + int nid = page_to_nid(page); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + update_and_free_page(h, page); + } + spin_unlock(&hugetlb_lock); +} + +/* + * Dissolve free hugepages in a given pfn range. Used by memory hotplug to + * make specified memory blocks removable from the system. + * Note that start_pfn should aligned with (minimum) hugepage size. + */ +void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned int order = 8 * sizeof(void *); + unsigned long pfn; + struct hstate *h; + + /* Set scan step to minimum hugepage size */ + for_each_hstate(h) + if (order > huge_page_order(h)) + order = huge_page_order(h); + VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) + dissolve_free_huge_page(pfn_to_page(pfn)); +} + static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) { struct page *page; @@ -913,12 +988,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (nid == NUMA_NO_NODE) - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); else page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { @@ -1115,9 +1190,9 @@ static long vma_needs_reservation(struct hstate *h, } else { long err; pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); - err = region_chg(&reservations->regions, idx, idx + 1); + err = region_chg(&resv->regions, idx, idx + 1); if (err < 0) return err; return 0; @@ -1135,10 +1210,10 @@ static void vma_commit_reservation(struct hstate *h, } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); /* Mark this page used in the map. */ - region_add(&reservations->regions, idx, idx + 1); + region_add(&resv->regions, idx, idx + 1); } } @@ -1164,13 +1239,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, chg = vma_needs_reservation(h, vma, addr); if (chg < 0) return ERR_PTR(-ENOMEM); - if (chg) - if (hugepage_subpool_get_pages(spool, chg)) + if (chg || avoid_reserve) + if (hugepage_subpool_get_pages(spool, 1)) return ERR_PTR(-ENOSPC); ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) { - hugepage_subpool_put_pages(spool, chg); + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); @@ -1182,7 +1258,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); - hugepage_subpool_put_pages(spool, chg); + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); @@ -1198,6 +1275,20 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } +/* + * alloc_huge_page()'s wrapper which simply returns the page if allocation + * succeeds, otherwise NULL. This function is called from new_vma_page(), + * where no ERR_VALUE is expected to be returned. + */ +struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) +{ + struct page *page = alloc_huge_page(vma, addr, avoid_reserve); + if (IS_ERR(page)) + page = NULL; + return page; +} + int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; @@ -1253,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void) #else page = virt_to_page(m); #endif - __ClearPageReserved(page); WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); + WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); /* * If we had gigantic hugepages allocated at boot time, we need @@ -2049,18 +2140,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, } #endif /* CONFIG_NUMA */ -int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) -{ - proc_dointvec(table, write, buffer, length, ppos); - if (hugepages_treat_as_movable) - htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; - else - htlb_alloc_mask = GFP_HIGHUSER; - return 0; -} - int hugetlb_overcommit_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -2188,7 +2267,7 @@ out: static void hugetlb_vm_op_open(struct vm_area_struct *vma) { - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); /* * This new VMA should share its siblings reservation map if present. @@ -2198,34 +2277,34 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ - if (reservations) - kref_get(&reservations->refs); + if (resv) + kref_get(&resv->refs); } static void resv_map_put(struct vm_area_struct *vma) { - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); - if (!reservations) + if (!resv) return; - kref_put(&reservations->refs, resv_map_release); + kref_put(&resv->refs, resv_map_release); } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve; unsigned long start; unsigned long end; - if (reservations) { + if (resv) { start = vma_hugecache_offset(h, vma, vma->vm_start); end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - - region_count(&reservations->regions, start, end); + region_count(&resv->regions, start, end); resv_map_put(vma); @@ -2562,8 +2641,7 @@ retry_avoidcopy: * at the time of fork() could consume its reserves on COW instead * of the full address range. */ - if (!(vma->vm_flags & VM_MAYSHARE) && - is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_page != pagecache_page) outside_reserve = 1; @@ -2635,6 +2713,8 @@ retry_avoidcopy: spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { + ClearPagePrivate(new_page); + /* Break COW */ huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, @@ -2646,10 +2726,11 @@ retry_avoidcopy: } spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); + + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return 0; } @@ -2745,6 +2826,7 @@ retry: goto retry; goto out; } + ClearPagePrivate(page); spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); @@ -2791,8 +2873,10 @@ retry: if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; - if (anon_rmap) + if (anon_rmap) { + ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); + } else page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) @@ -3409,3 +3493,45 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) return ret; } #endif + +bool isolate_huge_page(struct page *page, struct list_head *list) +{ + VM_BUG_ON(!PageHead(page)); + if (!get_page_unless_zero(page)) + return false; + spin_lock(&hugetlb_lock); + list_move_tail(&page->lru, list); + spin_unlock(&hugetlb_lock); + return true; +} + +void putback_active_hugepage(struct page *page) +{ + VM_BUG_ON(!PageHead(page)); + spin_lock(&hugetlb_lock); + list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + spin_unlock(&hugetlb_lock); + put_page(page); +} + +bool is_hugepage_active(struct page *page) +{ + VM_BUG_ON(!PageHuge(page)); + /* + * This function can be called for a tail page because the caller, + * scan_movable_pages, scans through a given pfn-range which typically + * covers one memory block. In systems using gigantic hugepage (1GB + * for x86_64,) a hugepage is larger than a memory block, and we don't + * support migrating such large hugepages for now, so return false + * when called for tail pages. + */ + if (PageTail(page)) + return false; + /* + * Refcount of a hwpoisoned hugepages is 1, but they are not active, + * so we should return false for them. + */ + if (unlikely(PageHWPoison(page))) + return false; + return page_count(page) > 0; +}