X-Git-Url: http://pileus.org/git/?a=blobdiff_plain;f=mm%2Fhugetlb.c;h=b49579c7f2a550462c334f97907f2fc131a65e00;hb=6caa9892c364d9689b2807bc7940eaaade872cb0;hp=b60f33080a28f0a078f8186e65ab8d878fced928;hpb=442e0973e9273ae8832abd70f52efde8b8326178;p=~andy%2Flinux diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b60f33080a2..0b7656e804d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -33,7 +34,6 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; @@ -48,7 +48,8 @@ static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; /* - * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages + * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, + * free_huge_pages, and surplus_huge_pages. */ DEFINE_SPINLOCK(hugetlb_lock); @@ -135,9 +136,9 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) * across the pages in a mapping. * * The region data structures are protected by a combination of the mmap_sem - * and the hugetlb_instantion_mutex. To access or modify a region the caller + * and the hugetlb_instantiation_mutex. To access or modify a region the caller * must either hold the mmap_sem for write, or the mmap_sem for read and - * the hugetlb_instantiation mutex: + * the hugetlb_instantiation_mutex: * * down_write(&mm->mmap_sem); * or @@ -434,25 +435,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } -/* Decrement the reserved pages in the hugepage pool by one */ -static void decrement_hugepage_resv_vma(struct hstate *h, - struct vm_area_struct *vma) -{ - if (vma->vm_flags & VM_NORESERVE) - return; - - if (vma->vm_flags & VM_MAYSHARE) { - /* Shared mappings always use reserves */ - h->resv_huge_pages--; - } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - /* - * Only the process that called mmap() has reserves for - * private mappings. - */ - h->resv_huge_pages--; - } -} - /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { @@ -462,12 +444,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_reserves(struct vm_area_struct *vma) +static int vma_has_reserves(struct vm_area_struct *vma, long chg) { + if (vma->vm_flags & VM_NORESERVE) { + /* + * This address is already reserved by other process(chg == 0), + * so, we should decrement reserved count. Without decrementing, + * reserve count remains after releasing inode, because this + * allocated page will go into page cache and is regarded as + * coming from reserved pool in releasing step. Currently, we + * don't have any other solution to deal with this situation + * properly, so add work-around here. + */ + if (vma->vm_flags & VM_MAYSHARE && chg == 0) + return 1; + else + return 0; + } + + /* Shared mappings always use reserves */ if (vma->vm_flags & VM_MAYSHARE) return 1; + + /* + * Only the process that called mmap() has reserves for + * private mappings. + */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 1; + return 0; } @@ -517,9 +522,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (list_empty(&h->hugepage_freelists[nid])) + list_for_each_entry(page, &h->hugepage_freelists[nid], lru) + if (!is_migrate_isolate_page(page)) + break; + /* + * if 'non-isolated free hugepage' not found on the list, + * the allocation fails. + */ + if (&h->hugepage_freelists[nid] == &page->lru) return NULL; - page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); h->free_huge_pages--; @@ -527,9 +538,19 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) return page; } +/* Movability of hugepages depends on migration support. */ +static inline gfp_t htlb_alloc_mask(struct hstate *h) +{ + if (hugepages_treat_as_movable || hugepage_migration_support(h)) + return GFP_HIGHUSER_MOVABLE; + else + return GFP_HIGHUSER; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, int avoid_reserve) + unsigned long address, int avoid_reserve, + long chg) { struct page *page = NULL; struct mempolicy *mpol; @@ -539,16 +560,12 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct zoneref *z; unsigned int cpuset_mems_cookie; -retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); - zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. This check ensures that reservations are * not "stolen". The child may still get SIGKILLed */ - if (!vma_has_reserves(vma) && + if (!vma_has_reserves(vma, chg) && h->free_huge_pages - h->resv_huge_pages == 0) goto err; @@ -556,13 +573,23 @@ retry_cpuset: if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) goto err; +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask(h), &mpol, &nodemask); + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { - if (!avoid_reserve) - decrement_hugepage_resv_vma(h, vma); + if (avoid_reserve) + break; + if (!vma_has_reserves(vma, chg)) + break; + + SetPagePrivate(page); + h->resv_huge_pages--; break; } } @@ -574,7 +601,6 @@ retry_cpuset: return page; err: - mpol_cond_put(mpol); return NULL; } @@ -620,15 +646,21 @@ static void free_huge_page(struct page *page) int nid = page_to_nid(page); struct hugepage_subpool *spool = (struct hugepage_subpool *)page_private(page); + bool restore_reserve; set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); BUG_ON(page_mapcount(page)); + restore_reserve = PagePrivate(page); + ClearPagePrivate(page); spin_lock(&hugetlb_lock); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); + if (restore_reserve) + h->resv_huge_pages++; + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { /* remove the page from active list */ list_del(&page->lru); @@ -664,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); __SetPageHead(page); + __ClearPageReserved(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { __SetPageTail(p); + /* + * For gigantic hugepages allocated through bootmem at + * boot, it's safer to be consistent with the not-gigantic + * hugepages and clear the PG_reserved bit from all tail pages + * too. Otherwse drivers using get_user_pages() to access tail + * pages may get the reference counting wrong if they see + * PG_reserved set on a tail page (despite the head page not + * having PG_reserved set). Enforcing this consistency between + * head and tail pages allows drivers to optimize away a check + * on the head page when they need know if put_page() is needed + * after get_user_pages(). + */ + __ClearPageReserved(p); set_page_count(p, 0); p->first_page = page; } @@ -715,7 +761,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return NULL; page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page) { @@ -772,33 +818,6 @@ static int hstate_next_node_to_alloc(struct hstate *h, return nid; } -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) -{ - struct page *page; - int start_nid; - int next_nid; - int ret = 0; - - start_nid = hstate_next_node_to_alloc(h, nodes_allowed); - next_nid = start_nid; - - do { - page = alloc_fresh_huge_page_node(h, next_nid); - if (page) { - ret = 1; - break; - } - next_nid = hstate_next_node_to_alloc(h, nodes_allowed); - } while (next_nid != start_nid); - - if (ret) - count_vm_event(HTLB_BUDDY_PGALLOC); - else - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); - - return ret; -} - /* * helper for free_pool_huge_page() - return the previously saved * node ["this node"] from which to free a huge page. Advance the @@ -817,6 +836,40 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) return nid; } +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + nr_nodes--) + +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_free(hs, mask)) || 1); \ + nr_nodes--) + +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +{ + struct page *page; + int nr_nodes, node; + int ret = 0; + + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + page = alloc_fresh_huge_page_node(h, node); + if (page) { + ret = 1; + break; + } + } + + if (ret) + count_vm_event(HTLB_BUDDY_PGALLOC); + else + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + + return ret; +} + /* * Free huge page from pool from next node to free. * Attempt to keep persistent huge pages more or less @@ -826,40 +879,73 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, bool acct_surplus) { - int start_nid; - int next_nid; + int nr_nodes, node; int ret = 0; - start_nid = hstate_next_node_to_free(h, nodes_allowed); - next_nid = start_nid; - - do { + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine * nodes with surplus pages. */ - if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && - !list_empty(&h->hugepage_freelists[next_nid])) { + if ((!acct_surplus || h->surplus_huge_pages_node[node]) && + !list_empty(&h->hugepage_freelists[node])) { struct page *page = - list_entry(h->hugepage_freelists[next_nid].next, + list_entry(h->hugepage_freelists[node].next, struct page, lru); list_del(&page->lru); h->free_huge_pages--; - h->free_huge_pages_node[next_nid]--; + h->free_huge_pages_node[node]--; if (acct_surplus) { h->surplus_huge_pages--; - h->surplus_huge_pages_node[next_nid]--; + h->surplus_huge_pages_node[node]--; } update_and_free_page(h, page); ret = 1; break; } - next_nid = hstate_next_node_to_free(h, nodes_allowed); - } while (next_nid != start_nid); + } return ret; } +/* + * Dissolve a given free hugepage into free buddy pages. This function does + * nothing for in-use (including surplus) hugepages. + */ +static void dissolve_free_huge_page(struct page *page) +{ + spin_lock(&hugetlb_lock); + if (PageHuge(page) && !page_count(page)) { + struct hstate *h = page_hstate(page); + int nid = page_to_nid(page); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + update_and_free_page(h, page); + } + spin_unlock(&hugetlb_lock); +} + +/* + * Dissolve free hugepages in a given pfn range. Used by memory hotplug to + * make specified memory blocks removable from the system. + * Note that start_pfn should aligned with (minimum) hugepage size. + */ +void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned int order = 8 * sizeof(void *); + unsigned long pfn; + struct hstate *h; + + /* Set scan step to minimum hugepage size */ + for_each_hstate(h) + if (order > huge_page_order(h)) + order = huge_page_order(h); + VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) + dissolve_free_huge_page(pfn_to_page(pfn)); +} + static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) { struct page *page; @@ -902,12 +988,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (nid == NUMA_NO_NODE) - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); else page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { @@ -944,10 +1030,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) */ struct page *alloc_huge_page_node(struct hstate *h, int nid) { - struct page *page; + struct page *page = NULL; spin_lock(&hugetlb_lock); - page = dequeue_huge_page_node(h, nid); + if (h->free_huge_pages - h->resv_huge_pages > 0) + page = dequeue_huge_page_node(h, nid); spin_unlock(&hugetlb_lock); if (!page) @@ -1035,11 +1122,8 @@ free: spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ - if (!list_empty(&surplus_list)) { - list_for_each_entry_safe(page, tmp, &surplus_list, lru) { - put_page(page); - } - } + list_for_each_entry_safe(page, tmp, &surplus_list, lru) + put_page(page); spin_lock(&hugetlb_lock); return ret; @@ -1106,9 +1190,9 @@ static long vma_needs_reservation(struct hstate *h, } else { long err; pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); - err = region_chg(&reservations->regions, idx, idx + 1); + err = region_chg(&resv->regions, idx, idx + 1); if (err < 0) return err; return 0; @@ -1126,10 +1210,10 @@ static void vma_commit_reservation(struct hstate *h, } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); /* Mark this page used in the map. */ - region_add(&reservations->regions, idx, idx + 1); + region_add(&resv->regions, idx, idx + 1); } } @@ -1155,38 +1239,35 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, chg = vma_needs_reservation(h, vma, addr); if (chg < 0) return ERR_PTR(-ENOMEM); - if (chg) - if (hugepage_subpool_get_pages(spool, chg)) + if (chg || avoid_reserve) + if (hugepage_subpool_get_pages(spool, 1)) return ERR_PTR(-ENOSPC); ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) { - hugepage_subpool_put_pages(spool, chg); + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); - if (page) { - /* update page cgroup details */ - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), - h_cg, page); - spin_unlock(&hugetlb_lock); - } else { + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); + if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); - hugepage_subpool_put_pages(spool, chg); + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), - h_cg, page); list_move(&page->lru, &h->hugepage_activelist); - spin_unlock(&hugetlb_lock); + /* Fall through */ } + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + spin_unlock(&hugetlb_lock); set_page_private(page, (unsigned long)spool); @@ -1194,17 +1275,29 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } +/* + * alloc_huge_page()'s wrapper which simply returns the page if allocation + * succeeds, otherwise NULL. This function is called from new_vma_page(), + * where no ERR_VALUE is expected to be returned. + */ +struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) +{ + struct page *page = alloc_huge_page(vma, addr, avoid_reserve); + if (IS_ERR(page)) + page = NULL; + return page; +} + int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; - int nr_nodes = nodes_weight(node_states[N_MEMORY]); + int nr_nodes, node; - while (nr_nodes) { + for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = __alloc_bootmem_node_nopanic( - NODE_DATA(hstate_next_node_to_alloc(h, - &node_states[N_MEMORY])), + addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), huge_page_size(h), huge_page_size(h), 0); if (addr) { @@ -1216,7 +1309,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) m = addr; goto found; } - nr_nodes--; } return 0; @@ -1252,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void) #else page = virt_to_page(m); #endif - __ClearPageReserved(page); WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); + WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); /* * If we had gigantic hugepages allocated at boot time, we need @@ -1355,48 +1447,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count, static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, int delta) { - int start_nid, next_nid; - int ret = 0; + int nr_nodes, node; VM_BUG_ON(delta != -1 && delta != 1); - if (delta < 0) - start_nid = hstate_next_node_to_alloc(h, nodes_allowed); - else - start_nid = hstate_next_node_to_free(h, nodes_allowed); - next_nid = start_nid; - - do { - int nid = next_nid; - if (delta < 0) { - /* - * To shrink on this node, there must be a surplus page - */ - if (!h->surplus_huge_pages_node[nid]) { - next_nid = hstate_next_node_to_alloc(h, - nodes_allowed); - continue; - } + if (delta < 0) { + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node]) + goto found; } - if (delta > 0) { - /* - * Surplus cannot exceed the total number of pages - */ - if (h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) { - next_nid = hstate_next_node_to_free(h, - nodes_allowed); - continue; - } + } else { + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node] < + h->nr_huge_pages_node[node]) + goto found; } + } + return 0; - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (next_nid != start_nid); - - return ret; +found: + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[node] += delta; + return 1; } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) @@ -1526,7 +1598,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct hstate *h; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = strict_strtoul(buf, 10, &count); + err = kstrtoul(buf, 10, &count); if (err) goto out; @@ -1617,7 +1689,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, if (h->order >= MAX_ORDER) return -EINVAL; - err = strict_strtoul(buf, 10, &input); + err = kstrtoul(buf, 10, &input); if (err) return err; @@ -2068,18 +2140,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, } #endif /* CONFIG_NUMA */ -int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) -{ - proc_dointvec(table, write, buffer, length, ppos); - if (hugepages_treat_as_movable) - htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; - else - htlb_alloc_mask = GFP_HIGHUSER; - return 0; -} - int hugetlb_overcommit_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -2207,7 +2267,7 @@ out: static void hugetlb_vm_op_open(struct vm_area_struct *vma) { - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); /* * This new VMA should share its siblings reservation map if present. @@ -2217,34 +2277,34 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ - if (reservations) - kref_get(&reservations->refs); + if (resv) + kref_get(&resv->refs); } static void resv_map_put(struct vm_area_struct *vma) { - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); - if (!reservations) + if (!resv) return; - kref_put(&reservations->refs, resv_map_release); + kref_put(&resv->refs, resv_map_release); } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve; unsigned long start; unsigned long end; - if (reservations) { + if (resv) { start = vma_hugecache_offset(h, vma, vma->vm_start); end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - - region_count(&reservations->regions, start, end); + region_count(&resv->regions, start, end); resv_map_put(vma); @@ -2557,7 +2617,6 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int avoidcopy; int outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2567,10 +2626,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_mapcount(old_page) == 1); - if (avoidcopy) { - if (PageAnon(old_page)) - page_move_anon_rmap(old_page, vma, address); + if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -2584,8 +2641,7 @@ retry_avoidcopy: * at the time of fork() could consume its reserves on COW instead * of the full address range. */ - if (!(vma->vm_flags & VM_MAYSHARE) && - is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_page != pagecache_page) outside_reserve = 1; @@ -2657,6 +2713,8 @@ retry_avoidcopy: spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { + ClearPagePrivate(new_page); + /* Break COW */ huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, @@ -2668,10 +2726,11 @@ retry_avoidcopy: } spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); + + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return 0; } @@ -2767,6 +2826,7 @@ retry: goto retry; goto out; } + ClearPagePrivate(page); spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); @@ -2813,8 +2873,10 @@ retry: if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; - if (anon_rmap) + if (anon_rmap) { + ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); + } else page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) @@ -3431,3 +3493,45 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) return ret; } #endif + +bool isolate_huge_page(struct page *page, struct list_head *list) +{ + VM_BUG_ON(!PageHead(page)); + if (!get_page_unless_zero(page)) + return false; + spin_lock(&hugetlb_lock); + list_move_tail(&page->lru, list); + spin_unlock(&hugetlb_lock); + return true; +} + +void putback_active_hugepage(struct page *page) +{ + VM_BUG_ON(!PageHead(page)); + spin_lock(&hugetlb_lock); + list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + spin_unlock(&hugetlb_lock); + put_page(page); +} + +bool is_hugepage_active(struct page *page) +{ + VM_BUG_ON(!PageHuge(page)); + /* + * This function can be called for a tail page because the caller, + * scan_movable_pages, scans through a given pfn-range which typically + * covers one memory block. In systems using gigantic hugepage (1GB + * for x86_64,) a hugepage is larger than a memory block, and we don't + * support migrating such large hugepages for now, so return false + * when called for tail pages. + */ + if (PageTail(page)) + return false; + /* + * Refcount of a hwpoisoned hugepages is 1, but they are not active, + * so we should return false for them. + */ + if (unlikely(PageHWPoison(page))) + return false; + return page_count(page) > 0; +}