diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 141dbb695097c1f0674b8978456eb1d0c98e3e67..40f17c34b4153fab93b4f1a2685dee0b8cac4da8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -17,6 +17,7 @@
 #include <linux/khugepaged.h>
 #include <linux/freezer.h>
 #include <linux/mman.h>
+#include <linux/pagemap.h>
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
@@ -102,10 +103,7 @@ static int set_recommended_min_free_kbytes(void)
        unsigned long recommended_min;
        extern int min_free_kbytes;
 
-       if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
-                     &transparent_hugepage_flags) &&
-           !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-                     &transparent_hugepage_flags))
+       if (!khugepaged_enabled())
                return 0;
 
        for_each_populated_zone(zone)
@@ -139,12 +137,6 @@ static int start_khugepaged(void)
 {
        int err = 0;
        if (khugepaged_enabled()) {
-               int wakeup;
-               if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
-                       err = -ENOMEM;
-                       goto out;
-               }
-               mutex_lock(&khugepaged_mutex);
                if (!khugepaged_thread)
                        khugepaged_thread = kthread_run(khugepaged, NULL,
                                                        "khugepaged");
@@ -154,16 +146,16 @@ static int start_khugepaged(void)
                        err = PTR_ERR(khugepaged_thread);
                        khugepaged_thread = NULL;
                }
-               wakeup = !list_empty(&khugepaged_scan.mm_head);
-               mutex_unlock(&khugepaged_mutex);
-               if (wakeup)
+
+               if (!list_empty(&khugepaged_scan.mm_head))
                        wake_up_interruptible(&khugepaged_wait);
 
                set_recommended_min_free_kbytes();
-       } else
-               /* wakeup to exit */
-               wake_up_interruptible(&khugepaged_wait);
-out:
+       } else if (khugepaged_thread) {
+               kthread_stop(khugepaged_thread);
+               khugepaged_thread = NULL;
+       }
+
        return err;
 }
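
Taken together with the enabled_store() hunk below, the effect of this change is that khugepaged is now started and stopped like any other kthread: kthread_run() when the knob is enabled, kthread_stop() when it is disabled, with khugepaged_mutex held around the call instead of the old "wakeup to exit" handshake. A minimal kernel-style sketch of that lifecycle, with hypothetical worker_* names standing in for the khugepaged ones (the sketch folds the caller's locking into one function):

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/types.h>

static struct task_struct *worker_thread;	/* role of khugepaged_thread */
static DEFINE_MUTEX(worker_mutex);		/* role of khugepaged_mutex */

static int worker_fn(void *unused)
{
	/* kthread_stop() sets the should-stop flag and waits for us to return */
	while (!kthread_should_stop()) {
		/* ... one scan pass, then sleep ... */
		msleep_interruptible(1000);
	}
	return 0;
}

static int worker_set_enabled(bool enabled)
{
	int err = 0;

	mutex_lock(&worker_mutex);
	if (enabled) {
		if (!worker_thread) {
			worker_thread = kthread_run(worker_fn, NULL, "worker");
			if (IS_ERR(worker_thread)) {
				err = PTR_ERR(worker_thread);
				worker_thread = NULL;
			}
		}
	} else if (worker_thread) {
		kthread_stop(worker_thread);
		worker_thread = NULL;
	}
	mutex_unlock(&worker_mutex);

	return err;
}

Serializing both directions on the one mutex is what lets the disable path assume khugepaged_thread (worker_thread above) cannot change underneath it.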
 
@@ -224,18 +216,16 @@ static ssize_t enabled_store(struct kobject *kobj,
                                TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
 
        if (ret > 0) {
-               int err = start_khugepaged();
+               int err;
+
+               mutex_lock(&khugepaged_mutex);
+               err = start_khugepaged();
+               mutex_unlock(&khugepaged_mutex);
+
                if (err)
                        ret = err;
        }
 
-       if (ret > 0 &&
-           (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
-                     &transparent_hugepage_flags) ||
-            test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-                     &transparent_hugepage_flags)))
-               set_recommended_min_free_kbytes();
-
        return ret;
 }
 static struct kobj_attribute enabled_attr =
@@ -570,8 +560,6 @@ static int __init hugepage_init(void)
 
        start_khugepaged();
 
-       set_recommended_min_free_kbytes();
-
        return 0;
 out:
        hugepage_exit_sysfs(hugepage_kobj);
@@ -611,19 +599,6 @@ out:
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
-static void prepare_pmd_huge_pte(pgtable_t pgtable,
-                                struct mm_struct *mm)
-{
-       assert_spin_locked(&mm->page_table_lock);
-
-       /* FIFO */
-       if (!mm->pmd_huge_pte)
-               INIT_LIST_HEAD(&pgtable->lru);
-       else
-               list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
-       mm->pmd_huge_pte = pgtable;
-}
-
 static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
        if (likely(vma->vm_flags & VM_WRITE))
@@ -665,7 +640,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
                 */
                page_add_new_anon_rmap(page, vma, haddr);
                set_pmd_at(mm, haddr, pmd, entry);
-               prepare_pmd_huge_pte(pgtable, mm);
+               pgtable_trans_huge_deposit(mm, pgtable);
                add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
                mm->nr_ptes++;
                spin_unlock(&mm->page_table_lock);
@@ -791,7 +766,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
        pmd = pmd_mkold(pmd_wrprotect(pmd));
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-       prepare_pmd_huge_pte(pgtable, dst_mm);
+       pgtable_trans_huge_deposit(dst_mm, pgtable);
        dst_mm->nr_ptes++;
 
        ret = 0;
@@ -802,25 +777,6 @@ out:
        return ret;
 }
 
-/* no "address" argument so destroys page coloring of some arch */
-pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
-{
-       pgtable_t pgtable;
-
-       assert_spin_locked(&mm->page_table_lock);
-
-       /* FIFO */
-       pgtable = mm->pmd_huge_pte;
-       if (list_empty(&pgtable->lru))
-               mm->pmd_huge_pte = NULL;
-       else {
-               mm->pmd_huge_pte = list_entry(pgtable->lru.next,
-                                             struct page, lru);
-               list_del(&pgtable->lru);
-       }
-       return pgtable;
-}
-
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long address,
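
The two helpers deleted above and in the earlier hunk, prepare_pmd_huge_pte() and get_pmd_huge_pte(), are what the arch-independent pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() pair replaces, so every caller in this file is switched over. A self-contained userspace model of the same list manipulation (the miniature list helpers and struct names below are stand-ins, not kernel code) shows what order these helpers hand page tables back in: the first withdrawal returns the most recently deposited table, the rest come back in deposit order.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Tiny stand-in for the kernel's circular doubly linked list. */
struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	entry->next = entry->prev = entry;
}

static bool list_empty(const struct list_head *h) { return h->next == h; }

#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Stand-ins for the struct page / mm_struct fields the helpers touch. */
struct pgtable { int id; struct list_head lru; };
struct mm { struct pgtable *pmd_huge_pte; };

/* Same logic as the removed prepare_pmd_huge_pte(). */
static void deposit(struct mm *mm, struct pgtable *pgtable)
{
	if (!mm->pmd_huge_pte)
		INIT_LIST_HEAD(&pgtable->lru);
	else
		list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
	mm->pmd_huge_pte = pgtable;
}

/* Same logic as the removed get_pmd_huge_pte(). */
static struct pgtable *withdraw(struct mm *mm)
{
	struct pgtable *pgtable = mm->pmd_huge_pte;

	if (list_empty(&pgtable->lru))
		mm->pmd_huge_pte = NULL;
	else {
		mm->pmd_huge_pte = list_entry(pgtable->lru.next,
					      struct pgtable, lru);
		list_del(&pgtable->lru);
	}
	return pgtable;
}

int main(void)
{
	struct mm mm = { NULL };
	struct pgtable pt[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };
	int i;

	for (i = 0; i < 3; i++)
		deposit(&mm, &pt[i]);
	for (i = 0; i < 3; i++)		/* prints 2, then 0, then 1 */
		printf("withdrew pgtable %d\n", withdraw(&mm)->id);
	return 0;
}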
@@ -832,6 +788,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
        pmd_t _pmd;
        int ret = 0, i;
        struct page **pages;
+       unsigned long mmun_start;       /* For mmu_notifiers */
+       unsigned long mmun_end;         /* For mmu_notifiers */
 
        pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
                        GFP_KERNEL);
@@ -868,15 +826,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
                cond_resched();
        }
 
+       mmun_start = haddr;
+       mmun_end   = haddr + HPAGE_PMD_SIZE;
+       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_same(*pmd, orig_pmd)))
                goto out_free_pages;
        VM_BUG_ON(!PageHead(page));
 
-       pmdp_clear_flush_notify(vma, haddr, pmd);
+       pmdp_clear_flush(vma, haddr, pmd);
        /* leave pmd empty until pte is filled */
 
-       pgtable = get_pmd_huge_pte(mm);
+       pgtable = pgtable_trans_huge_withdraw(mm);
        pmd_populate(mm, &_pmd, pgtable);
 
        for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -896,6 +858,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
        page_remove_rmap(page);
        spin_unlock(&mm->page_table_lock);
 
+       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+
        ret |= VM_FAULT_WRITE;
        put_page(page);
 
@@ -904,6 +868,7 @@ out:
 
 out_free_pages:
        spin_unlock(&mm->page_table_lock);
+       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
        mem_cgroup_uncharge_start();
        for (i = 0; i < HPAGE_PMD_NR; i++) {
                mem_cgroup_uncharge_page(pages[i]);
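
This hunk and the ones above it establish the conversion pattern used throughout the series: the pmdp_*_flush_notify() variants go away, and the whole PMD rewrite is instead bracketed by a single mmu_notifier_invalidate_range_start()/_end() pair computed before the page table lock is taken. A schematic kernel-context sketch of that shape (hypothetical function name, the real work reduced to comments, the usual huge_memory.c includes assumed):

static void rewrite_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
			     unsigned long haddr)
{
	struct mm_struct *mm = vma->vm_mm;
	const unsigned long mmun_start = haddr;		/* For mmu_notifiers */
	const unsigned long mmun_end   = haddr + HPAGE_PMD_SIZE;

	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
	spin_lock(&mm->page_table_lock);
	pmdp_clear_flush(vma, haddr, pmd);	/* no _notify: the range calls cover it */
	/* ... repopulate the range: pmd_populate()/set_pmd_at(), rmap updates ... */
	spin_unlock(&mm->page_table_lock);
	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
}

Every exit from the locked region still has to reach the _end() call, which is why out_free_pages above gains an invalidate_range_end() and the write-fault path below grows an out_mn label.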
@@ -920,6 +885,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        int ret = 0;
        struct page *page, *new_page;
        unsigned long haddr;
+       unsigned long mmun_start;       /* For mmu_notifiers */
+       unsigned long mmun_end;         /* For mmu_notifiers */
 
        VM_BUG_ON(!vma->anon_vma);
        spin_lock(&mm->page_table_lock);
@@ -934,7 +901,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                entry = pmd_mkyoung(orig_pmd);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                if (pmdp_set_access_flags(vma, haddr, pmd, entry,  1))
-                       update_mmu_cache(vma, address, entry);
+                       update_mmu_cache_pmd(vma, address, pmd);
                ret |= VM_FAULT_WRITE;
                goto out_unlock;
        }
@@ -970,38 +937,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
        __SetPageUptodate(new_page);
 
+       mmun_start = haddr;
+       mmun_end   = haddr + HPAGE_PMD_SIZE;
+       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
        spin_lock(&mm->page_table_lock);
        put_page(page);
        if (unlikely(!pmd_same(*pmd, orig_pmd))) {
                spin_unlock(&mm->page_table_lock);
                mem_cgroup_uncharge_page(new_page);
                put_page(new_page);
-               goto out;
+               goto out_mn;
        } else {
                pmd_t entry;
                VM_BUG_ON(!PageHead(page));
                entry = mk_pmd(new_page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                entry = pmd_mkhuge(entry);
-               pmdp_clear_flush_notify(vma, haddr, pmd);
+               pmdp_clear_flush(vma, haddr, pmd);
                page_add_new_anon_rmap(new_page, vma, haddr);
                set_pmd_at(mm, haddr, pmd, entry);
-               update_mmu_cache(vma, address, entry);
+               update_mmu_cache_pmd(vma, address, pmd);
                page_remove_rmap(page);
                put_page(page);
                ret |= VM_FAULT_WRITE;
        }
-out_unlock:
        spin_unlock(&mm->page_table_lock);
+out_mn:
+       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 out:
        return ret;
+out_unlock:
+       spin_unlock(&mm->page_table_lock);
+       return ret;
 }
 
-struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                                   unsigned long addr,
                                   pmd_t *pmd,
                                   unsigned int flags)
 {
+       struct mm_struct *mm = vma->vm_mm;
        struct page *page = NULL;
 
        assert_spin_locked(&mm->page_table_lock);
@@ -1024,6 +1000,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
                _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
                set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
        }
+       if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+               if (page->mapping && trylock_page(page)) {
+                       lru_add_drain();
+                       if (page->mapping)
+                               mlock_vma_page(page);
+                       unlock_page(page);
+               }
+       }
        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
        VM_BUG_ON(!PageCompound(page));
        if (flags & FOLL_GET)
@@ -1041,9 +1025,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        if (__pmd_trans_huge_lock(pmd, vma) == 1) {
                struct page *page;
                pgtable_t pgtable;
-               pgtable = get_pmd_huge_pte(tlb->mm);
-               page = pmd_page(*pmd);
-               pmd_clear(pmd);
+               pmd_t orig_pmd;
+               pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+               orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
+               page = pmd_page(orig_pmd);
                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
                page_remove_rmap(page);
                VM_BUG_ON(page_mapcount(page) < 0);
@@ -1207,7 +1192,11 @@ static int __split_huge_page_splitting(struct page *page,
        struct mm_struct *mm = vma->vm_mm;
        pmd_t *pmd;
        int ret = 0;
+       /* For mmu_notifiers */
+       const unsigned long mmun_start = address;
+       const unsigned long mmun_end   = address + HPAGE_PMD_SIZE;
 
+       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        spin_lock(&mm->page_table_lock);
        pmd = page_check_address_pmd(page, mm, address,
                                     PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
@@ -1219,10 +1208,11 @@ static int __split_huge_page_splitting(struct page *page,
                 * and it won't wait on the anon_vma->root->mutex to
                 * serialize against split_huge_page*.
                 */
-               pmdp_splitting_flush_notify(vma, address, pmd);
+               pmdp_splitting_flush(vma, address, pmd);
                ret = 1;
        }
        spin_unlock(&mm->page_table_lock);
+       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
        return ret;
 }
@@ -1358,11 +1348,11 @@ static int __split_huge_page_map(struct page *page,
        pmd = page_check_address_pmd(page, mm, address,
                                     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
        if (pmd) {
-               pgtable = get_pmd_huge_pte(mm);
+               pgtable = pgtable_trans_huge_withdraw(mm);
                pmd_populate(mm, &_pmd, pgtable);
 
-               for (i = 0, haddr = address; i < HPAGE_PMD_NR;
-                    i++, haddr += PAGE_SIZE) {
+               haddr = address;
+               for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
                        pte_t *pte, entry;
                        BUG_ON(PageCompound(page+i));
                        entry = mk_pte(page + i, vma->vm_page_prot);
@@ -1406,8 +1396,7 @@ static int __split_huge_page_map(struct page *page,
                 * SMP TLB and finally we write the non-huge version
                 * of the pmd entry with pmd_populate.
                 */
-               set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
-               flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+               pmdp_invalidate(vma, address, pmd);
                pmd_populate(mm, pmd, pgtable);
                ret = 1;
        }
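
pmdp_invalidate() packages exactly the two operations it replaces here; the generic implementation is essentially the removed lines with the mm taken from the vma (architectures may override it):

void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		     pmd_t *pmdp)
{
	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}

As the comment above explains, the PMD is made not-present and the SMP TLB flushed before the non-huge version of the entry is written back with pmd_populate(), and the helper keeps those two steps in that order.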
@@ -1421,18 +1410,17 @@ static void __split_huge_page(struct page *page,
                              struct anon_vma *anon_vma)
 {
        int mapcount, mapcount2;
+       pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
        struct anon_vma_chain *avc;
 
        BUG_ON(!PageHead(page));
        BUG_ON(PageTail(page));
 
        mapcount = 0;
-       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long addr = vma_address(page, vma);
                BUG_ON(is_vma_temporary_stack(vma));
-               if (addr == -EFAULT)
-                       continue;
                mapcount += __split_huge_page_splitting(page, vma, addr);
        }
        /*
@@ -1453,12 +1441,10 @@ static void __split_huge_page(struct page *page,
        __split_huge_page_refcount(page);
 
        mapcount2 = 0;
-       list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
                struct vm_area_struct *vma = avc->vma;
                unsigned long addr = vma_address(page, vma);
                BUG_ON(is_vma_temporary_stack(vma));
-               if (addr == -EFAULT)
-                       continue;
                mapcount2 += __split_huge_page_map(page, vma, addr);
        }
        if (mapcount != mapcount2)
@@ -1491,12 +1477,13 @@ out:
        return ret;
 }
 
-#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
-                  VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
+#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
 
 int hugepage_madvise(struct vm_area_struct *vma,
                     unsigned long *vm_flags, int advice)
 {
+       struct mm_struct *mm = vma->vm_mm;
+
        switch (advice) {
        case MADV_HUGEPAGE:
                /*
@@ -1504,6 +1491,8 @@ int hugepage_madvise(struct vm_area_struct *vma,
                 */
                if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
                        return -EINVAL;
+               if (mm->def_flags & VM_NOHUGEPAGE)
+                       return -EINVAL;
                *vm_flags &= ~VM_NOHUGEPAGE;
                *vm_flags |= VM_HUGEPAGE;
                /*
@@ -1655,11 +1644,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
        if (vma->vm_ops)
                /* khugepaged not yet working on file or special mappings */
                return 0;
-       /*
-        * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
-        * true too, verify it here.
-        */
-       VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+       VM_BUG_ON(vma->vm_flags & VM_NO_THP);
        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = vma->vm_end & HPAGE_PMD_MASK;
        if (hstart < hend)
@@ -1833,28 +1818,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
        }
 }
 
-static void collapse_huge_page(struct mm_struct *mm,
-                              unsigned long address,
-                              struct page **hpage,
-                              struct vm_area_struct *vma,
-                              int node)
+static void khugepaged_alloc_sleep(void)
 {
-       pgd_t *pgd;
-       pud_t *pud;
-       pmd_t *pmd, _pmd;
-       pte_t *pte;
-       pgtable_t pgtable;
-       struct page *new_page;
-       spinlock_t *ptl;
-       int isolated;
-       unsigned long hstart, hend;
+       wait_event_freezable_timeout(khugepaged_wait, false,
+                       msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+}
 
-       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-#ifndef CONFIG_NUMA
-       up_read(&mm->mmap_sem);
-       VM_BUG_ON(!*hpage);
-       new_page = *hpage;
-#else
+#ifdef CONFIG_NUMA
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+       if (IS_ERR(*hpage)) {
+               if (!*wait)
+                       return false;
+
+               *wait = false;
+               *hpage = NULL;
+               khugepaged_alloc_sleep();
+       } else if (*hpage) {
+               put_page(*hpage);
+               *hpage = NULL;
+       }
+
+       return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+                      struct vm_area_struct *vma, unsigned long address,
+                      int node)
+{
        VM_BUG_ON(*hpage);
        /*
         * Allocate the page while the vma is still valid and under
@@ -1866,7 +1858,7 @@ static void collapse_huge_page(struct mm_struct *mm,
         * mmap_sem in read mode is good idea also to allow greater
         * scalability.
         */
-       new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+       *hpage  = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
                                      node, __GFP_OTHER_NODE);
 
        /*
@@ -1874,20 +1866,85 @@ static void collapse_huge_page(struct mm_struct *mm,
         * preparation for taking it in write mode.
         */
        up_read(&mm->mmap_sem);
-       if (unlikely(!new_page)) {
+       if (unlikely(!*hpage)) {
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                *hpage = ERR_PTR(-ENOMEM);
-               return;
+               return NULL;
        }
-#endif
 
        count_vm_event(THP_COLLAPSE_ALLOC);
-       if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
-#ifdef CONFIG_NUMA
-               put_page(new_page);
+       return *hpage;
+}
+#else
+static struct page *khugepaged_alloc_hugepage(bool *wait)
+{
+       struct page *hpage;
+
+       do {
+               hpage = alloc_hugepage(khugepaged_defrag());
+               if (!hpage) {
+                       count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+                       if (!*wait)
+                               return NULL;
+
+                       *wait = false;
+                       khugepaged_alloc_sleep();
+               } else
+                       count_vm_event(THP_COLLAPSE_ALLOC);
+       } while (unlikely(!hpage) && likely(khugepaged_enabled()));
+
+       return hpage;
+}
+
+static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
+{
+       if (!*hpage)
+               *hpage = khugepaged_alloc_hugepage(wait);
+
+       if (unlikely(!*hpage))
+               return false;
+
+       return true;
+}
+
+static struct page
+*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
+                      struct vm_area_struct *vma, unsigned long address,
+                      int node)
+{
+       up_read(&mm->mmap_sem);
+       VM_BUG_ON(!*hpage);
+       return  *hpage;
+}
 #endif
+
+static void collapse_huge_page(struct mm_struct *mm,
+                                  unsigned long address,
+                                  struct page **hpage,
+                                  struct vm_area_struct *vma,
+                                  int node)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd, _pmd;
+       pte_t *pte;
+       pgtable_t pgtable;
+       struct page *new_page;
+       spinlock_t *ptl;
+       int isolated;
+       unsigned long hstart, hend;
+       unsigned long mmun_start;       /* For mmu_notifiers */
+       unsigned long mmun_end;         /* For mmu_notifiers */
+
+       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+       /* release the mmap_sem read lock. */
+       new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
+       if (!new_page)
+               return;
+
+       if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
                return;
-       }
 
        /*
         * Prevent all access to pagetables with the exception of
@@ -1912,11 +1969,7 @@ static void collapse_huge_page(struct mm_struct *mm,
                goto out;
        if (is_vma_temporary_stack(vma))
                goto out;
-       /*
-        * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
-        * true too, verify it here.
-        */
-       VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
+       VM_BUG_ON(vma->vm_flags & VM_NO_THP);
 
        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
@@ -1936,6 +1989,9 @@ static void collapse_huge_page(struct mm_struct *mm,
        pte = pte_offset_map(pmd, address);
        ptl = pte_lockptr(mm, pmd);
 
+       mmun_start = address;
+       mmun_end   = address + HPAGE_PMD_SIZE;
+       mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        spin_lock(&mm->page_table_lock); /* probably unnecessary */
        /*
         * After this gup_fast can't run anymore. This also removes
@@ -1943,8 +1999,9 @@ static void collapse_huge_page(struct mm_struct *mm,
         * huge and small TLB entries for the same virtual address
         * to avoid the risk of CPU bugs in that area.
         */
-       _pmd = pmdp_clear_flush_notify(vma, address, pmd);
+       _pmd = pmdp_clear_flush(vma, address, pmd);
        spin_unlock(&mm->page_table_lock);
+       mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
        spin_lock(ptl);
        isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1970,8 +2027,6 @@ static void collapse_huge_page(struct mm_struct *mm,
        pte_unmap(pte);
        __SetPageUptodate(new_page);
        pgtable = pmd_pgtable(_pmd);
-       VM_BUG_ON(page_count(pgtable) != 1);
-       VM_BUG_ON(page_mapcount(pgtable) != 0);
 
        _pmd = mk_pmd(new_page, vma->vm_page_prot);
        _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
@@ -1988,13 +2043,12 @@ static void collapse_huge_page(struct mm_struct *mm,
        BUG_ON(!pmd_none(*pmd));
        page_add_new_anon_rmap(new_page, vma, address);
        set_pmd_at(mm, address, pmd, _pmd);
-       update_mmu_cache(vma, address, _pmd);
-       prepare_pmd_huge_pte(pgtable, mm);
+       update_mmu_cache_pmd(vma, address, pmd);
+       pgtable_trans_huge_deposit(mm, pgtable);
        spin_unlock(&mm->page_table_lock);
 
-#ifndef CONFIG_NUMA
        *hpage = NULL;
-#endif
+
        khugepaged_pages_collapsed++;
 out_up_write:
        up_write(&mm->mmap_sem);
@@ -2002,9 +2056,6 @@ out_up_write:
 
 out:
        mem_cgroup_uncharge_page(new_page);
-#ifdef CONFIG_NUMA
-       put_page(new_page);
-#endif
        goto out_up_write;
 }
 
@@ -2154,12 +2205,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
                        goto skip;
                if (is_vma_temporary_stack(vma))
                        goto skip;
-               /*
-                * If is_pfn_mapping() is true is_learn_pfn_mapping()
-                * must be true too, verify it here.
-                */
-               VM_BUG_ON(is_linear_pfn_mapping(vma) ||
-                         vma->vm_flags & VM_NO_THP);
+               VM_BUG_ON(vma->vm_flags & VM_NO_THP);
 
                hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
                hend = vma->vm_end & HPAGE_PMD_MASK;
@@ -2234,32 +2280,23 @@ static int khugepaged_has_work(void)
 static int khugepaged_wait_event(void)
 {
        return !list_empty(&khugepaged_scan.mm_head) ||
-               !khugepaged_enabled();
+               kthread_should_stop();
 }
 
-static void khugepaged_do_scan(struct page **hpage)
+static void khugepaged_do_scan(void)
 {
+       struct page *hpage = NULL;
        unsigned int progress = 0, pass_through_head = 0;
        unsigned int pages = khugepaged_pages_to_scan;
+       bool wait = true;
 
        barrier(); /* write khugepaged_pages_to_scan to local stack */
 
        while (progress < pages) {
-               cond_resched();
-
-#ifndef CONFIG_NUMA
-               if (!*hpage) {
-                       *hpage = alloc_hugepage(khugepaged_defrag());
-                       if (unlikely(!*hpage)) {
-                               count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-                               break;
-                       }
-                       count_vm_event(THP_COLLAPSE_ALLOC);
-               }
-#else
-               if (IS_ERR(*hpage))
+               if (!khugepaged_prealloc_page(&hpage, &wait))
                        break;
-#endif
+
+               cond_resched();
 
                if (unlikely(kthread_should_stop() || freezing(current)))
                        break;
@@ -2270,73 +2307,32 @@ static void khugepaged_do_scan(struct page **hpage)
                if (khugepaged_has_work() &&
                    pass_through_head < 2)
                        progress += khugepaged_scan_mm_slot(pages - progress,
-                                                           hpage);
+                                                           &hpage);
                else
                        progress = pages;
                spin_unlock(&khugepaged_mm_lock);
        }
-}
 
-static void khugepaged_alloc_sleep(void)
-{
-       wait_event_freezable_timeout(khugepaged_wait, false,
-                       msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
+       if (!IS_ERR_OR_NULL(hpage))
+               put_page(hpage);
 }
 
-#ifndef CONFIG_NUMA
-static struct page *khugepaged_alloc_hugepage(void)
+static void khugepaged_wait_work(void)
 {
-       struct page *hpage;
-
-       do {
-               hpage = alloc_hugepage(khugepaged_defrag());
-               if (!hpage) {
-                       count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
-                       khugepaged_alloc_sleep();
-               } else
-                       count_vm_event(THP_COLLAPSE_ALLOC);
-       } while (unlikely(!hpage) &&
-                likely(khugepaged_enabled()));
-       return hpage;
-}
-#endif
+       try_to_freeze();
 
-static void khugepaged_loop(void)
-{
-       struct page *hpage;
+       if (khugepaged_has_work()) {
+               if (!khugepaged_scan_sleep_millisecs)
+                       return;
 
-#ifdef CONFIG_NUMA
-       hpage = NULL;
-#endif
-       while (likely(khugepaged_enabled())) {
-#ifndef CONFIG_NUMA
-               hpage = khugepaged_alloc_hugepage();
-               if (unlikely(!hpage))
-                       break;
-#else
-               if (IS_ERR(hpage)) {
-                       khugepaged_alloc_sleep();
-                       hpage = NULL;
-               }
-#endif
-
-               khugepaged_do_scan(&hpage);
-#ifndef CONFIG_NUMA
-               if (hpage)
-                       put_page(hpage);
-#endif
-               try_to_freeze();
-               if (unlikely(kthread_should_stop()))
-                       break;
-               if (khugepaged_has_work()) {
-                       if (!khugepaged_scan_sleep_millisecs)
-                               continue;
-                       wait_event_freezable_timeout(khugepaged_wait, false,
-                           msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
-               } else if (khugepaged_enabled())
-                       wait_event_freezable(khugepaged_wait,
-                                            khugepaged_wait_event());
+               wait_event_freezable_timeout(khugepaged_wait,
+                                            kthread_should_stop(),
+                       msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
+               return;
        }
+
+       if (khugepaged_enabled())
+               wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
 }
 
 static int khugepaged(void *none)
@@ -2346,20 +2342,9 @@ static int khugepaged(void *none)
        set_freezable();
        set_user_nice(current, 19);
 
-       /* serialize with start_khugepaged() */
-       mutex_lock(&khugepaged_mutex);
-
-       for (;;) {
-               mutex_unlock(&khugepaged_mutex);
-               VM_BUG_ON(khugepaged_thread != current);
-               khugepaged_loop();
-               VM_BUG_ON(khugepaged_thread != current);
-
-               mutex_lock(&khugepaged_mutex);
-               if (!khugepaged_enabled())
-                       break;
-               if (unlikely(kthread_should_stop()))
-                       break;
+       while (!kthread_should_stop()) {
+               khugepaged_do_scan();
+               khugepaged_wait_work();
        }
 
        spin_lock(&khugepaged_mm_lock);
@@ -2368,10 +2353,6 @@ static int khugepaged(void *none)
        if (mm_slot)
                collect_mm_slot(mm_slot);
        spin_unlock(&khugepaged_mm_lock);
-
-       khugepaged_thread = NULL;
-       mutex_unlock(&khugepaged_mutex);
-
        return 0;
 }