mm: numa: Do not group on RO pages

[~andy/linux] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index c1c6d59b2b03cb7ae6b20d813c804f40d7b3a678..eba846bcf124fc61f390749aa0861e8d0574eba5 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
  
  #include "internal.h"
  
-#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
-#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
+#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
  #endif
  
  #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -1481,7 +1481,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
         if (pud_none(*pud))
                 goto no_page_table;
         if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
-               BUG_ON(flags & FOLL_GET);
+               if (flags & FOLL_GET)
+                       goto out;
                 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
                 goto out;
         }
@@ -1492,8 +1493,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
         if (pmd_none(*pmd))
                 goto no_page_table;
         if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
-               BUG_ON(flags & FOLL_GET);
                 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
+               if (flags & FOLL_GET) {
+                       /*
+                        * Refcounts on tail pages are not well-defined and
+                        * shouldn't be taken. The caller should handle a NULL
+                        * return when trying to follow tail pages.
+                        */
+                       if (PageHead(page))
+                               get_page(page);
+                       else {
+                               page = NULL;
+                               goto out;
+                       }
+               }
                 goto out;
         }
         if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
@@ -2706,6 +2719,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 get_page(dirty_page);
  
  reuse:
+               /*
+                * Clear the page's cpupid information as the existing
+                * information potentially belongs to a now completely
+                * unrelated process.
+                */
+               if (old_page)
+                       page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
+
                 flush_cache_page(vma, address, pte_pfn(orig_pte));
                 entry = pte_mkyoung(orig_pte);
                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3506,12 +3527,12 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  }
  
  int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
-                               unsigned long addr, int current_nid)
+                               unsigned long addr, int page_nid)
  {
         get_page(page);
  
         count_vm_numa_event(NUMA_HINT_FAULTS);
-       if (current_nid == numa_node_id())
+       if (page_nid == numa_node_id())
                 count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
  
         return mpol_misplaced(page, vma, addr);
@@ -3522,9 +3543,11 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
  {
         struct page *page = NULL;
         spinlock_t *ptl;
-       int current_nid = -1;
+       int page_nid = -1;
+       int last_cpupid;
         int target_nid;
         bool migrated = false;
+       int flags = 0;
  
         /*
         * The "pte" at this point cannot be used safely without
@@ -3551,28 +3574,35 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 pte_unmap_unlock(ptep, ptl);
                 return 0;
         }
+       BUG_ON(is_zero_pfn(page_to_pfn(page)));
  
-       current_nid = page_to_nid(page);
-       target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+       /*
+        * Avoid grouping on DSO/COW pages in particular and RO pages
+        * in general. RO pages shouldn't hurt as much anyway since
+        * they can be in shared cache state.
+        */
+       if (!pte_write(pte))
+               flags |= TNF_NO_GROUP;
+
+       last_cpupid = page_cpupid_last(page);
+       page_nid = page_to_nid(page);
+       target_nid = numa_migrate_prep(page, vma, addr, page_nid);
         pte_unmap_unlock(ptep, ptl);
         if (target_nid == -1) {
-               /*
-                * Account for the fault against the current node if it not
-                * being replaced regardless of where the page is located.
-                */
-               current_nid = numa_node_id();
                 put_page(page);
                 goto out;
         }
  
         /* Migrate to the requested node */
-       migrated = migrate_misplaced_page(page, target_nid);
-       if (migrated)
-               current_nid = target_nid;
+       migrated = migrate_misplaced_page(page, vma, target_nid);
+       if (migrated) {
+               page_nid = target_nid;
+               flags |= TNF_MIGRATED;
+       }
  
  out:
-       if (current_nid != -1)
-               task_numa_fault(current_nid, 1, migrated);
+       if (page_nid != -1)
+               task_numa_fault(last_cpupid, page_nid, 1, flags);
         return 0;
  }
  
@@ -3587,7 +3617,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         unsigned long offset;
         spinlock_t *ptl;
         bool numa = false;
-       int local_nid = numa_node_id();
+       int last_cpupid;
  
         spin_lock(&mm->page_table_lock);
         pmd = *pmdp;
@@ -3610,9 +3640,11 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
         for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
                 pte_t pteval = *pte;
                 struct page *page;
-               int curr_nid = local_nid;
+               int page_nid = -1;
                 int target_nid;
-               bool migrated;
+               bool migrated = false;
+               int flags = 0;
+
                 if (!pte_present(pteval))
                         continue;
                 if (!pte_numa(pteval))
@@ -3630,29 +3662,31 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 page = vm_normal_page(vma, addr, pteval);
                 if (unlikely(!page))
                         continue;
-               /* only check non-shared pages */
-               if (unlikely(page_mapcount(page) != 1))
-                       continue;
  
                 /*
-                * Note that the NUMA fault is later accounted to either
-                * the node that is currently running or where the page is
-                * migrated to.
+                * Avoid grouping on DSO/COW pages in particular and RO pages
+                * in general. RO pages shouldn't hurt as much anyway since
+                * they can be in shared cache state.
                  */
-               curr_nid = local_nid;
-               target_nid = numa_migrate_prep(page, vma, addr,
-                                              page_to_nid(page));
-               if (target_nid == -1) {
+               if (!pte_write(pteval))
+                       flags |= TNF_NO_GROUP;
+
+               last_cpupid = page_cpupid_last(page);
+               page_nid = page_to_nid(page);
+               target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+               pte_unmap_unlock(pte, ptl);
+               if (target_nid != -1) {
+                       migrated = migrate_misplaced_page(page, vma, target_nid);
+                       if (migrated) {
+                               page_nid = target_nid;
+                               flags |= TNF_MIGRATED;
+                       }
+               } else {
                         put_page(page);
-                       continue;
                 }
  
-               /* Migrate to the requested node */
-               pte_unmap_unlock(pte, ptl);
-               migrated = migrate_misplaced_page(page, target_nid);
-               if (migrated)
-                       curr_nid = target_nid;
-               task_numa_fault(curr_nid, 1, migrated);
+               if (page_nid != -1)
+                       task_numa_fault(last_cpupid, page_nid, 1, flags);
  
                 pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
         }
@@ -3682,7 +3716,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
   * but allow concurrent faults), and pte mapped but not yet locked.
   * We return with mmap_sem still held, but pte unmapped and unlocked.
   */
-int handle_pte_fault(struct mm_struct *mm,
+static int handle_pte_fault(struct mm_struct *mm,
                      struct vm_area_struct *vma, unsigned long address,
                      pte_t *pte, pmd_t *pmd, unsigned int flags)
  {
@@ -3741,22 +3775,14 @@ unlock:
  /*
   * By the time we get here, we already hold the mm semaphore
   */
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-               unsigned long address, unsigned int flags)
+static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+                            unsigned long address, unsigned int flags)
  {
         pgd_t *pgd;
         pud_t *pud;
         pmd_t *pmd;
         pte_t *pte;
  
-       __set_current_state(TASK_RUNNING);
-
-       count_vm_event(PGFAULT);
-       mem_cgroup_count_vm_event(mm, PGFAULT);
-
-       /* do counter updates before entering really critical section. */
-       check_sync_rss_stat(current);
-
         if (unlikely(is_vm_hugetlb_page(vma)))
                 return hugetlb_fault(mm, vma, address, flags);
  
@@ -3769,9 +3795,12 @@ retry:
         if (!pmd)
                 return VM_FAULT_OOM;
         if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
+               int ret = VM_FAULT_FALLBACK;
                 if (!vma->vm_ops)
-                       return do_huge_pmd_anonymous_page(mm, vma, address,
-                                                         pmd, flags);
+                       ret = do_huge_pmd_anonymous_page(mm, vma, address,
+                                       pmd, flags);
+               if (!(ret & VM_FAULT_FALLBACK))
+                       return ret;
         } else {
                 pmd_t orig_pmd = *pmd;
                 int ret;
@@ -3837,6 +3866,37 @@ retry:
         return handle_pte_fault(mm, vma, address, pte, pmd, flags);
  }
  
+int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+                   unsigned long address, unsigned int flags)
+{
+       int ret;
+
+       __set_current_state(TASK_RUNNING);
+
+       count_vm_event(PGFAULT);
+       mem_cgroup_count_vm_event(mm, PGFAULT);
+
+       /* do counter updates before entering really critical section. */
+       check_sync_rss_stat(current);
+
+       /*
+        * Enable the memcg OOM handling for faults triggered in user
+        * space.  Kernel faults are handled more gracefully.
+        */
+       if (flags & FAULT_FLAG_USER)
+               mem_cgroup_enable_oom();
+
+       ret = __handle_mm_fault(mm, vma, address, flags);
+
+       if (flags & FAULT_FLAG_USER)
+               mem_cgroup_disable_oom();
+
+       if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
+               mem_cgroup_oom_synchronize();
+
+       return ret;
+}
+
  #ifndef __PAGETABLE_PUD_FOLDED
  /*
   * Allocate page upper directory.