shmem: replace page if mapping excludes its zone

author Hugh Dickins <hughd@google.com>

Tue, 29 May 2012 22:06:38 +0000 (15:06 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 29 May 2012 23:22:22 +0000 (16:22 -0700)
author Hugh Dickins <hughd@google.com>
Tue, 29 May 2012 22:06:38 +0000 (15:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 29 May 2012 23:22:22 +0000 (16:22 -0700)
diff --git a/include/linux/swap.h b/include/linux/swap.h

index bc3073ce95cc521bb1ca219b8a7d32a51087d813..d965c4bfab3ae447cb93be2c24cf653dd5948de5 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -351,6 +351,7 @@ extern int swap_type_of(dev_t, sector_t, struct block_device **);
  extern unsigned int count_swap_pages(int, int);
  extern sector_t map_swap_page(struct page *, struct block_device **);
  extern sector_t swapdev_block(int, pgoff_t);
+extern int page_swapcount(struct page *);
  extern int reuse_swap_page(struct page *);
  extern int try_to_free_swap(struct page *);
  struct backing_dev_info;
@@ -445,6 +446,11 @@ static inline void delete_from_swap_cache(struct page *page)
  {
  }
  
+static inline int page_swapcount(struct page *page)
+{
+       return 0;       /* stub: ignores page, always reports zero swap references */
+}
+
  #define reuse_swap_page(page)  (page_mapcount(page) == 1)
  
  static inline int try_to_free_swap(struct page *page)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 4f71219cc53e955bc78612de37b94abce81f6aed..d7ce417cae7c9b6b73f0001ee3fa938b3c73419d 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3373,7 +3373,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
  void mem_cgroup_replace_page_cache(struct page *oldpage,
                                   struct page *newpage)
  {
-       struct mem_cgroup *memcg;
+       struct mem_cgroup *memcg = NULL;
         struct page_cgroup *pc;
         enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
  
@@ -3383,11 +3383,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
         pc = lookup_page_cgroup(oldpage);
         /* fix accounting on old pages */
         lock_page_cgroup(pc);
-       memcg = pc->mem_cgroup;
-       mem_cgroup_charge_statistics(memcg, false, -1);
-       ClearPageCgroupUsed(pc);
+       if (PageCgroupUsed(pc)) {
+               memcg = pc->mem_cgroup;
+               mem_cgroup_charge_statistics(memcg, false, -1);
+               ClearPageCgroupUsed(pc);
+       }
         unlock_page_cgroup(pc);
  
+       /*
+        * When called from shmem_replace_page(), in some cases the
+        * oldpage has already been charged, and in some cases not.
+        */
+       if (!memcg)
+               return;
+
         if (PageSwapBacked(oldpage))
                 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
  
diff --git a/mm/shmem.c b/mm/shmem.c

index be5af34a070dcdb73e9cad2accea606aeae53336..db72d8e44ec62d3d7871deb22c5bef24b5a86866 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -103,6 +103,9 @@ static unsigned long shmem_default_max_inodes(void)
  }
  #endif
  
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+                               struct shmem_inode_info *info, pgoff_t index);
  static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
         struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
  
@@ -604,12 +607,13 @@ static void shmem_evict_inode(struct inode *inode)
   * If swap found in inode, free it and move page from swapcache to filecache.
   */
  static int shmem_unuse_inode(struct shmem_inode_info *info,
-                            swp_entry_t swap, struct page *page)
+                            swp_entry_t swap, struct page **pagep)
  {
         struct address_space *mapping = info->vfs_inode.i_mapping;
         void *radswap;
         pgoff_t index;
-       int error;
+       gfp_t gfp;
+       int error = 0;
  
         radswap = swp_to_radix_entry(swap);
         index = radix_tree_locate_item(&mapping->page_tree, radswap);
@@ -625,22 +629,37 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
         if (shmem_swaplist.next != &info->swaplist)
                 list_move_tail(&shmem_swaplist, &info->swaplist);
  
+       gfp = mapping_gfp_mask(mapping);
+       if (shmem_should_replace_page(*pagep, gfp)) {
+               mutex_unlock(&shmem_swaplist_mutex);
+               error = shmem_replace_page(pagep, gfp, info, index);
+               mutex_lock(&shmem_swaplist_mutex);
+               /*
+                * We needed to drop mutex to make that restrictive page
+                * allocation; but the inode might already be freed by now,
+                * and we cannot refer to inode or mapping or info to check.
+                * However, we do hold page lock on the PageSwapCache page,
+                * so can check if that still has our reference remaining.
+                */
+               if (!page_swapcount(*pagep))
+                       error = -ENOENT;
+       }
+
         /*
          * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
          * but also to hold up shmem_evict_inode(): so inode cannot be freed
          * beneath us (pagelock doesn't help until the page is in pagecache).
          */
-       error = shmem_add_to_page_cache(page, mapping, index,
+       if (!error)
+               error = shmem_add_to_page_cache(*pagep, mapping, index,
                                                 GFP_NOWAIT, radswap);
-       /* which does mem_cgroup_uncharge_cache_page on error */
-
         if (error != -ENOMEM) {
                 /*
                  * Truncation and eviction use free_swap_and_cache(), which
                  * only does trylock page: if we raced, best clean up here.
                  */
-               delete_from_swap_cache(page);
-               set_page_dirty(page);
+               delete_from_swap_cache(*pagep);
+               set_page_dirty(*pagep);
                 if (!error) {
                         spin_lock(&info->lock);
                         info->swapped--;
@@ -660,7 +679,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
         struct list_head *this, *next;
         struct shmem_inode_info *info;
         int found = 0;
-       int error;
+       int error = 0;
+
+       /*
+        * There's a faint possibility that swap page was replaced before
+        * caller locked it: it will come back later with the right page.
+        */
+       if (unlikely(!PageSwapCache(page)))
+               goto out;
  
         /*
          * Charge page using GFP_KERNEL while we can wait, before taking
@@ -676,7 +702,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
         list_for_each_safe(this, next, &shmem_swaplist) {
                 info = list_entry(this, struct shmem_inode_info, swaplist);
                 if (info->swapped)
-                       found = shmem_unuse_inode(info, swap, page);
+                       found = shmem_unuse_inode(info, swap, &page);
                 else
                         list_del_init(&info->swaplist);
                 cond_resched();
@@ -685,8 +711,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
         }
         mutex_unlock(&shmem_swaplist_mutex);
  
-       if (!found)
-               mem_cgroup_uncharge_cache_page(page);
         if (found < 0)
                 error = found;
  out:
@@ -855,6 +879,84 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
  }
  #endif
  
+/*
+ * A page headed from swapcache into shmem filecache - whether through the
+ * ordinary swapin path of shmem_getpage_gfp() or the rarer swapoff path of
+ * shmem_unuse_inode() - may have been read back from swap earlier, with
+ * no knowledge of which mapping owns it.  Should that mapping impose
+ * special constraints (the gma500 GEM driver, for one, needs RAM below
+ * 4GB), the data may have to be copied into a suitable page before the
+ * move to filecache.
+ *
+ * A later release may extend this check to honour cpuset and NUMA
+ * mempolicy as well, and apply it to anonymous pages in do_swap_page();
+ * for now only the zone is considered: the page needs replacing when
+ * page_zonenum(page) exceeds gfp_zone(gfp).
+ */
+static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+{
+       return gfp_zone(gfp) < page_zonenum(page);
+}
+
+/*
+ * Substitute a newly allocated page for the locked swapcache page *pagep.
+ *
+ * @pagep: in: the swapcache page to replace (the VM_BUG_ONs below expect
+ *         it locked, uptodate, swap-backed and in swapcache);
+ *         out: the replacement page, locked, on success.
+ * @gfp:   allocation mask; GFP_CONSTRAINT_MASK bits are cleared first.
+ * @info:  passed through to shmem_alloc_page().
+ * @index: passed through to shmem_alloc_page().
+ *
+ * The new page receives a copy of the old page's data, its page_private
+ * (the swap index) and the Uptodate/SwapBacked/SwapCache flags, and takes
+ * the old page's slot in the swap address_space.  The old page is then
+ * stripped of its swapcache identity, unlocked, and released twice.
+ *
+ * Returns 0 on success, or -ENOMEM if no page could be allocated, in
+ * which case *pagep and the old page are left as they were.
+ */
+static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+                               struct shmem_inode_info *info, pgoff_t index)
+{
+       struct page *oldpage, *newpage;
+       struct address_space *swap_mapping;
+       pgoff_t swap_index;
+       int error;
+
+       oldpage = *pagep;
+       swap_index = page_private(oldpage);
+       swap_mapping = page_mapping(oldpage);
+
+       /*
+        * We have arrived here because our zones are constrained, so don't
+        * limit chance of success by further cpuset and node constraints.
+        */
+       gfp &= ~GFP_CONSTRAINT_MASK;
+       newpage = shmem_alloc_page(gfp, info, index);
+       if (!newpage)
+               return -ENOMEM;
+       VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
+
+       *pagep = newpage;
+       /* Extra reference on newpage, mirroring the two dropped on oldpage */
+       page_cache_get(newpage);
+       copy_highpage(newpage, oldpage);
+       /*
+        * The copy was made through a kernel mapping: flush it out, so that
+        * a later mapping of newpage at a different virtual address cannot
+        * see stale data on architectures with aliasing data caches.
+        */
+       flush_dcache_page(newpage);
+
+       /* Give newpage the same state that oldpage is required to have */
+       VM_BUG_ON(!PageLocked(oldpage));
+       __set_page_locked(newpage);
+       VM_BUG_ON(!PageUptodate(oldpage));
+       SetPageUptodate(newpage);
+       VM_BUG_ON(!PageSwapBacked(oldpage));
+       SetPageSwapBacked(newpage);
+       VM_BUG_ON(!swap_index);
+       set_page_private(newpage, swap_index);
+       VM_BUG_ON(!PageSwapCache(oldpage));
+       SetPageSwapCache(newpage);
+
+       /*
+        * Our caller will very soon move newpage out of swapcache, but it's
+        * a nice clean interface for us to replace oldpage by newpage there.
+        */
+       spin_lock_irq(&swap_mapping->tree_lock);
+       error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
+                                                                  newpage);
+       /* Zone statistics follow the page: the two may be in different zones */
+       __inc_zone_page_state(newpage, NR_FILE_PAGES);
+       __dec_zone_page_state(oldpage, NR_FILE_PAGES);
+       spin_unlock_irq(&swap_mapping->tree_lock);
+       BUG_ON(error);
+
+       mem_cgroup_replace_page_cache(oldpage, newpage);
+       lru_cache_add_anon(newpage);
+
+       /* oldpage is no longer in swapcache: clear its swapcache identity */
+       ClearPageSwapCache(oldpage);
+       set_page_private(oldpage, 0);
+
+       unlock_page(oldpage);
+       /*
+        * Two releases on purpose - presumably one for the reference the
+        * swapcache held and one for the reference our caller held on
+        * oldpage, which it gives up by switching to *pagep.
+        */
+       page_cache_release(oldpage);
+       page_cache_release(oldpage);
+       return 0;
+}
+
  /*
   * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
   *
@@ -923,19 +1025,20 @@ repeat:
  
                 /* We have to do this with page locked to prevent races */
                 lock_page(page);
+               if (!PageSwapCache(page) || page->mapping) {
+                       error = -EEXIST;        /* try again */
+                       goto failed;
+               }
                 if (!PageUptodate(page)) {
                         error = -EIO;
                         goto failed;
                 }
                 wait_on_page_writeback(page);
  
-               /* Someone may have already done it for us */
-               if (page->mapping) {
-                       if (page->mapping == mapping &&
-                           page->index == index)
-                               goto done;
-                       error = -EEXIST;
-                       goto failed;
+               if (shmem_should_replace_page(page, gfp)) {
+                       error = shmem_replace_page(&page, gfp, info, index);
+                       if (error)
+                               goto failed;
                 }
  
                 error = mem_cgroup_cache_charge(page, current->mm,
@@ -998,7 +1101,7 @@ repeat:
                 if (sgp == SGP_DIRTY)
                         set_page_dirty(page);
         }
-done:
+
         /* Perhaps the file has been truncated since we checked */
         if (sgp != SGP_WRITE &&
             ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
diff --git a/mm/swapfile.c b/mm/swapfile.c

index fafc26d1b1dc885d2541eda3bdc5705a4fe56012..b0c86e92f42cdbc8f1026820953796112d72283c 100644 (file)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -601,7 +601,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
   * This does not give an exact answer when swap count is continued,
   * but does include the high COUNT_CONTINUED flag to allow for that.
   */
-static inline int page_swapcount(struct page *page)
+int page_swapcount(struct page *page)
  {
         int count = 0;
         struct swap_info_struct *p;
author	Hugh Dickins <hughd@google.com>
	Tue, 29 May 2012 22:06:38 +0000 (15:06 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 29 May 2012 23:22:22 +0000 (16:22 -0700)
include/linux/swap.h		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history
mm/shmem.c		patch \| blob \| history
mm/swapfile.c		patch \| blob \| history