mm: vmscan: stall page reclaim after a list of pages has been processed
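
In outline: when kswapd finds a page under writeback that is already marked
PageReclaim(), shrink_page_list() no longer unlocks the page, calls
congestion_wait(BLK_RW_ASYNC, HZ/10) and clears ZONE_WRITEBACK in the middle
of the list walk. It now counts such pages in a new nr_immediate counter and
reports the total through a new ret_nr_immediate parameter.
shrink_inactive_list() then decides once, after the whole isolated page list
has been processed, whether to stall: under global reclaim it sets
ZONE_TAIL_LRU_DIRTY when every page taken was dirty but not queued for IO,
and calls congestion_wait(BLK_RW_ASYNC, HZ/10) when either that condition
holds or any pages were counted as immediate-reclaim writeback. A simplified,
self-contained sketch of this "count during the walk, stall once afterwards"
flow is appended after the diff.
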
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 999ef0b9399ae0e31ec579da43e6c9a76c380cc5..5b1a79c8f0cb605d4b965d0307d40f5f23d58a23 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -697,6 +697,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                      enum ttu_flags ttu_flags,
                                      unsigned long *ret_nr_unqueued_dirty,
                                      unsigned long *ret_nr_writeback,
+                                     unsigned long *ret_nr_immediate,
                                      bool force_reclaim)
 {
        LIST_HEAD(ret_pages);
@@ -707,6 +708,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
        unsigned long nr_congested = 0;
        unsigned long nr_reclaimed = 0;
        unsigned long nr_writeback = 0;
+       unsigned long nr_immediate = 0;
 
        cond_resched();
 
@@ -773,8 +775,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 *    IO can complete. Waiting on the page itself risks an
                 *    indefinite stall if it is impossible to writeback the
                 *    page due to IO error or disconnected storage so instead
-                *    block for HZ/10 or until some IO completes then clear the
-                *    ZONE_WRITEBACK flag to recheck if the condition exists.
+                *    note that the LRU is being scanned too quickly and the
+                *    caller can stall after the page list has been processed.
                 *
                 * 2) Global reclaim encounters a page, memcg encounters a
                 *    page that is not marked for immediate reclaim or
@@ -804,10 +806,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        if (current_is_kswapd() &&
                            PageReclaim(page) &&
                            zone_is_reclaim_writeback(zone)) {
-                               unlock_page(page);
-                               congestion_wait(BLK_RW_ASYNC, HZ/10);
-                               zone_clear_flag(zone, ZONE_WRITEBACK);
-                               goto keep;
+                               nr_immediate++;
+                               goto keep_locked;
 
                        /* Case 2 above */
                        } else if (global_reclaim(sc) ||
@@ -1033,6 +1033,7 @@ keep:
        mem_cgroup_uncharge_end();
        *ret_nr_unqueued_dirty += nr_unqueued_dirty;
        *ret_nr_writeback += nr_writeback;
+       *ret_nr_immediate += nr_immediate;
        return nr_reclaimed;
 }
 
@@ -1044,7 +1045,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
                .priority = DEF_PRIORITY,
                .may_unmap = 1,
        };
-       unsigned long ret, dummy1, dummy2;
+       unsigned long ret, dummy1, dummy2, dummy3;
        struct page *page, *next;
        LIST_HEAD(clean_pages);
 
@@ -1057,7 +1058,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 
        ret = shrink_page_list(&clean_pages, zone, &sc,
                                TTU_UNMAP|TTU_IGNORE_ACCESS,
-                               &dummy1, &dummy2, true);
+                               &dummy1, &dummy2, &dummy3, true);
        list_splice(&clean_pages, page_list);
        __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
        return ret;
@@ -1353,6 +1354,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        unsigned long nr_taken;
        unsigned long nr_unqueued_dirty = 0;
        unsigned long nr_writeback = 0;
+       unsigned long nr_immediate = 0;
        isolate_mode_t isolate_mode = 0;
        int file = is_file_lru(lru);
        struct zone *zone = lruvec_zone(lruvec);
@@ -1394,7 +1396,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                return 0;
 
        nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
-                               &nr_unqueued_dirty, &nr_writeback, false);
+                       &nr_unqueued_dirty, &nr_writeback, &nr_immediate,
+                       false);
 
        spin_lock_irq(&zone->lru_lock);
 
@@ -1447,14 +1450,28 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        }
 
        /*
-        * Similarly, if many dirty pages are encountered that are not
-        * currently being written then flag that kswapd should start
-        * writing back pages and stall to give a chance for flushers
-        * to catch up.
+        * memcg will stall in page writeback so only consider forcibly
+        * stalling for global reclaim
         */
-       if (global_reclaim(sc) && nr_unqueued_dirty == nr_taken) {
-               congestion_wait(BLK_RW_ASYNC, HZ/10);
-               zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
+       if (global_reclaim(sc)) {
+               /*
+                * If dirty pages are scanned that are not queued for IO, it
+                * implies that flushers are not keeping up. In this case, flag
+                * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
+                * pages from reclaim context. It will forcibly stall in the
+                * next check.
+                */
+               if (nr_unqueued_dirty == nr_taken)
+                       zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
+
+               /*
+                * In addition, if kswapd scans pages marked for
+                * immediate reclaim and under writeback (nr_immediate), it
+                * implies that pages are cycling through the LRU faster than
+                * they are written so also forcibly stall.
+                */
+               if (nr_unqueued_dirty == nr_taken || nr_immediate)
+                       congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
 
        trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
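
For illustration only, below is a minimal userspace C sketch of the flow
described above: the list walk counts problem pages instead of sleeping per
page, and the caller takes a single bounded stall afterwards. The names here
(fake_page, walk_page_list) and the usleep() call are made-up stand-ins for
struct page, shrink_page_list() and congestion_wait(BLK_RW_ASYNC, HZ/10);
this is not kernel code.

/*
 * Userspace model of "count during the list walk, stall once afterwards".
 * fake_page and walk_page_list() are stand-ins, not kernel structures.
 */
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct fake_page {
	bool under_writeback;	/* models PageWriteback() */
	bool reclaim_marked;	/* models PageReclaim() */
};

/*
 * Walk the isolated list. Pages that are under writeback and already
 * marked for reclaim are only counted; the walk never sleeps on them.
 */
static unsigned long walk_page_list(struct fake_page *pages, unsigned long n,
				    unsigned long *nr_immediate)
{
	unsigned long nr_reclaimed = 0;
	unsigned long i;

	for (i = 0; i < n; i++) {
		if (pages[i].under_writeback && pages[i].reclaim_marked) {
			(*nr_immediate)++;	/* note it, keep the page, move on */
			continue;
		}
		nr_reclaimed++;			/* pretend everything else is freed */
	}
	return nr_reclaimed;
}

int main(void)
{
	struct fake_page pages[] = {
		{ .under_writeback = true,  .reclaim_marked = true  },
		{ .under_writeback = false, .reclaim_marked = false },
		{ .under_writeback = true,  .reclaim_marked = true  },
		{ .under_writeback = false, .reclaim_marked = false },
	};
	unsigned long nr_immediate = 0;
	unsigned long nr_reclaimed;

	nr_reclaimed = walk_page_list(pages, 4, &nr_immediate);

	/*
	 * Caller-side stall, analogous to the congestion_wait(BLK_RW_ASYNC,
	 * HZ/10) call in shrink_inactive_list(): one bounded sleep after the
	 * whole list has been processed, and only if the walk saw pressure.
	 */
	if (nr_immediate)
		usleep(100 * 1000);	/* roughly HZ/10 with HZ=1000 */

	printf("reclaimed=%lu, immediate=%lu\n", nr_reclaimed, nr_immediate);
	return 0;
}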