1 /* memcontrol.c - Memory Controller
2  *
3  * Copyright IBM Corporation, 2007
4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5  *
6  * Copyright 2007 OpenVZ SWsoft Inc
7  * Author: Pavel Emelianov <xemul@openvz.org>
8  *
9  * Memory thresholds
10  * Copyright (C) 2009 Nokia Corporation
11  * Author: Kirill A. Shutemov
12  *
13  * This program is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * This program is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  */
23
24 #include <linux/res_counter.h>
25 #include <linux/memcontrol.h>
26 #include <linux/cgroup.h>
27 #include <linux/mm.h>
28 #include <linux/hugetlb.h>
29 #include <linux/pagemap.h>
30 #include <linux/smp.h>
31 #include <linux/page-flags.h>
32 #include <linux/backing-dev.h>
33 #include <linux/bit_spinlock.h>
34 #include <linux/rcupdate.h>
35 #include <linux/limits.h>
36 #include <linux/export.h>
37 #include <linux/mutex.h>
38 #include <linux/rbtree.h>
39 #include <linux/slab.h>
40 #include <linux/swap.h>
41 #include <linux/swapops.h>
42 #include <linux/spinlock.h>
43 #include <linux/eventfd.h>
44 #include <linux/sort.h>
45 #include <linux/fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/vmalloc.h>
48 #include <linux/mm_inline.h>
49 #include <linux/page_cgroup.h>
50 #include <linux/cpu.h>
51 #include <linux/oom.h>
52 #include "internal.h"
53 #include <net/sock.h>
54 #include <net/tcp_memcontrol.h>
55
56 #include <asm/uaccess.h>
57
58 #include <trace/events/vmscan.h>
59
60 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
61 #define MEM_CGROUP_RECLAIM_RETRIES      5
62 struct mem_cgroup *root_mem_cgroup __read_mostly;
63
64 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
65 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
66 int do_swap_account __read_mostly;
67
68 /* to remember the boot option */
69 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
70 static int really_do_swap_account __initdata = 1;
71 #else
72 static int really_do_swap_account __initdata = 0;
73 #endif
74
75 #else
76 #define do_swap_account         (0)
77 #endif
78
79
80 /*
81  * Statistics for memory cgroup.
82  */
83 enum mem_cgroup_stat_index {
84         /*
85          * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
86          */
87         MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
88         MEM_CGROUP_STAT_RSS,       /* # of pages charged as anon rss */
89         MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
90         MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
91         MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
92         MEM_CGROUP_ON_MOVE,     /* someone is moving account between groups */
93         MEM_CGROUP_STAT_NSTATS,
94 };
95
96 enum mem_cgroup_events_index {
97         MEM_CGROUP_EVENTS_PGPGIN,       /* # of pages paged in */
98         MEM_CGROUP_EVENTS_PGPGOUT,      /* # of pages paged out */
99         MEM_CGROUP_EVENTS_COUNT,        /* # of pages paged in/out */
100         MEM_CGROUP_EVENTS_PGFAULT,      /* # of page-faults */
101         MEM_CGROUP_EVENTS_PGMAJFAULT,   /* # of major page-faults */
102         MEM_CGROUP_EVENTS_NSTATS,
103 };
104 /*
105  * Per memcg event counter is incremented at every pagein/pageout. With THP,
106  * it will be incremented by the number of pages. This counter is used
107  * to trigger some periodic events. This is straightforward and better
108  * than using jiffies etc. to handle periodic memcg events.
109  */
110 enum mem_cgroup_events_target {
111         MEM_CGROUP_TARGET_THRESH,
112         MEM_CGROUP_TARGET_SOFTLIMIT,
113         MEM_CGROUP_TARGET_NUMAINFO,
114         MEM_CGROUP_NTARGETS,
115 };
116 #define THRESHOLDS_EVENTS_TARGET (128)
117 #define SOFTLIMIT_EVENTS_TARGET (1024)
118 #define NUMAINFO_EVENTS_TARGET  (1024)
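/*
 * A rough sketch of how these targets are consumed (see memcg_check_events()
 * and __mem_cgroup_target_update() further down): pagein/pageout events are
 * accumulated in the per-cpu MEM_CGROUP_EVENTS_COUNT counter, and once it
 * passes a targets[MEM_CGROUP_TARGET_*] value the corresponding work runs
 * and the target is pushed forward, giving approximately:
 *
 *   threshold notification  every THRESHOLDS_EVENTS_TARGET (128) page events
 *   soft limit tree update  every SOFTLIMIT_EVENTS_TARGET (1024) page events
 *   NUMA info refresh       every NUMAINFO_EVENTS_TARGET (1024) page events
 */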
119
120 struct mem_cgroup_stat_cpu {
121         long count[MEM_CGROUP_STAT_NSTATS];
122         unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
123         unsigned long targets[MEM_CGROUP_NTARGETS];
124 };
125
126 struct mem_cgroup_reclaim_iter {
127         /* css_id of the last scanned hierarchy member */
128         int position;
129         /* scan generation, increased every round-trip */
130         unsigned int generation;
131 };
132
133 /*
134  * per-zone information in memory controller.
135  */
136 struct mem_cgroup_per_zone {
137         /*
138          * spin_lock to protect the per cgroup LRU
139          */
140         struct list_head        lists[NR_LRU_LISTS];
141         unsigned long           count[NR_LRU_LISTS];
142
143         struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
144
145         struct zone_reclaim_stat reclaim_stat;
146         struct rb_node          tree_node;      /* RB tree node */
147         unsigned long long      usage_in_excess;/* Set to the value by which */
148                                                 /* the soft limit is exceeded*/
149         bool                    on_tree;
150         struct mem_cgroup       *mem;           /* Back pointer, we cannot */
151                                                 /* use container_of        */
152 };
153 /* Macro for accessing counter */
154 #define MEM_CGROUP_ZSTAT(mz, idx)       ((mz)->count[(idx)])
155
156 struct mem_cgroup_per_node {
157         struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
158 };
159
160 struct mem_cgroup_lru_info {
161         struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
162 };
163
164 /*
165  * Cgroups above their limits are maintained in a RB-Tree, independent of
166  * their hierarchy representation
167  */
168
169 struct mem_cgroup_tree_per_zone {
170         struct rb_root rb_root;
171         spinlock_t lock;
172 };
173
174 struct mem_cgroup_tree_per_node {
175         struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
176 };
177
178 struct mem_cgroup_tree {
179         struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
180 };
181
182 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
183
184 struct mem_cgroup_threshold {
185         struct eventfd_ctx *eventfd;
186         u64 threshold;
187 };
188
189 /* For threshold */
190 struct mem_cgroup_threshold_ary {
191         /* An array index points to threshold just below usage. */
192         int current_threshold;
193         /* Size of entries[] */
194         unsigned int size;
195         /* Array of thresholds */
196         struct mem_cgroup_threshold entries[0];
197 };
198
199 struct mem_cgroup_thresholds {
200         /* Primary thresholds array */
201         struct mem_cgroup_threshold_ary *primary;
202         /*
203          * Spare threshold array.
204          * This is needed to make mem_cgroup_unregister_event() "never fail".
205          * It must be able to store at least primary->size - 1 entries.
206          */
207         struct mem_cgroup_threshold_ary *spare;
208 };
209
210 /* for OOM */
211 struct mem_cgroup_eventfd_list {
212         struct list_head list;
213         struct eventfd_ctx *eventfd;
214 };
215
216 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
217 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
218
219 /*
220  * The memory controller data structure. The memory controller controls both
221  * page cache and RSS per cgroup. We would eventually like to provide
222  * statistics based on the statistics developed by Rik van Riel for clock-pro,
223  * to help the administrator determine what knobs to tune.
224  *
225  * TODO: Add a water mark for the memory controller. Reclaim will begin when
226  * we hit the water mark. Maybe we should even add a low water mark, such that
227  * no reclaim occurs from a cgroup at its low water mark; this is
228  * a feature that will be implemented much later in the future.
229  */
230 struct mem_cgroup {
231         struct cgroup_subsys_state css;
232         /*
233          * the counter to account for memory usage
234          */
235         struct res_counter res;
236         /*
237          * the counter to account for mem+swap usage.
238          */
239         struct res_counter memsw;
240         /*
241          * Per cgroup active and inactive list, similar to the
242          * per zone LRU lists.
243          */
244         struct mem_cgroup_lru_info info;
245         int last_scanned_node;
246 #if MAX_NUMNODES > 1
247         nodemask_t      scan_nodes;
248         atomic_t        numainfo_events;
249         atomic_t        numainfo_updating;
250 #endif
251         /*
252          * Should the accounting and control be hierarchical, per subtree?
253          */
254         bool use_hierarchy;
255
256         bool            oom_lock;
257         atomic_t        under_oom;
258
259         atomic_t        refcnt;
260
261         int     swappiness;
262         /* OOM-Killer disable */
263         int             oom_kill_disable;
264
265         /* set when res.limit == memsw.limit */
266         bool            memsw_is_minimum;
267
268         /* protect arrays of thresholds */
269         struct mutex thresholds_lock;
270
271         /* thresholds for memory usage. RCU-protected */
272         struct mem_cgroup_thresholds thresholds;
273
274         /* thresholds for mem+swap usage. RCU-protected */
275         struct mem_cgroup_thresholds memsw_thresholds;
276
277         /* For oom notifier event fd */
278         struct list_head oom_notify;
279
280         /*
281          * Should we move charges of a task when a task is moved into this
282          * mem_cgroup ? And what type of charges should we move ?
283          */
284         unsigned long   move_charge_at_immigrate;
285         /*
286          * percpu counter.
287          */
288         struct mem_cgroup_stat_cpu *stat;
289         /*
290          * used when a cpu is offlined or other synchronizations
291          * See mem_cgroup_read_stat().
292          */
293         struct mem_cgroup_stat_cpu nocpu_base;
294         spinlock_t pcp_counter_lock;
295
296 #ifdef CONFIG_INET
297         struct tcp_memcontrol tcp_mem;
298 #endif
299 };
300
301 /* Stuff for moving charges at task migration. */
302 /*
303  * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
304  * left-shifted bitmap of these types.
305  */
306 enum move_type {
307         MOVE_CHARGE_TYPE_ANON,  /* private anonymous page and swap of it */
308         MOVE_CHARGE_TYPE_FILE,  /* file page(including tmpfs) and swap of it */
309         NR_MOVE_TYPE,
310 };
311
312 /* "mc" and its members are protected by cgroup_mutex */
313 static struct move_charge_struct {
314         spinlock_t        lock; /* for from, to */
315         struct mem_cgroup *from;
316         struct mem_cgroup *to;
317         unsigned long precharge;
318         unsigned long moved_charge;
319         unsigned long moved_swap;
320         struct task_struct *moving_task;        /* a task moving charges */
321         wait_queue_head_t waitq;                /* a waitq for other context */
322 } mc = {
323         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
324         .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
325 };
326
327 static bool move_anon(void)
328 {
329         return test_bit(MOVE_CHARGE_TYPE_ANON,
330                                         &mc.to->move_charge_at_immigrate);
331 }
332
333 static bool move_file(void)
334 {
335         return test_bit(MOVE_CHARGE_TYPE_FILE,
336                                         &mc.to->move_charge_at_immigrate);
337 }
338
339 /*
340  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
341  * limit reclaim to prevent infinite loops, if they ever occur.
342  */
343 #define MEM_CGROUP_MAX_RECLAIM_LOOPS            (100)
344 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
345
346 enum charge_type {
347         MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
348         MEM_CGROUP_CHARGE_TYPE_MAPPED,
349         MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
350         MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
351         MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
352         MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
353         NR_CHARGE_TYPE,
354 };
355
356 /* for encoding cft->private value on file */
357 #define _MEM                    (0)
358 #define _MEMSWAP                (1)
359 #define _OOM_TYPE               (2)
360 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
361 #define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
362 #define MEMFILE_ATTR(val)       ((val) & 0xffff)
363 /* Used for OOM notifier */
364 #define OOM_CONTROL             (0)
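/*
 * A quick worked example of the encoding above, using RES_USAGE from
 * res_counter.h: MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE) packs the counter type
 * into the upper 16 bits and the attribute into the lower 16 bits, i.e.
 * (1 << 16) | RES_USAGE. MEMFILE_TYPE() later recovers _MEMSWAP and
 * MEMFILE_ATTR() recovers RES_USAGE from cft->private.
 */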
365
366 /*
367  * Reclaim flags for mem_cgroup_hierarchical_reclaim
368  */
369 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT   0x0
370 #define MEM_CGROUP_RECLAIM_NOSWAP       (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
371 #define MEM_CGROUP_RECLAIM_SHRINK_BIT   0x1
372 #define MEM_CGROUP_RECLAIM_SHRINK       (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
373 #define MEM_CGROUP_RECLAIM_SOFT_BIT     0x2
374 #define MEM_CGROUP_RECLAIM_SOFT         (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
375
376 static void mem_cgroup_get(struct mem_cgroup *memcg);
377 static void mem_cgroup_put(struct mem_cgroup *memcg);
378
379 /* Writing them here to avoid exposing memcg's inner layout */
380 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
381 #ifdef CONFIG_INET
382 #include <net/sock.h>
383 #include <net/ip.h>
384
385 static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
386 void sock_update_memcg(struct sock *sk)
387 {
388         if (static_branch(&memcg_socket_limit_enabled)) {
389                 struct mem_cgroup *memcg;
390
391                 BUG_ON(!sk->sk_prot->proto_cgroup);
392
393                 /* Socket cloning can throw us here with sk_cgrp already
394                  * filled. It won't, however, necessarily happen from
395                  * process context, so the test for the root memcg given
396                  * the current task's memcg won't help us in this case.
397                  *
398                  * Respecting the original socket's memcg is a better
399                  * decision in this case.
400                  */
401                 if (sk->sk_cgrp) {
402                         BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
403                         mem_cgroup_get(sk->sk_cgrp->memcg);
404                         return;
405                 }
406
407                 rcu_read_lock();
408                 memcg = mem_cgroup_from_task(current);
409                 if (!mem_cgroup_is_root(memcg)) {
410                         mem_cgroup_get(memcg);
411                         sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
412                 }
413                 rcu_read_unlock();
414         }
415 }
416 EXPORT_SYMBOL(sock_update_memcg);
417
418 void sock_release_memcg(struct sock *sk)
419 {
420         if (static_branch(&memcg_socket_limit_enabled) && sk->sk_cgrp) {
421                 struct mem_cgroup *memcg;
422                 WARN_ON(!sk->sk_cgrp->memcg);
423                 memcg = sk->sk_cgrp->memcg;
424                 mem_cgroup_put(memcg);
425         }
426 }
427
428 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
429 {
430         if (!memcg || mem_cgroup_is_root(memcg))
431                 return NULL;
432
433         return &memcg->tcp_mem.cg_proto;
434 }
435 EXPORT_SYMBOL(tcp_proto_cgroup);
436 #endif /* CONFIG_INET */
437 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
438
439 static void drain_all_stock_async(struct mem_cgroup *memcg);
440
441 static struct mem_cgroup_per_zone *
442 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
443 {
444         return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
445 }
446
447 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
448 {
449         return &memcg->css;
450 }
451
452 static struct mem_cgroup_per_zone *
453 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
454 {
455         int nid = page_to_nid(page);
456         int zid = page_zonenum(page);
457
458         return mem_cgroup_zoneinfo(memcg, nid, zid);
459 }
460
461 static struct mem_cgroup_tree_per_zone *
462 soft_limit_tree_node_zone(int nid, int zid)
463 {
464         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
465 }
466
467 static struct mem_cgroup_tree_per_zone *
468 soft_limit_tree_from_page(struct page *page)
469 {
470         int nid = page_to_nid(page);
471         int zid = page_zonenum(page);
472
473         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
474 }
475
476 static void
477 __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
478                                 struct mem_cgroup_per_zone *mz,
479                                 struct mem_cgroup_tree_per_zone *mctz,
480                                 unsigned long long new_usage_in_excess)
481 {
482         struct rb_node **p = &mctz->rb_root.rb_node;
483         struct rb_node *parent = NULL;
484         struct mem_cgroup_per_zone *mz_node;
485
486         if (mz->on_tree)
487                 return;
488
489         mz->usage_in_excess = new_usage_in_excess;
490         if (!mz->usage_in_excess)
491                 return;
492         while (*p) {
493                 parent = *p;
494                 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
495                                         tree_node);
496                 if (mz->usage_in_excess < mz_node->usage_in_excess)
497                         p = &(*p)->rb_left;
498                 /*
499                  * We can't avoid mem cgroups that are over their soft
500                  * limit by the same amount
501                  */
502                 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
503                         p = &(*p)->rb_right;
504         }
505         rb_link_node(&mz->tree_node, parent, p);
506         rb_insert_color(&mz->tree_node, &mctz->rb_root);
507         mz->on_tree = true;
508 }
509
510 static void
511 __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
512                                 struct mem_cgroup_per_zone *mz,
513                                 struct mem_cgroup_tree_per_zone *mctz)
514 {
515         if (!mz->on_tree)
516                 return;
517         rb_erase(&mz->tree_node, &mctz->rb_root);
518         mz->on_tree = false;
519 }
520
521 static void
522 mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
523                                 struct mem_cgroup_per_zone *mz,
524                                 struct mem_cgroup_tree_per_zone *mctz)
525 {
526         spin_lock(&mctz->lock);
527         __mem_cgroup_remove_exceeded(memcg, mz, mctz);
528         spin_unlock(&mctz->lock);
529 }
530
531
532 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
533 {
534         unsigned long long excess;
535         struct mem_cgroup_per_zone *mz;
536         struct mem_cgroup_tree_per_zone *mctz;
537         int nid = page_to_nid(page);
538         int zid = page_zonenum(page);
539         mctz = soft_limit_tree_from_page(page);
540
541         /*
542          * Necessary to update all ancestors when hierarchy is used,
543          * because their event counters are not touched.
544          */
545         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
546                 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
547                 excess = res_counter_soft_limit_excess(&memcg->res);
548                 /*
549                  * We have to update the tree if mz is on RB-tree or
550                  * mem is over its softlimit.
551                  */
552                 if (excess || mz->on_tree) {
553                         spin_lock(&mctz->lock);
554                         /* if on-tree, remove it */
555                         if (mz->on_tree)
556                                 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
557                         /*
558                          * Insert again. mz->usage_in_excess will be updated.
559                          * If excess is 0, no tree ops.
560                          */
561                         __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
562                         spin_unlock(&mctz->lock);
563                 }
564         }
565 }
566
567 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
568 {
569         int node, zone;
570         struct mem_cgroup_per_zone *mz;
571         struct mem_cgroup_tree_per_zone *mctz;
572
573         for_each_node_state(node, N_POSSIBLE) {
574                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
575                         mz = mem_cgroup_zoneinfo(memcg, node, zone);
576                         mctz = soft_limit_tree_node_zone(node, zone);
577                         mem_cgroup_remove_exceeded(memcg, mz, mctz);
578                 }
579         }
580 }
581
582 static struct mem_cgroup_per_zone *
583 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
584 {
585         struct rb_node *rightmost = NULL;
586         struct mem_cgroup_per_zone *mz;
587
588 retry:
589         mz = NULL;
590         rightmost = rb_last(&mctz->rb_root);
591         if (!rightmost)
592                 goto done;              /* Nothing to reclaim from */
593
594         mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
595         /*
596          * Remove the node now, but someone else can add it back;
597          * we will add it back at the end of reclaim to its correct
598          * position in the tree.
599          */
600         __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
601         if (!res_counter_soft_limit_excess(&mz->mem->res) ||
602                 !css_tryget(&mz->mem->css))
603                 goto retry;
604 done:
605         return mz;
606 }
607
608 static struct mem_cgroup_per_zone *
609 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
610 {
611         struct mem_cgroup_per_zone *mz;
612
613         spin_lock(&mctz->lock);
614         mz = __mem_cgroup_largest_soft_limit_node(mctz);
615         spin_unlock(&mctz->lock);
616         return mz;
617 }
618
619 /*
620  * Implementation Note: reading percpu statistics for memcg.
621  *
622  * Both vmstat[] and percpu_counter use thresholds and do periodic
623  * synchronization to implement "quick" reads. There is a trade-off between
624  * reading cost and precision of the value, so we may eventually implement
625  * a similar periodic synchronization for memcg's counters.
626  *
627  * But this _read() function is used for the user interface now. The user
628  * accounts memory usage by memory cgroup and _always_ requires an exact
629  * value for that accounting. Even if we provided a quick-and-fuzzy read, we
630  * would still have to visit all online cpus and sum their values, so for
631  * now unnecessary synchronization is not implemented (it exists only for
632  * cpu hotplug).
633  *
634  * If there are kernel-internal users that can make use of a not-exact value,
635  * and reading all cpu values becomes a performance bottleneck in some common
636  * workload, thresholds and synchronization as in vmstat[] should be implemented.
637  */
638 static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
639                                  enum mem_cgroup_stat_index idx)
640 {
641         long val = 0;
642         int cpu;
643
644         get_online_cpus();
645         for_each_online_cpu(cpu)
646                 val += per_cpu(memcg->stat->count[idx], cpu);
647 #ifdef CONFIG_HOTPLUG_CPU
648         spin_lock(&memcg->pcp_counter_lock);
649         val += memcg->nocpu_base.count[idx];
650         spin_unlock(&memcg->pcp_counter_lock);
651 #endif
652         put_online_cpus();
653         return val;
654 }
655
656 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
657                                          bool charge)
658 {
659         int val = (charge) ? 1 : -1;
660         this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
661 }
662
663 void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)
664 {
665         this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
666 }
667
668 void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)
669 {
670         this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
671 }
672
673 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
674                                             enum mem_cgroup_events_index idx)
675 {
676         unsigned long val = 0;
677         int cpu;
678
679         for_each_online_cpu(cpu)
680                 val += per_cpu(memcg->stat->events[idx], cpu);
681 #ifdef CONFIG_HOTPLUG_CPU
682         spin_lock(&memcg->pcp_counter_lock);
683         val += memcg->nocpu_base.events[idx];
684         spin_unlock(&memcg->pcp_counter_lock);
685 #endif
686         return val;
687 }
688
689 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
690                                          bool file, int nr_pages)
691 {
692         preempt_disable();
693
694         if (file)
695                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
696                                 nr_pages);
697         else
698                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
699                                 nr_pages);
700
701         /* pagein of a big page is an event. So, ignore page size */
702         if (nr_pages > 0)
703                 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
704         else {
705                 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
706                 nr_pages = -nr_pages; /* for event */
707         }
708
709         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
710
711         preempt_enable();
712 }
713
714 unsigned long
715 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
716                         unsigned int lru_mask)
717 {
718         struct mem_cgroup_per_zone *mz;
719         enum lru_list l;
720         unsigned long ret = 0;
721
722         mz = mem_cgroup_zoneinfo(memcg, nid, zid);
723
724         for_each_lru(l) {
725                 if (BIT(l) & lru_mask)
726                         ret += MEM_CGROUP_ZSTAT(mz, l);
727         }
728         return ret;
729 }
730
731 static unsigned long
732 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
733                         int nid, unsigned int lru_mask)
734 {
735         u64 total = 0;
736         int zid;
737
738         for (zid = 0; zid < MAX_NR_ZONES; zid++)
739                 total += mem_cgroup_zone_nr_lru_pages(memcg,
740                                                 nid, zid, lru_mask);
741
742         return total;
743 }
744
745 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
746                         unsigned int lru_mask)
747 {
748         int nid;
749         u64 total = 0;
750
751         for_each_node_state(nid, N_HIGH_MEMORY)
752                 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
753         return total;
754 }
755
756 static bool __memcg_event_check(struct mem_cgroup *memcg, int target)
757 {
758         unsigned long val, next;
759
760         val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
761         next = __this_cpu_read(memcg->stat->targets[target]);
762         /* from time_after() in jiffies.h */
763         return ((long)next - (long)val < 0);
764 }
765
766 static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target)
767 {
768         unsigned long val, next;
769
770         val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
771
772         switch (target) {
773         case MEM_CGROUP_TARGET_THRESH:
774                 next = val + THRESHOLDS_EVENTS_TARGET;
775                 break;
776         case MEM_CGROUP_TARGET_SOFTLIMIT:
777                 next = val + SOFTLIMIT_EVENTS_TARGET;
778                 break;
779         case MEM_CGROUP_TARGET_NUMAINFO:
780                 next = val + NUMAINFO_EVENTS_TARGET;
781                 break;
782         default:
783                 return;
784         }
785
786         __this_cpu_write(memcg->stat->targets[target], next);
787 }
788
789 /*
790  * Check events in order.
791  *
792  */
793 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
794 {
795         preempt_disable();
796         /* threshold event is triggered in finer grain than soft limit */
797         if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) {
798                 mem_cgroup_threshold(memcg);
799                 __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH);
800                 if (unlikely(__memcg_event_check(memcg,
801                              MEM_CGROUP_TARGET_SOFTLIMIT))) {
802                         mem_cgroup_update_tree(memcg, page);
803                         __mem_cgroup_target_update(memcg,
804                                                    MEM_CGROUP_TARGET_SOFTLIMIT);
805                 }
806 #if MAX_NUMNODES > 1
807                 if (unlikely(__memcg_event_check(memcg,
808                         MEM_CGROUP_TARGET_NUMAINFO))) {
809                         atomic_inc(&memcg->numainfo_events);
810                         __mem_cgroup_target_update(memcg,
811                                 MEM_CGROUP_TARGET_NUMAINFO);
812                 }
813 #endif
814         }
815         preempt_enable();
816 }
817
818 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
819 {
820         return container_of(cgroup_subsys_state(cont,
821                                 mem_cgroup_subsys_id), struct mem_cgroup,
822                                 css);
823 }
824
825 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
826 {
827         /*
828          * mm_update_next_owner() may clear mm->owner to NULL
829          * if it races with swapoff, page migration, etc.
830          * So this can be called with p == NULL.
831          */
832         if (unlikely(!p))
833                 return NULL;
834
835         return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
836                                 struct mem_cgroup, css);
837 }
838
839 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
840 {
841         struct mem_cgroup *memcg = NULL;
842
843         if (!mm)
844                 return NULL;
845         /*
846          * Because we have no locks, mm->owner may be being moved to another
847          * cgroup. We use css_tryget() here even if this looks
848          * pessimistic (rather than adding locks here).
849          */
850         rcu_read_lock();
851         do {
852                 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
853                 if (unlikely(!memcg))
854                         break;
855         } while (!css_tryget(&memcg->css));
856         rcu_read_unlock();
857         return memcg;
858 }
859
860 struct mem_cgroup_reclaim_cookie {
861         struct zone *zone;
862         int priority;
863         unsigned int generation;
864 };
865
866 static struct mem_cgroup *
867 mem_cgroup_iter(struct mem_cgroup *root,
868                 struct mem_cgroup *prev,
869                 struct mem_cgroup_reclaim_cookie *reclaim)
870 {
871         struct mem_cgroup *memcg = NULL;
872         int id = 0;
873
874         if (!root)
875                 root = root_mem_cgroup;
876
877         if (prev && !reclaim)
878                 id = css_id(&prev->css);
879
880         if (prev && prev != root)
881                 css_put(&prev->css);
882
883         if (!root->use_hierarchy && root != root_mem_cgroup) {
884                 if (prev)
885                         return NULL;
886                 return root;
887         }
888
889         while (!memcg) {
890                 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
891                 struct cgroup_subsys_state *css;
892
893                 if (reclaim) {
894                         int nid = zone_to_nid(reclaim->zone);
895                         int zid = zone_idx(reclaim->zone);
896                         struct mem_cgroup_per_zone *mz;
897
898                         mz = mem_cgroup_zoneinfo(root, nid, zid);
899                         iter = &mz->reclaim_iter[reclaim->priority];
900                         if (prev && reclaim->generation != iter->generation)
901                                 return NULL;
902                         id = iter->position;
903                 }
904
905                 rcu_read_lock();
906                 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
907                 if (css) {
908                         if (css == &root->css || css_tryget(css))
909                                 memcg = container_of(css,
910                                                      struct mem_cgroup, css);
911                 } else
912                         id = 0;
913                 rcu_read_unlock();
914
915                 if (reclaim) {
916                         iter->position = id;
917                         if (!css)
918                                 iter->generation++;
919                         else if (!prev && memcg)
920                                 reclaim->generation = iter->generation;
921                 }
922
923                 if (prev && !css)
924                         return NULL;
925         }
926         return memcg;
927 }
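/*
 * A minimal usage sketch for the reclaim cookie; the surrounding loop and
 * variable names are only illustrative, the actual reclaim-side callers are
 * not shown here:
 *
 *   struct mem_cgroup_reclaim_cookie reclaim = {
 *           .zone = zone,
 *           .priority = priority,
 *   };
 *   struct mem_cgroup *memcg = mem_cgroup_iter(root, NULL, &reclaim);
 *   do {
 *           ... reclaim from memcg's LRU lists on this zone ...
 *           memcg = mem_cgroup_iter(root, memcg, &reclaim);
 *   } while (memcg);
 *
 * With a cookie, the iterator resumes from the shared per-zone, per-priority
 * position and bails out (returns NULL) once another reclaimer has bumped the
 * generation, so concurrent reclaimers collectively cover the hierarchy about
 * once per round trip.
 */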
928
929 static void mem_cgroup_iter_break(struct mem_cgroup *root,
930                                   struct mem_cgroup *prev)
931 {
932         if (!root)
933                 root = root_mem_cgroup;
934         if (prev && prev != root)
935                 css_put(&prev->css);
936 }
937
938 /*
939  * Iteration constructs for visiting all cgroups (under a tree).  If
940  * loops are exited prematurely (break), mem_cgroup_iter_break() must
941  * be used for reference counting.
942  */
943 #define for_each_mem_cgroup_tree(iter, root)            \
944         for (iter = mem_cgroup_iter(root, NULL, NULL);  \
945              iter != NULL;                              \
946              iter = mem_cgroup_iter(root, iter, NULL))
947
948 #define for_each_mem_cgroup(iter)                       \
949         for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
950              iter != NULL;                              \
951              iter = mem_cgroup_iter(NULL, iter, NULL))
952
953 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
954 {
955         return (memcg == root_mem_cgroup);
956 }
957
958 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
959 {
960         struct mem_cgroup *memcg;
961
962         if (!mm)
963                 return;
964
965         rcu_read_lock();
966         memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
967         if (unlikely(!memcg))
968                 goto out;
969
970         switch (idx) {
971         case PGMAJFAULT:
972                 mem_cgroup_pgmajfault(memcg, 1);
973                 break;
974         case PGFAULT:
975                 mem_cgroup_pgfault(memcg, 1);
976                 break;
977         default:
978                 BUG();
979         }
980 out:
981         rcu_read_unlock();
982 }
983 EXPORT_SYMBOL(mem_cgroup_count_vm_event);
984
985 /*
986  * The following LRU functions may be used without PCG_LOCK.
987  * Operations are called by the global LRU code independently of memcg.
988  * What we have to take care of here is the validity of pc->mem_cgroup.
989  *
990  * pc->mem_cgroup changes on
991  * 1. charge
992  * 2. moving account
993  * Typically, "charge" is done before add-to-LRU. The exception is SwapCache,
994  * which is added to the LRU before being charged.
995  * If the PCG_USED bit is not set, the page_cgroup is not added to this private LRU.
996  * When moving account, the page is not on the LRU; it is isolated.
997  */
998
999 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
1000 {
1001         struct page_cgroup *pc;
1002         struct mem_cgroup_per_zone *mz;
1003
1004         if (mem_cgroup_disabled())
1005                 return;
1006         pc = lookup_page_cgroup(page);
1007         /* can happen while we handle swapcache. */
1008         if (!TestClearPageCgroupAcctLRU(pc))
1009                 return;
1010         VM_BUG_ON(!pc->mem_cgroup);
1011         /*
1012          * We don't check PCG_USED bit. It's cleared when the "page" is finally
1013          * removed from global LRU.
1014          */
1015         mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1016         /* huge page split is done under lru_lock. so, we have no races. */
1017         MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
1018         if (mem_cgroup_is_root(pc->mem_cgroup))
1019                 return;
1020         VM_BUG_ON(list_empty(&pc->lru));
1021         list_del_init(&pc->lru);
1022 }
1023
1024 void mem_cgroup_del_lru(struct page *page)
1025 {
1026         mem_cgroup_del_lru_list(page, page_lru(page));
1027 }
1028
1029 /*
1030  * Writeback is about to end against a page which has been marked for immediate
1031  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
1032  * inactive list.
1033  */
1034 void mem_cgroup_rotate_reclaimable_page(struct page *page)
1035 {
1036         struct mem_cgroup_per_zone *mz;
1037         struct page_cgroup *pc;
1038         enum lru_list lru = page_lru(page);
1039
1040         if (mem_cgroup_disabled())
1041                 return;
1042
1043         pc = lookup_page_cgroup(page);
1044         /* unused or root page is not rotated. */
1045         if (!PageCgroupUsed(pc))
1046                 return;
1047         /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1048         smp_rmb();
1049         if (mem_cgroup_is_root(pc->mem_cgroup))
1050                 return;
1051         mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1052         list_move_tail(&pc->lru, &mz->lists[lru]);
1053 }
1054
1055 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
1056 {
1057         struct mem_cgroup_per_zone *mz;
1058         struct page_cgroup *pc;
1059
1060         if (mem_cgroup_disabled())
1061                 return;
1062
1063         pc = lookup_page_cgroup(page);
1064         /* unused or root page is not rotated. */
1065         if (!PageCgroupUsed(pc))
1066                 return;
1067         /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1068         smp_rmb();
1069         if (mem_cgroup_is_root(pc->mem_cgroup))
1070                 return;
1071         mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1072         list_move(&pc->lru, &mz->lists[lru]);
1073 }
1074
1075 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
1076 {
1077         struct page_cgroup *pc;
1078         struct mem_cgroup_per_zone *mz;
1079
1080         if (mem_cgroup_disabled())
1081                 return;
1082         pc = lookup_page_cgroup(page);
1083         VM_BUG_ON(PageCgroupAcctLRU(pc));
1084         /*
1085          * putback:                             charge:
1086          * SetPageLRU                           SetPageCgroupUsed
1087          * smp_mb                               smp_mb
1088          * PageCgroupUsed && add to memcg LRU   PageLRU && add to memcg LRU
1089          *
1090          * Ensure that one of the two sides adds the page to the memcg
1091          * LRU during a race.
1092          */
1093         smp_mb();
1094         if (!PageCgroupUsed(pc))
1095                 return;
1096         /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1097         smp_rmb();
1098         mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1099         /* huge page split is done under lru_lock. so, we have no races. */
1100         MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
1101         SetPageCgroupAcctLRU(pc);
1102         if (mem_cgroup_is_root(pc->mem_cgroup))
1103                 return;
1104         list_add(&pc->lru, &mz->lists[lru]);
1105 }
1106
1107 /*
1108  * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may change
1109  * while it's linked to the LRU because the page may be reused after it's
1110  * fully uncharged. To handle that, unlink the page_cgroup from the LRU when
1111  * charging it again. It's done under lock_page; zone->lru_lock is never held.
1112  */
1113 static void mem_cgroup_lru_del_before_commit(struct page *page)
1114 {
1115         unsigned long flags;
1116         struct zone *zone = page_zone(page);
1117         struct page_cgroup *pc = lookup_page_cgroup(page);
1118
1119         /*
1120          * Doing this check without taking ->lru_lock seems wrong but this
1121          * is safe, because if page_cgroup's USED bit is unset, the page
1122          * will not be added to any memcg's LRU. If page_cgroup's USED bit is
1123          * set, the commit after this will fail anyway.
1124          * All of this charge/uncharge is done under some mutual exclusion,
1125          * so we don't need to take care of changes in the USED bit.
1126          */
1127         if (likely(!PageLRU(page)))
1128                 return;
1129
1130         spin_lock_irqsave(&zone->lru_lock, flags);
1131         /*
1132          * Forget old LRU when this page_cgroup is *not* used. This Used bit
1133          * is guarded by lock_page() because the page is SwapCache.
1134          */
1135         if (!PageCgroupUsed(pc))
1136                 mem_cgroup_del_lru_list(page, page_lru(page));
1137         spin_unlock_irqrestore(&zone->lru_lock, flags);
1138 }
1139
1140 static void mem_cgroup_lru_add_after_commit(struct page *page)
1141 {
1142         unsigned long flags;
1143         struct zone *zone = page_zone(page);
1144         struct page_cgroup *pc = lookup_page_cgroup(page);
1145         /*
1146          * putback:                             charge:
1147          * SetPageLRU                           SetPageCgroupUsed
1148          * smp_mb                               smp_mb
1149          * PageCgroupUsed && add to memcg LRU   PageLRU && add to memcg LRU
1150          *
1151          * Ensure that one of the two sides adds the page to the memcg
1152          * LRU during a race.
1153          */
1154         smp_mb();
1155         /* handle the page being added to the LRU while we commit it */
1156         if (likely(!PageLRU(page)))
1157                 return;
1158         spin_lock_irqsave(&zone->lru_lock, flags);
1159         /* link when the page is linked to LRU but page_cgroup isn't */
1160         if (PageLRU(page) && !PageCgroupAcctLRU(pc))
1161                 mem_cgroup_add_lru_list(page, page_lru(page));
1162         spin_unlock_irqrestore(&zone->lru_lock, flags);
1163 }
1164
1165
1166 void mem_cgroup_move_lists(struct page *page,
1167                            enum lru_list from, enum lru_list to)
1168 {
1169         if (mem_cgroup_disabled())
1170                 return;
1171         mem_cgroup_del_lru_list(page, from);
1172         mem_cgroup_add_lru_list(page, to);
1173 }
1174
1175 /*
1176  * Checks whether given mem is same or in the root_mem_cgroup's
1177  * hierarchy subtree
1178  */
1179 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1180                 struct mem_cgroup *memcg)
1181 {
1182         if (root_memcg != memcg) {
1183                 return (root_memcg->use_hierarchy &&
1184                         css_is_ancestor(&memcg->css, &root_memcg->css));
1185         }
1186
1187         return true;
1188 }
1189
1190 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1191 {
1192         int ret;
1193         struct mem_cgroup *curr = NULL;
1194         struct task_struct *p;
1195
1196         p = find_lock_task_mm(task);
1197         if (!p)
1198                 return 0;
1199         curr = try_get_mem_cgroup_from_mm(p->mm);
1200         task_unlock(p);
1201         if (!curr)
1202                 return 0;
1203         /*
1204          * We should check use_hierarchy of "memcg", not "curr", because checking
1205          * use_hierarchy of "curr" here would make this function return true if
1206          * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the
1207          * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg").
1208          */
1209         ret = mem_cgroup_same_or_subtree(memcg, curr);
1210         css_put(&curr->css);
1211         return ret;
1212 }
1213
1214 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
1215 {
1216         unsigned long inactive_ratio;
1217         int nid = zone_to_nid(zone);
1218         int zid = zone_idx(zone);
1219         unsigned long inactive;
1220         unsigned long active;
1221         unsigned long gb;
1222
1223         inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1224                                                 BIT(LRU_INACTIVE_ANON));
1225         active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1226                                               BIT(LRU_ACTIVE_ANON));
1227
1228         gb = (inactive + active) >> (30 - PAGE_SHIFT);
1229         if (gb)
1230                 inactive_ratio = int_sqrt(10 * gb);
1231         else
1232                 inactive_ratio = 1;
1233
1234         return inactive * inactive_ratio < active;
1235 }
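/*
 * A quick worked example of the ratio above: with 10GB of anon pages in this
 * memcg/zone, gb = 10 and inactive_ratio = int_sqrt(100) = 10, so the
 * inactive list is considered low once it holds less than about a tenth of
 * the active anon pages. With 1GB, inactive_ratio = int_sqrt(10) = 3.
 */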
1236
1237 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
1238 {
1239         unsigned long active;
1240         unsigned long inactive;
1241         int zid = zone_idx(zone);
1242         int nid = zone_to_nid(zone);
1243
1244         inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1245                                                 BIT(LRU_INACTIVE_FILE));
1246         active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1247                                               BIT(LRU_ACTIVE_FILE));
1248
1249         return (active > inactive);
1250 }
1251
1252 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1253                                                       struct zone *zone)
1254 {
1255         int nid = zone_to_nid(zone);
1256         int zid = zone_idx(zone);
1257         struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1258
1259         return &mz->reclaim_stat;
1260 }
1261
1262 struct zone_reclaim_stat *
1263 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1264 {
1265         struct page_cgroup *pc;
1266         struct mem_cgroup_per_zone *mz;
1267
1268         if (mem_cgroup_disabled())
1269                 return NULL;
1270
1271         pc = lookup_page_cgroup(page);
1272         if (!PageCgroupUsed(pc))
1273                 return NULL;
1274         /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1275         smp_rmb();
1276         mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1277         return &mz->reclaim_stat;
1278 }
1279
1280 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1281                                         struct list_head *dst,
1282                                         unsigned long *scanned, int order,
1283                                         isolate_mode_t mode,
1284                                         struct zone *z,
1285                                         struct mem_cgroup *mem_cont,
1286                                         int active, int file)
1287 {
1288         unsigned long nr_taken = 0;
1289         struct page *page;
1290         unsigned long scan;
1291         LIST_HEAD(pc_list);
1292         struct list_head *src;
1293         struct page_cgroup *pc, *tmp;
1294         int nid = zone_to_nid(z);
1295         int zid = zone_idx(z);
1296         struct mem_cgroup_per_zone *mz;
1297         int lru = LRU_FILE * file + active;
1298         int ret;
1299
1300         BUG_ON(!mem_cont);
1301         mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1302         src = &mz->lists[lru];
1303
1304         scan = 0;
1305         list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1306                 if (scan >= nr_to_scan)
1307                         break;
1308
1309                 if (unlikely(!PageCgroupUsed(pc)))
1310                         continue;
1311
1312                 page = lookup_cgroup_page(pc);
1313
1314                 if (unlikely(!PageLRU(page)))
1315                         continue;
1316
1317                 scan++;
1318                 ret = __isolate_lru_page(page, mode, file);
1319                 switch (ret) {
1320                 case 0:
1321                         list_move(&page->lru, dst);
1322                         mem_cgroup_del_lru(page);
1323                         nr_taken += hpage_nr_pages(page);
1324                         break;
1325                 case -EBUSY:
1326                         /* we don't affect global LRU but rotate in our LRU */
1327                         mem_cgroup_rotate_lru_list(page, page_lru(page));
1328                         break;
1329                 default:
1330                         break;
1331                 }
1332         }
1333
1334         *scanned = scan;
1335
1336         trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1337                                       0, 0, 0, mode);
1338
1339         return nr_taken;
1340 }
1341
1342 #define mem_cgroup_from_res_counter(counter, member)    \
1343         container_of(counter, struct mem_cgroup, member)
1344
1345 /**
1346  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1347  * @memcg: the memory cgroup
1348  *
1349  * Returns the maximum amount of memory @memcg can be charged with, in
1350  * pages.
1351  */
1352 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1353 {
1354         unsigned long long margin;
1355
1356         margin = res_counter_margin(&memcg->res);
1357         if (do_swap_account)
1358                 margin = min(margin, res_counter_margin(&memcg->memsw));
1359         return margin >> PAGE_SHIFT;
1360 }
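/*
 * A quick worked example: with a 100MB limit and 80MB of usage,
 * res_counter_margin() yields 20MB, so this returns 20MB >> PAGE_SHIFT,
 * i.e. 5120 pages with 4KB pages. With swap accounting enabled, the smaller
 * of the memory and mem+swap margins is used.
 */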
1361
1362 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1363 {
1364         struct cgroup *cgrp = memcg->css.cgroup;
1365
1366         /* root ? */
1367         if (cgrp->parent == NULL)
1368                 return vm_swappiness;
1369
1370         return memcg->swappiness;
1371 }
1372
1373 static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1374 {
1375         int cpu;
1376
1377         get_online_cpus();
1378         spin_lock(&memcg->pcp_counter_lock);
1379         for_each_online_cpu(cpu)
1380                 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1381         memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1382         spin_unlock(&memcg->pcp_counter_lock);
1383         put_online_cpus();
1384
1385         synchronize_rcu();
1386 }
1387
1388 static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1389 {
1390         int cpu;
1391
1392         if (!memcg)
1393                 return;
1394         get_online_cpus();
1395         spin_lock(&memcg->pcp_counter_lock);
1396         for_each_online_cpu(cpu)
1397                 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1398         memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1399         spin_unlock(&memcg->pcp_counter_lock);
1400         put_online_cpus();
1401 }
1402 /*
1403  * Two routines for checking whether "memcg" is under move_account() or not.
1404  *
1405  * mem_cgroup_stealed() - checks whether a cgroup is mc.from. This is used
1406  *                        to avoid races in accounting. If true,
1407  *                        pc->mem_cgroup may be overwritten.
1408  *
1409  * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to or
1410  *                        under the hierarchy of a moving cgroup. This is for
1411  *                        waiting at high memory pressure caused by a "move".
1412  */
1413
1414 static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
1415 {
1416         VM_BUG_ON(!rcu_read_lock_held());
1417         return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1418 }
1419
1420 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1421 {
1422         struct mem_cgroup *from;
1423         struct mem_cgroup *to;
1424         bool ret = false;
1425         /*
1426          * Unlike the task_move routines, we access mc.to and mc.from without
1427          * mutual exclusion by cgroup_mutex. Here, we take the spinlock instead.
1428          */
1429         spin_lock(&mc.lock);
1430         from = mc.from;
1431         to = mc.to;
1432         if (!from)
1433                 goto unlock;
1434
1435         ret = mem_cgroup_same_or_subtree(memcg, from)
1436                 || mem_cgroup_same_or_subtree(memcg, to);
1437 unlock:
1438         spin_unlock(&mc.lock);
1439         return ret;
1440 }
1441
1442 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1443 {
1444         if (mc.moving_task && current != mc.moving_task) {
1445                 if (mem_cgroup_under_move(memcg)) {
1446                         DEFINE_WAIT(wait);
1447                         prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1448                         /* moving charge context might have finished. */
1449                         if (mc.moving_task)
1450                                 schedule();
1451                         finish_wait(&mc.waitq, &wait);
1452                         return true;
1453                 }
1454         }
1455         return false;
1456 }
1457
1458 /**
1459  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1460  * @memcg: The memory cgroup that went over limit
1461  * @p: Task that is going to be killed
1462  *
1463  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1464  * enabled
1465  */
1466 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1467 {
1468         struct cgroup *task_cgrp;
1469         struct cgroup *mem_cgrp;
1470         /*
1471          * Need a buffer in BSS, can't rely on allocations. The code relies
1472          * on the assumption that OOM is serialized for memory controller.
1473          * If this assumption is broken, revisit this code.
1474          */
1475         static char memcg_name[PATH_MAX];
1476         int ret;
1477
1478         if (!memcg || !p)
1479                 return;
1480
1481
1482         rcu_read_lock();
1483
1484         mem_cgrp = memcg->css.cgroup;
1485         task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1486
1487         ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1488         if (ret < 0) {
1489                 /*
1490                  * Unfortunately, we are unable to convert to a useful name,
1491                  * but we'll still print out the usage information.
1492                  */
1493                 rcu_read_unlock();
1494                 goto done;
1495         }
1496         rcu_read_unlock();
1497
1498         printk(KERN_INFO "Task in %s killed", memcg_name);
1499
1500         rcu_read_lock();
1501         ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1502         if (ret < 0) {
1503                 rcu_read_unlock();
1504                 goto done;
1505         }
1506         rcu_read_unlock();
1507
1508         /*
1509          * Continues from above, so we don't need a KERN_ level
1510          */
1511         printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1512 done:
1513
1514         printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1515                 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1516                 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1517                 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1518         printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1519                 "failcnt %llu\n",
1520                 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1521                 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1522                 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1523 }
1524
1525 /*
1526  * This function returns the number of memcgs under the hierarchy tree.
1527  * Returns 1 (self count) if there are no children.
1528  */
1529 static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1530 {
1531         int num = 0;
1532         struct mem_cgroup *iter;
1533
1534         for_each_mem_cgroup_tree(iter, memcg)
1535                 num++;
1536         return num;
1537 }
1538
1539 /*
1540  * Return the memory (and swap, if configured) limit for a memcg.
1541  */
1542 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1543 {
1544         u64 limit;
1545         u64 memsw;
1546
1547         limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1548         limit += total_swap_pages << PAGE_SHIFT;
1549
1550         memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1551         /*
1552          * If memsw is finite and limits the amount of swap space available
1553          * to this memcg, return that limit.
1554          */
1555         return min(limit, memsw);
1556 }
1557
1558 /**
1559  * test_mem_cgroup_node_reclaimable
1560  * @memcg: the target memcg
1561  * @nid: the node ID to be checked.
1562  * @noswap: specify true here if the user wants file only information.
1563  *
1564  * This function returns whether the specified memcg contains any
1565  * reclaimable pages on a node. Returns true if there are any reclaimable
1566  * pages in the node.
1567  */
1568 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1569                 int nid, bool noswap)
1570 {
1571         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1572                 return true;
1573         if (noswap || !total_swap_pages)
1574                 return false;
1575         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1576                 return true;
1577         return false;
1578
1579 }
1580 #if MAX_NUMNODES > 1
1581
1582 /*
1583  * Always updating the nodemask is not very good - even if we have an empty
1584  * list or the wrong list here, we can start from some node and traverse all
1585  * nodes based on the zonelist. So update the list loosely once per 10 secs.
1586  *
1587  */
1588 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1589 {
1590         int nid;
1591         /*
1592          * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1593          * pagein/pageout changes since the last update.
1594          */
1595         if (!atomic_read(&memcg->numainfo_events))
1596                 return;
1597         if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1598                 return;
1599
1600         /* make a nodemask where this memcg uses memory from */
1601         memcg->scan_nodes = node_states[N_HIGH_MEMORY];
1602
1603         for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1604
1605                 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1606                         node_clear(nid, memcg->scan_nodes);
1607         }
1608
1609         atomic_set(&memcg->numainfo_events, 0);
1610         atomic_set(&memcg->numainfo_updating, 0);
1611 }
1612
1613 /*
1614  * Select a node to start reclaim from. Because all we need is to reduce the
1615  * usage counter, starting from anywhere is OK. Reclaiming memory from the
1616  * current node has both pros and cons.
1617  *
1618  * Freeing memory from the current node means freeing memory from a node which
1619  * we'll use or we've used. So, it may make the LRU bad. And if several threads
1620  * hit limits, they will see contention on a node. But freeing from a remote
1621  * node means more costs for memory reclaim because of memory latency.
1622  *
1623  * For now, we use round-robin. A better algorithm is welcome.
1624  */
1625 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1626 {
1627         int node;
1628
1629         mem_cgroup_may_update_nodemask(memcg);
1630         node = memcg->last_scanned_node;
1631
1632         node = next_node(node, memcg->scan_nodes);
1633         if (node == MAX_NUMNODES)
1634                 node = first_node(memcg->scan_nodes);
1635         /*
1636          * We call this when we hit the limit, not when pages are added to the
1637          * LRU. No LRU may hold pages because all pages are UNEVICTABLE or the
1638          * memcg is too small and all pages are not on the LRU. In that case,
1639          * we use the current node.
1640          */
1641         if (unlikely(node == MAX_NUMNODES))
1642                 node = numa_node_id();
1643
1644         memcg->last_scanned_node = node;
1645         return node;
1646 }
1647
1648 /*
1649  * Check all nodes for whether they contain reclaimable pages or not.
1650  * For a quick scan, we make use of scan_nodes. This allows us to skip
1651  * unused nodes. But scan_nodes is lazily updated and may not contain
1652  * enough new information, so we need to do a double check.
1653  */
1654 bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1655 {
1656         int nid;
1657
1658         /*
1659          * quick check...making use of scan_node.
1660          * We can skip unused nodes.
1661          */
1662         if (!nodes_empty(memcg->scan_nodes)) {
1663                 for (nid = first_node(memcg->scan_nodes);
1664                      nid < MAX_NUMNODES;
1665                      nid = next_node(nid, memcg->scan_nodes)) {
1666
1667                         if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1668                                 return true;
1669                 }
1670         }
1671         /*
1672          * Check rest of nodes.
1673          */
1674         for_each_node_state(nid, N_HIGH_MEMORY) {
1675                 if (node_isset(nid, memcg->scan_nodes))
1676                         continue;
1677                 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1678                         return true;
1679         }
1680         return false;
1681 }
1682
1683 #else
1684 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1685 {
1686         return 0;
1687 }
1688
1689 bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1690 {
1691         return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1692 }
1693 #endif
1694
1695 /*
1696  * Scan the hierarchy if needed to reclaim memory. We remember the last child
1697  * we reclaimed from, so that we don't end up penalizing one child extensively
1698  * based on its position in the children list.
1699  *
1700  * root_memcg is the original ancestor that we've been reclaiming from.
1701  *
1702  * We give up and return to the caller when we visit root_memcg twice.
1703  * (other groups can be removed while we're walking....)
1704  *
1705  * If shrink==true, this returns immediately to avoid freeing too much.
1706  */
1707 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1708                                                 struct zone *zone,
1709                                                 gfp_t gfp_mask,
1710                                                 unsigned long reclaim_options,
1711                                                 unsigned long *total_scanned)
1712 {
1713         struct mem_cgroup *victim = NULL;
1714         int ret, total = 0;
1715         int loop = 0;
1716         bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1717         bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1718         bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1719         unsigned long excess;
1720         unsigned long nr_scanned;
1721         struct mem_cgroup_reclaim_cookie reclaim = {
1722                 .zone = zone,
1723                 .priority = 0,
1724         };
1725
1726         excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1727
1728         /* If memsw_is_minimum==1, swap-out is of no use. */
1729         if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
1730                 noswap = true;
1731
1732         while (1) {
1733                 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1734                 if (!victim) {
1735                         loop++;
1736                         /*
1737                          * We are not draining per-cpu cached charges during
1738                          * soft limit reclaim because global reclaim doesn't
1739                          * care about charges. It tries to free some memory and
1740                          * dropping cached charges would not give it any.
1741                          */
1742                         if (!check_soft && loop >= 1)
1743                                 drain_all_stock_async(root_memcg);
1744                         if (loop >= 2) {
1745                                 /*
1746                                  * If we have not been able to reclaim
1747                                  * anything, it might be because there are
1748                                  * no reclaimable pages under this hierarchy.
1749                                  */
1750                                 if (!check_soft || !total)
1751                                         break;
1752                                 /*
1753                                  * We want to do more targeted reclaim.
1754                                  * excess >> 2 is not too excessive, so we don't
1755                                  * reclaim too much, nor too little, so we don't
1756                                  * keep coming back to reclaim from this cgroup.
1757                                  */
1758                                 if (total >= (excess >> 2) ||
1759                                         (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1760                                         break;
1761                         }
1762                         continue;
1763                 }
1764                 if (!mem_cgroup_reclaimable(victim, noswap)) {
1765                         /* this cgroup's local usage == 0 */
1766                         continue;
1767                 }
1768                 /* we use swappiness of local cgroup */
1769                 if (check_soft) {
1770                         ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1771                                 noswap, zone, &nr_scanned);
1772                         *total_scanned += nr_scanned;
1773                 } else
1774                         ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1775                                                 noswap);
1776                 total += ret;
1777                 /*
1778                  * When shrinking usage, we can't check whether we should stop
1779                  * here or reclaim more; it depends on the caller. last_scanned_child
1780                  * will be enough to keep fairness under the tree.
1781                  */
1782                 if (shrink)
1783                         break;
1784                 if (check_soft) {
1785                         if (!res_counter_soft_limit_excess(&root_memcg->res))
1786                                 break;
1787                 } else if (mem_cgroup_margin(root_memcg))
1788                         break;
1789         }
1790         mem_cgroup_iter_break(root_memcg, victim);
1791         return total;
1792 }
1793
1794 /*
1795  * Check whether the OOM killer is already running under our hierarchy.
1796  * If someone is running it, return false.
1797  * Has to be called with memcg_oom_lock held.
1798  */
1799 static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1800 {
1801         struct mem_cgroup *iter, *failed = NULL;
1802
1803         for_each_mem_cgroup_tree(iter, memcg) {
1804                 if (iter->oom_lock) {
1805                         /*
1806                          * this subtree of our hierarchy is already locked
1807                          * so we cannot take the lock.
1808                          */
1809                         failed = iter;
1810                         mem_cgroup_iter_break(memcg, iter);
1811                         break;
1812                 } else
1813                         iter->oom_lock = true;
1814         }
1815
1816         if (!failed)
1817                 return true;
1818
1819         /*
1820          * OK, we failed to lock the whole subtree so we have to clean up
1821          * what we set up before reaching the failing subtree.
1822          */
1823         for_each_mem_cgroup_tree(iter, memcg) {
1824                 if (iter == failed) {
1825                         mem_cgroup_iter_break(memcg, iter);
1826                         break;
1827                 }
1828                 iter->oom_lock = false;
1829         }
1830         return false;
1831 }
1832
1833 /*
1834  * Has to be called with memcg_oom_lock held.
1835  */
1836 static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1837 {
1838         struct mem_cgroup *iter;
1839
1840         for_each_mem_cgroup_tree(iter, memcg)
1841                 iter->oom_lock = false;
1842         return 0;
1843 }
1844
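/*
 * Mark every memcg in @memcg's hierarchy as being under OOM by bumping its
 * under_oom counter.
 */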
1845 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1846 {
1847         struct mem_cgroup *iter;
1848
1849         for_each_mem_cgroup_tree(iter, memcg)
1850                 atomic_inc(&iter->under_oom);
1851 }
1852
1853 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1854 {
1855         struct mem_cgroup *iter;
1856
1857         /*
1858          * When a new child is created while the hierarchy is under oom,
1859          * mem_cgroup_oom_lock() may not be called. We have to use
1860          * atomic_add_unless() here.
1861          */
1862         for_each_mem_cgroup_tree(iter, memcg)
1863                 atomic_add_unless(&iter->under_oom, -1, 0);
1864 }
1865
1866 static DEFINE_SPINLOCK(memcg_oom_lock);
1867 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1868
1869 struct oom_wait_info {
1870         struct mem_cgroup *mem;
1871         wait_queue_t    wait;
1872 };
1873
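/*
 * Wait-queue callback for the OOM waitqueue: wake a waiter only if its memcg
 * and the waking memcg are in the same hierarchy (one is the other, or a
 * subtree of it).
 */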
1874 static int memcg_oom_wake_function(wait_queue_t *wait,
1875         unsigned mode, int sync, void *arg)
1876 {
1877         struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
1878                           *oom_wait_memcg;
1879         struct oom_wait_info *oom_wait_info;
1880
1881         oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1882         oom_wait_memcg = oom_wait_info->mem;
1883
1884         /*
1885          * Both oom_wait_info->mem and wake_memcg are stable under us.
1886          * Then we can use css_is_ancestor without taking care of RCU.
1887          */
1888         if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
1889                 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
1890                 return 0;
1891         return autoremove_wake_function(wait, mode, sync, arg);
1892 }
1893
1894 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
1895 {
1896         /* for filtering, pass "memcg" as argument. */
1897         __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1898 }
1899
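/*
 * If @memcg's hierarchy is still marked as being under OOM, wake up tasks
 * sleeping on the OOM waitqueue so they can retry their charges.
 */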
1900 static void memcg_oom_recover(struct mem_cgroup *memcg)
1901 {
1902         if (memcg && atomic_read(&memcg->under_oom))
1903                 memcg_wakeup_oom(memcg);
1904 }
1905
1906 /*
1907  * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
1908  */
1909 bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1910 {
1911         struct oom_wait_info owait;
1912         bool locked, need_to_kill;
1913
1914         owait.mem = memcg;
1915         owait.wait.flags = 0;
1916         owait.wait.func = memcg_oom_wake_function;
1917         owait.wait.private = current;
1918         INIT_LIST_HEAD(&owait.wait.task_list);
1919         need_to_kill = true;
1920         mem_cgroup_mark_under_oom(memcg);
1921
1922         /* At first, try to OOM-lock the hierarchy under memcg. */
1923         spin_lock(&memcg_oom_lock);
1924         locked = mem_cgroup_oom_lock(memcg);
1925         /*
1926          * Even if signal_pending(), we can't quit charge() loop without
1927          * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1928          * under OOM is always welcome, so use TASK_KILLABLE here.
1929          */
1930         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1931         if (!locked || memcg->oom_kill_disable)
1932                 need_to_kill = false;
1933         if (locked)
1934                 mem_cgroup_oom_notify(memcg);
1935         spin_unlock(&memcg_oom_lock);
1936
1937         if (need_to_kill) {
1938                 finish_wait(&memcg_oom_waitq, &owait.wait);
1939                 mem_cgroup_out_of_memory(memcg, mask);
1940         } else {
1941                 schedule();
1942                 finish_wait(&memcg_oom_waitq, &owait.wait);
1943         }
1944         spin_lock(&memcg_oom_lock);
1945         if (locked)
1946                 mem_cgroup_oom_unlock(memcg);
1947         memcg_wakeup_oom(memcg);
1948         spin_unlock(&memcg_oom_lock);
1949
1950         mem_cgroup_unmark_under_oom(memcg);
1951
1952         if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1953                 return false;
1954         /* Give chance to dying process */
1955         schedule_timeout_uninterruptible(1);
1956         return true;
1957 }
1958
1959 /*
1960  * Currently used to update mapped file statistics, but the routine can be
1961  * generalized to update other statistics as well.
1962  *
1963  * Notes: Race condition
1964  *
1965  * We usually use page_cgroup_lock() for accessing page_cgroup member but
1966  * it tends to be costly. But considering some conditions, we don't need
1967  * to do so _always_.
1968  *
1969  * Considering "charge", lock_page_cgroup() is not required because all
1970  * file-stat operations happen after a page is attached to the radix-tree.
1971  * There is no race with "charge".
1972  *
1973  * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
1974  * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even
1975  * if there is a race with "uncharge". The statistics themselves are properly
1976  * handled by flags.
1977  *
1978  * Considering "move", this is the only case where we see a race. To make the
1979  * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect
1980  * the possibility of a race condition. If there is one, we take a lock.
1981  */
1982
1983 void mem_cgroup_update_page_stat(struct page *page,
1984                                  enum mem_cgroup_page_stat_item idx, int val)
1985 {
1986         struct mem_cgroup *memcg;
1987         struct page_cgroup *pc = lookup_page_cgroup(page);
1988         bool need_unlock = false;
1989         unsigned long uninitialized_var(flags);
1990
1991         if (unlikely(!pc))
1992                 return;
1993
1994         rcu_read_lock();
1995         memcg = pc->mem_cgroup;
1996         if (unlikely(!memcg || !PageCgroupUsed(pc)))
1997                 goto out;
1998         /* pc->mem_cgroup is unstable ? */
1999         if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
2000                 /* take a lock to access pc->mem_cgroup safely */
2001                 move_lock_page_cgroup(pc, &flags);
2002                 need_unlock = true;
2003                 memcg = pc->mem_cgroup;
2004                 if (!memcg || !PageCgroupUsed(pc))
2005                         goto out;
2006         }
2007
2008         switch (idx) {
2009         case MEMCG_NR_FILE_MAPPED:
2010                 if (val > 0)
2011                         SetPageCgroupFileMapped(pc);
2012                 else if (!page_mapped(page))
2013                         ClearPageCgroupFileMapped(pc);
2014                 idx = MEM_CGROUP_STAT_FILE_MAPPED;
2015                 break;
2016         default:
2017                 BUG();
2018         }
2019
2020         this_cpu_add(memcg->stat->count[idx], val);
2021
2022 out:
2023         if (unlikely(need_unlock))
2024                 move_unlock_page_cgroup(pc, &flags);
2025         rcu_read_unlock();
2026         return;
2027 }
2028 EXPORT_SYMBOL(mem_cgroup_update_page_stat);
2029
2030 /*
2031  * size of first charge trial. "32" comes from vmscan.c's magic value.
2032  * TODO: it may be necessary to use bigger numbers on big iron.
2033  */
2034 #define CHARGE_BATCH    32U
2035 struct memcg_stock_pcp {
2036         struct mem_cgroup *cached; /* this is never the root cgroup */
2037         unsigned int nr_pages;
2038         struct work_struct work;
2039         unsigned long flags;
2040 #define FLUSHING_CACHED_CHARGE  (0)
2041 };
2042 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2043 static DEFINE_MUTEX(percpu_charge_mutex);
2044
2045 /*
2046  * Try to consume stocked charge on this cpu. On success, one page is consumed
2047  * from the local stock and true is returned. If the stock is 0 or holds charges
2048  * from a cgroup which is not the current target, returns false; the stock will
2049  * then be refilled.
2050  */
2051 static bool consume_stock(struct mem_cgroup *memcg)
2052 {
2053         struct memcg_stock_pcp *stock;
2054         bool ret = true;
2055
2056         stock = &get_cpu_var(memcg_stock);
2057         if (memcg == stock->cached && stock->nr_pages)
2058                 stock->nr_pages--;
2059         else /* need to call res_counter_charge */
2060                 ret = false;
2061         put_cpu_var(memcg_stock);
2062         return ret;
2063 }
2064
2065 /*
2066  * Return the stock cached in the percpu area to the res_counter and reset the cached information.
2067  */
2068 static void drain_stock(struct memcg_stock_pcp *stock)
2069 {
2070         struct mem_cgroup *old = stock->cached;
2071
2072         if (stock->nr_pages) {
2073                 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2074
2075                 res_counter_uncharge(&old->res, bytes);
2076                 if (do_swap_account)
2077                         res_counter_uncharge(&old->memsw, bytes);
2078                 stock->nr_pages = 0;
2079         }
2080         stock->cached = NULL;
2081 }
2082
2083 /*
2084  * This must be called with preemption disabled or by a thread
2085  * which is pinned to the local cpu.
2086  */
2087 static void drain_local_stock(struct work_struct *dummy)
2088 {
2089         struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2090         drain_stock(stock);
2091         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2092 }
2093
2094 /*
2095  * Cache charges (nr_pages) taken from the res_counter in the local per-cpu area.
2096  * They will be consumed by consume_stock() later.
2097  */
2098 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2099 {
2100         struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2101
2102         if (stock->cached != memcg) { /* reset if necessary */
2103                 drain_stock(stock);
2104                 stock->cached = memcg;
2105         }
2106         stock->nr_pages += nr_pages;
2107         put_cpu_var(memcg_stock);
2108 }
2109
2110 /*
2111  * Drain all per-CPU charge caches for the given root_memcg and the subtree
2112  * of the hierarchy under it. The sync flag says whether we should block
2113  * until the work is done.
2114  */
2115 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2116 {
2117         int cpu, curcpu;
2118
2119         /* Notify other cpus that system-wide "drain" is running */
2120         get_online_cpus();
2121         curcpu = get_cpu();
2122         for_each_online_cpu(cpu) {
2123                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2124                 struct mem_cgroup *memcg;
2125
2126                 memcg = stock->cached;
2127                 if (!memcg || !stock->nr_pages)
2128                         continue;
2129                 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2130                         continue;
2131                 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2132                         if (cpu == curcpu)
2133                                 drain_local_stock(&stock->work);
2134                         else
2135                                 schedule_work_on(cpu, &stock->work);
2136                 }
2137         }
2138         put_cpu();
2139
2140         if (!sync)
2141                 goto out;
2142
2143         for_each_online_cpu(cpu) {
2144                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2145                 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2146                         flush_work(&stock->work);
2147         }
2148 out:
2149         put_online_cpus();
2150 }
2151
2152 /*
2153  * Tries to drain stocked charges in other cpus. This function is asynchronous
2154  * and just queues a work item per cpu to drain locally on each cpu. The caller
2155  * can expect some charges to come back to the res_counter later, but cannot
2156  * wait for that.
2157  */
2158 static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2159 {
2160         /*
2161          * If someone calls draining, avoid adding more kworker runs.
2162          */
2163         if (!mutex_trylock(&percpu_charge_mutex))
2164                 return;
2165         drain_all_stock(root_memcg, false);
2166         mutex_unlock(&percpu_charge_mutex);
2167 }
2168
2169 /* This is a synchronous drain interface. */
2170 static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2171 {
2172         /* called when force_empty is called */
2173         mutex_lock(&percpu_charge_mutex);
2174         drain_all_stock(root_memcg, true);
2175         mutex_unlock(&percpu_charge_mutex);
2176 }
2177
2178 /*
2179  * This function drains the percpu counter values from a DEAD cpu and
2180  * moves them to the local cpu. Note that this function can be preempted.
2181  */
2182 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2183 {
2184         int i;
2185
2186         spin_lock(&memcg->pcp_counter_lock);
2187         for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2188                 long x = per_cpu(memcg->stat->count[i], cpu);
2189
2190                 per_cpu(memcg->stat->count[i], cpu) = 0;
2191                 memcg->nocpu_base.count[i] += x;
2192         }
2193         for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2194                 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2195
2196                 per_cpu(memcg->stat->events[i], cpu) = 0;
2197                 memcg->nocpu_base.events[i] += x;
2198         }
2199         /* need to clear ON_MOVE value, works as a kind of lock. */
2200         per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2201         spin_unlock(&memcg->pcp_counter_lock);
2202 }
2203
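/*
 * Propagate the saved MEM_CGROUP_ON_MOVE value to the per-cpu counter of a
 * newly onlined cpu so that move accounting stays consistent.
 */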
2204 static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
2205 {
2206         int idx = MEM_CGROUP_ON_MOVE;
2207
2208         spin_lock(&memcg->pcp_counter_lock);
2209         per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
2210         spin_unlock(&memcg->pcp_counter_lock);
2211 }
2212
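/*
 * CPU hotplug callback: when a cpu comes online, resync its ON_MOVE counter;
 * when a cpu dies, fold its per-cpu statistics into nocpu_base and drain its
 * cached charge stock.
 */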
2213 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2214                                         unsigned long action,
2215                                         void *hcpu)
2216 {
2217         int cpu = (unsigned long)hcpu;
2218         struct memcg_stock_pcp *stock;
2219         struct mem_cgroup *iter;
2220
2221         if (action == CPU_ONLINE) {
2222                 for_each_mem_cgroup(iter)
2223                         synchronize_mem_cgroup_on_move(iter, cpu);
2224                 return NOTIFY_OK;
2225         }
2226
2227         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2228                 return NOTIFY_OK;
2229
2230         for_each_mem_cgroup(iter)
2231                 mem_cgroup_drain_pcp_counter(iter, cpu);
2232
2233         stock = &per_cpu(memcg_stock, cpu);
2234         drain_stock(stock);
2235         return NOTIFY_OK;
2236 }
2237
2238
2239 /* See __mem_cgroup_try_charge() for details */
2240 enum {
2241         CHARGE_OK,              /* success */
2242         CHARGE_RETRY,           /* need to retry but retry is not bad */
2243         CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
2244         CHARGE_WOULDBLOCK,      /* __GFP_WAIT wasn't set and not enough res. */
2245         CHARGE_OOM_DIE,         /* the current task is killed because of OOM */
2246 };
2247
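/*
 * Try to charge @nr_pages against @memcg's res (and memsw) counters. On
 * failure, reclaim from the hierarchy that hit its limit and tell the caller
 * whether to retry, fail with -ENOMEM, or give up because it was OOM-killed;
 * see the CHARGE_* codes above.
 */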
2248 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2249                                 unsigned int nr_pages, bool oom_check)
2250 {
2251         unsigned long csize = nr_pages * PAGE_SIZE;
2252         struct mem_cgroup *mem_over_limit;
2253         struct res_counter *fail_res;
2254         unsigned long flags = 0;
2255         int ret;
2256
2257         ret = res_counter_charge(&memcg->res, csize, &fail_res);
2258
2259         if (likely(!ret)) {
2260                 if (!do_swap_account)
2261                         return CHARGE_OK;
2262                 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2263                 if (likely(!ret))
2264                         return CHARGE_OK;
2265
2266                 res_counter_uncharge(&memcg->res, csize);
2267                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2268                 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2269         } else
2270                 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2271         /*
2272          * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2273          * of regular pages (CHARGE_BATCH), or a single regular page (1).
2274          *
2275          * Never reclaim on behalf of optional batching, retry with a
2276          * single page instead.
2277          */
2278         if (nr_pages == CHARGE_BATCH)
2279                 return CHARGE_RETRY;
2280
2281         if (!(gfp_mask & __GFP_WAIT))
2282                 return CHARGE_WOULDBLOCK;
2283
2284         ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
2285                                               gfp_mask, flags, NULL);
2286         if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2287                 return CHARGE_RETRY;
2288         /*
2289          * Even though the limit is exceeded at this point, reclaim
2290          * may have been able to free some pages.  Retry the charge
2291          * before killing the task.
2292          *
2293          * Only for regular pages, though: huge pages are rather
2294          * unlikely to succeed so close to the limit, and we fall back
2295          * to regular pages anyway in case of failure.
2296          */
2297         if (nr_pages == 1 && ret)
2298                 return CHARGE_RETRY;
2299
2300         /*
2301          * At task move, charge accounts can be doubly counted. So, it's
2302          * better to wait until the end of task_move if something is going on.
2303          */
2304         if (mem_cgroup_wait_acct_move(mem_over_limit))
2305                 return CHARGE_RETRY;
2306
2307         /* If we don't need to call the oom-killer at all, return immediately */
2308         if (!oom_check)
2309                 return CHARGE_NOMEM;
2310         /* check OOM */
2311         if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
2312                 return CHARGE_OOM_DIE;
2313
2314         return CHARGE_RETRY;
2315 }
2316
2317 /*
2318  * Unlike the exported interface, an "oom" parameter is added. If oom==true,
2319  * the oom-killer can be invoked.
2320  */
2321 static int __mem_cgroup_try_charge(struct mm_struct *mm,
2322                                    gfp_t gfp_mask,
2323                                    unsigned int nr_pages,
2324                                    struct mem_cgroup **ptr,
2325                                    bool oom)
2326 {
2327         unsigned int batch = max(CHARGE_BATCH, nr_pages);
2328         int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2329         struct mem_cgroup *memcg = NULL;
2330         int ret;
2331
2332         /*
2333          * Unlike the global VM's OOM kill, we're not in a system-level
2334          * memory shortage. So, allow dying processes to go ahead, in addition
2335          * to MEMDIE processes.
2336          */
2337         if (unlikely(test_thread_flag(TIF_MEMDIE)
2338                      || fatal_signal_pending(current)))
2339                 goto bypass;
2340
2341         /*
2342          * We always charge the cgroup the mm_struct belongs to.
2343          * The mm_struct's mem_cgroup changes on task migration if the
2344          * thread group leader migrates. It's possible that mm is not
2345          * set, if so charge the init_mm (happens for pagecache usage).
2346          */
2347         if (!*ptr && !mm)
2348                 goto bypass;
2349 again:
2350         if (*ptr) { /* css should be a valid one */
2351                 memcg = *ptr;
2352                 VM_BUG_ON(css_is_removed(&memcg->css));
2353                 if (mem_cgroup_is_root(memcg))
2354                         goto done;
2355                 if (nr_pages == 1 && consume_stock(memcg))
2356                         goto done;
2357                 css_get(&memcg->css);
2358         } else {
2359                 struct task_struct *p;
2360
2361                 rcu_read_lock();
2362                 p = rcu_dereference(mm->owner);
2363                 /*
2364                  * Because we don't have task_lock(), "p" can exit.
2365                  * In that case, "memcg" can point to root or p can be NULL due
2366                  * to a race with swapoff. Then, we have a small risk of mis-accounting.
2367                  * But such mis-accounting by a race always happens because
2368                  * we don't have cgroup_mutex(). Avoiding it would be overkill, so
2369                  * we allow that small race here.
2370                  * (*) swapoff et al. will charge against the mm_struct, not against
2371                  * the task_struct. So, mm->owner can be NULL.
2372                  */
2373                 memcg = mem_cgroup_from_task(p);
2374                 if (!memcg || mem_cgroup_is_root(memcg)) {
2375                         rcu_read_unlock();
2376                         goto done;
2377                 }
2378                 if (nr_pages == 1 && consume_stock(memcg)) {
2379                         /*
2380                          * It seems dangerous to access memcg without css_get().
2381                          * But considering how consume_stock works, it's not
2382                          * necessary. If consume_stock succeeds, some charges
2383                          * from this memcg are cached on this cpu. So, we
2384                          * don't need to call css_get()/css_tryget() before
2385                          * calling consume_stock().
2386                          */
2387                         rcu_read_unlock();
2388                         goto done;
2389                 }
2390                 /* after here, we may be blocked. we need to get refcnt */
2391                 if (!css_tryget(&memcg->css)) {
2392                         rcu_read_unlock();
2393                         goto again;
2394                 }
2395                 rcu_read_unlock();
2396         }
2397
2398         do {
2399                 bool oom_check;
2400
2401                 /* If killed, bypass charge */
2402                 if (fatal_signal_pending(current)) {
2403                         css_put(&memcg->css);
2404                         goto bypass;
2405                 }
2406
2407                 oom_check = false;
2408                 if (oom && !nr_oom_retries) {
2409                         oom_check = true;
2410                         nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2411                 }
2412
2413                 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
2414                 switch (ret) {
2415                 case CHARGE_OK:
2416                         break;
2417                 case CHARGE_RETRY: /* not in OOM situation but retry */
2418                         batch = nr_pages;
2419                         css_put(&memcg->css);
2420                         memcg = NULL;
2421                         goto again;
2422                 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2423                         css_put(&memcg->css);
2424                         goto nomem;
2425                 case CHARGE_NOMEM: /* OOM routine works */
2426                         if (!oom) {
2427                                 css_put(&memcg->css);
2428                                 goto nomem;
2429                         }
2430                         /* If oom, we never return -ENOMEM */
2431                         nr_oom_retries--;
2432                         break;
2433                 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2434                         css_put(&memcg->css);
2435                         goto bypass;
2436                 }
2437         } while (ret != CHARGE_OK);
2438
2439         if (batch > nr_pages)
2440                 refill_stock(memcg, batch - nr_pages);
2441         css_put(&memcg->css);
2442 done:
2443         *ptr = memcg;
2444         return 0;
2445 nomem:
2446         *ptr = NULL;
2447         return -ENOMEM;
2448 bypass:
2449         *ptr = NULL;
2450         return 0;
2451 }
2452
2453 /*
2454  * Sometimes we have to undo a charge we got by try_charge().
2455  * This function is for that: it does the uncharge and puts the css refcnt
2456  * gotten by try_charge().
2457  */
2458 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2459                                        unsigned int nr_pages)
2460 {
2461         if (!mem_cgroup_is_root(memcg)) {
2462                 unsigned long bytes = nr_pages * PAGE_SIZE;
2463
2464                 res_counter_uncharge(&memcg->res, bytes);
2465                 if (do_swap_account)
2466                         res_counter_uncharge(&memcg->memsw, bytes);
2467         }
2468 }
2469
2470 /*
2471  * A helper function to get a mem_cgroup from an ID. Must be called under
2472  * rcu_read_lock(). The caller must check css_is_removed() or similar if
2473  * it is a concern. (Dropping the refcnt from swap can be called against a
2474  * removed memcg.)
2475  */
2476 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2477 {
2478         struct cgroup_subsys_state *css;
2479
2480         /* ID 0 is unused ID */
2481         if (!id)
2482                 return NULL;
2483         css = css_lookup(&mem_cgroup_subsys, id);
2484         if (!css)
2485                 return NULL;
2486         return container_of(css, struct mem_cgroup, css);
2487 }
2488
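/*
 * Look up the memcg that a locked page is charged to (via its page_cgroup,
 * or the swap cgroup record for SwapCache pages) and take a css reference.
 * Returns NULL if no memcg can be pinned.
 */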
2489 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2490 {
2491         struct mem_cgroup *memcg = NULL;
2492         struct page_cgroup *pc;
2493         unsigned short id;
2494         swp_entry_t ent;
2495
2496         VM_BUG_ON(!PageLocked(page));
2497
2498         pc = lookup_page_cgroup(page);
2499         lock_page_cgroup(pc);
2500         if (PageCgroupUsed(pc)) {
2501                 memcg = pc->mem_cgroup;
2502                 if (memcg && !css_tryget(&memcg->css))
2503                         memcg = NULL;
2504         } else if (PageSwapCache(page)) {
2505                 ent.val = page_private(page);
2506                 id = lookup_swap_cgroup(ent);
2507                 rcu_read_lock();
2508                 memcg = mem_cgroup_lookup(id);
2509                 if (memcg && !css_tryget(&memcg->css))
2510                         memcg = NULL;
2511                 rcu_read_unlock();
2512         }
2513         unlock_page_cgroup(pc);
2514         return memcg;
2515 }
2516
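/*
 * Finalize a charge obtained from try_charge: bind the page_cgroup to @memcg,
 * set the USED (and CACHE) flags according to @ctype, update the charge
 * statistics and check the event thresholds.
 */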
2517 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2518                                        struct page *page,
2519                                        unsigned int nr_pages,
2520                                        struct page_cgroup *pc,
2521                                        enum charge_type ctype)
2522 {
2523         lock_page_cgroup(pc);
2524         if (unlikely(PageCgroupUsed(pc))) {
2525                 unlock_page_cgroup(pc);
2526                 __mem_cgroup_cancel_charge(memcg, nr_pages);
2527                 return;
2528         }
2529         /*
2530          * we don't need page_cgroup_lock for tail pages, because they are not
2531          * accessed by any other context at this point.
2532          */
2533         pc->mem_cgroup = memcg;
2534         /*
2535          * We access a page_cgroup asynchronously without lock_page_cgroup().
2536          * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2537          * is accessed after testing USED bit. To make pc->mem_cgroup visible
2538          * before USED bit, we need memory barrier here.
2539          * See mem_cgroup_add_lru_list(), etc.
2540          */
2541         smp_wmb();
2542         switch (ctype) {
2543         case MEM_CGROUP_CHARGE_TYPE_CACHE:
2544         case MEM_CGROUP_CHARGE_TYPE_SHMEM:
2545                 SetPageCgroupCache(pc);
2546                 SetPageCgroupUsed(pc);
2547                 break;
2548         case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2549                 ClearPageCgroupCache(pc);
2550                 SetPageCgroupUsed(pc);
2551                 break;
2552         default:
2553                 break;
2554         }
2555
2556         mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
2557         unlock_page_cgroup(pc);
2558         /*
2559          * "charge_statistics" updated the event counter, so check it.
2560          * Insert the ancestor (and the ancestor's ancestors) into the softlimit
2561          * RB-tree if they exceed their softlimit.
2562          */
2563         memcg_check_events(memcg, page);
2564 }
2565
2566 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2567
2568 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2569                         (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2570 /*
2571  * Because tail pages are not marked as "used", set that flag here. We're under
2572  * zone->lru_lock, 'splitting on pmd' and compound_lock.
2573  */
2574 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2575 {
2576         struct page_cgroup *head_pc = lookup_page_cgroup(head);
2577         struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2578         unsigned long flags;
2579
2580         if (mem_cgroup_disabled())
2581                 return;
2582         /*
2583          * We have no races with charge/uncharge but will have races with
2584          * page state accounting.
2585          */
2586         move_lock_page_cgroup(head_pc, &flags);
2587
2588         tail_pc->mem_cgroup = head_pc->mem_cgroup;
2589         smp_wmb(); /* see __commit_charge() */
2590         if (PageCgroupAcctLRU(head_pc)) {
2591                 enum lru_list lru;
2592                 struct mem_cgroup_per_zone *mz;
2593
2594                 /*
2595                  * LRU flags cannot be copied because we need to add the tail
2596                  * page to the LRU by a generic call and our hook will be called.
2597                  * We hold lru_lock, so reduce the counter directly.
2598                  */
2599                 lru = page_lru(head);
2600                 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2601                 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2602         }
2603         tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2604         move_unlock_page_cgroup(head_pc, &flags);
2605 }
2606 #endif
2607
2608 /**
2609  * mem_cgroup_move_account - move account of the page
2610  * @page: the page
2611  * @nr_pages: number of regular pages (>1 for huge pages)
2612  * @pc: page_cgroup of the page.
2613  * @from: mem_cgroup which the page is moved from.
2614  * @to: mem_cgroup which the page is moved to. @from != @to.
2615  * @uncharge: whether we should call uncharge and css_put against @from.
2616  *
2617  * The caller must confirm following.
2618  * - page is not on LRU (isolate_page() is useful.)
2619  * - compound_lock is held when nr_pages > 1
2620  *
2621  * This function doesn't do "charge" nor css_get to the new cgroup. That should
2622  * be done by the caller (__mem_cgroup_try_charge would be useful). If @uncharge
2623  * is true, this function does "uncharge" from the old cgroup, but it doesn't if
2624  * @uncharge is false, so the caller should do the "uncharge".
2625  */
2626 static int mem_cgroup_move_account(struct page *page,
2627                                    unsigned int nr_pages,
2628                                    struct page_cgroup *pc,
2629                                    struct mem_cgroup *from,
2630                                    struct mem_cgroup *to,
2631                                    bool uncharge)
2632 {
2633         unsigned long flags;
2634         int ret;
2635
2636         VM_BUG_ON(from == to);
2637         VM_BUG_ON(PageLRU(page));
2638         /*
2639          * The page is isolated from LRU. So, collapse function
2640          * will not handle this page. But page splitting can happen.
2641          * Do this check under compound_page_lock(). The caller should
2642          * hold it.
2643          */
2644         ret = -EBUSY;
2645         if (nr_pages > 1 && !PageTransHuge(page))
2646                 goto out;
2647
2648         lock_page_cgroup(pc);
2649
2650         ret = -EINVAL;
2651         if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2652                 goto unlock;
2653
2654         move_lock_page_cgroup(pc, &flags);
2655
2656         if (PageCgroupFileMapped(pc)) {
2657                 /* Update mapped_file data for mem_cgroup */
2658                 preempt_disable();
2659                 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2660                 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2661                 preempt_enable();
2662         }
2663         mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2664         if (uncharge)
2665                 /* This is not "cancel", but cancel_charge does all we need. */
2666                 __mem_cgroup_cancel_charge(from, nr_pages);
2667
2668         /* caller should have done css_get */
2669         pc->mem_cgroup = to;
2670         mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
2671         /*
2672          * We charge against "to", which may not have any tasks. Then, "to"
2673          * can be under rmdir(). But in the current implementation, the callers of
2674          * this function are just force_empty() and move charge, so it's
2675          * guaranteed that "to" is never removed. So, we don't check rmdir
2676          * status here.
2677          */
2678         move_unlock_page_cgroup(pc, &flags);
2679         ret = 0;
2680 unlock:
2681         unlock_page_cgroup(pc);
2682         /*
2683          * check events
2684          */
2685         memcg_check_events(to, page);
2686         memcg_check_events(from, page);
2687 out:
2688         return ret;
2689 }
2690
2691 /*
2692  * move charges to its parent.
2693  */
2694
2695 static int mem_cgroup_move_parent(struct page *page,
2696                                   struct page_cgroup *pc,
2697                                   struct mem_cgroup *child,
2698                                   gfp_t gfp_mask)
2699 {
2700         struct cgroup *cg = child->css.cgroup;
2701         struct cgroup *pcg = cg->parent;
2702         struct mem_cgroup *parent;
2703         unsigned int nr_pages;
2704         unsigned long uninitialized_var(flags);
2705         int ret;
2706
2707         /* Is ROOT ? */
2708         if (!pcg)
2709                 return -EINVAL;
2710
2711         ret = -EBUSY;
2712         if (!get_page_unless_zero(page))
2713                 goto out;
2714         if (isolate_lru_page(page))
2715                 goto put;
2716
2717         nr_pages = hpage_nr_pages(page);
2718
2719         parent = mem_cgroup_from_cont(pcg);
2720         ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2721         if (ret || !parent)
2722                 goto put_back;
2723
2724         if (nr_pages > 1)
2725                 flags = compound_lock_irqsave(page);
2726
2727         ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2728         if (ret)
2729                 __mem_cgroup_cancel_charge(parent, nr_pages);
2730
2731         if (nr_pages > 1)
2732                 compound_unlock_irqrestore(page, flags);
2733 put_back:
2734         putback_lru_page(page);
2735 put:
2736         put_page(page);
2737 out:
2738         return ret;
2739 }
2740
2741 /*
2742  * Charge the memory controller for page usage.
2743  * Return
2744  * 0 if the charge was successful
2745  * < 0 if the cgroup is over its limit
2746  */
2747 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2748                                 gfp_t gfp_mask, enum charge_type ctype)
2749 {
2750         struct mem_cgroup *memcg = NULL;
2751         unsigned int nr_pages = 1;
2752         struct page_cgroup *pc;
2753         bool oom = true;
2754         int ret;
2755
2756         if (PageTransHuge(page)) {
2757                 nr_pages <<= compound_order(page);
2758                 VM_BUG_ON(!PageTransHuge(page));
2759                 /*
2760                  * Never OOM-kill a process for a huge page.  The
2761                  * fault handler will fall back to regular pages.
2762                  */
2763                 oom = false;
2764         }
2765
2766         pc = lookup_page_cgroup(page);
2767         BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2768
2769         ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2770         if (ret || !memcg)
2771                 return ret;
2772
2773         __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
2774         return 0;
2775 }
2776
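/*
 * Charge a newly mapped anonymous page. Pages that are already mapped or
 * belong to the page cache are not accounted here.
 */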
2777 int mem_cgroup_newpage_charge(struct page *page,
2778                               struct mm_struct *mm, gfp_t gfp_mask)
2779 {
2780         if (mem_cgroup_disabled())
2781                 return 0;
2782         /*
2783          * If already mapped, we don't have to account.
2784          * If page cache, page->mapping has address_space.
2785          * But page->mapping may have an out-of-use anon_vma pointer;
2786          * detect it by a PageAnon() check. A newly-mapped anon page's
2787          * page->mapping is NULL.
2788          */
2789         if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2790                 return 0;
2791         if (unlikely(!mm))
2792                 mm = &init_mm;
2793         return mem_cgroup_charge_common(page, mm, gfp_mask,
2794                                 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2795 }
2796
2797 static void
2798 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2799                                         enum charge_type ctype);
2800
2801 static void
2802 __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
2803                                         enum charge_type ctype)
2804 {
2805         struct page_cgroup *pc = lookup_page_cgroup(page);
2806         /*
2807          * In some cases (SwapCache, FUSE's splice_buf->radixtree), the page
2808          * is already on the LRU. It means the page may be on some other memcg's
2809          * LRU. Take care of it.
2810          */
2811         mem_cgroup_lru_del_before_commit(page);
2812         __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
2813         mem_cgroup_lru_add_after_commit(page);
2814         return;
2815 }
2816
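/*
 * Charge a page added to the page cache. File pages are committed with the
 * lrucare helper because they may already be on the LRU; shmem pages that sit
 * in swapcache go through the swapin charge path instead.
 */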
2817 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2818                                 gfp_t gfp_mask)
2819 {
2820         struct mem_cgroup *memcg = NULL;
2821         int ret;
2822
2823         if (mem_cgroup_disabled())
2824                 return 0;
2825         if (PageCompound(page))
2826                 return 0;
2827
2828         if (unlikely(!mm))
2829                 mm = &init_mm;
2830
2831         if (page_is_file_cache(page)) {
2832                 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
2833                 if (ret || !memcg)
2834                         return ret;
2835
2836                 /*
2837                  * FUSE reuses pages without going through the final
2838                  * put that would remove them from the LRU list, make
2839                  * sure that they get relinked properly.
2840                  */
2841                 __mem_cgroup_commit_charge_lrucare(page, memcg,
2842                                         MEM_CGROUP_CHARGE_TYPE_CACHE);
2843                 return ret;
2844         }
2845         /* shmem */
2846         if (PageSwapCache(page)) {
2847                 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2848                 if (!ret)
2849                         __mem_cgroup_commit_charge_swapin(page, memcg,
2850                                         MEM_CGROUP_CHARGE_TYPE_SHMEM);
2851         } else
2852                 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2853                                         MEM_CGROUP_CHARGE_TYPE_SHMEM);
2854
2855         return ret;
2856 }
2857
2858 /*
2859  * While swap-in, try_charge -> commit or cancel, the page is locked.
2860  * And when try_charge() successfully returns, one refcnt on the memcg (without
2861  * a struct page_cgroup) is acquired. This refcnt will be consumed by
2862  * "commit()" or removed by "cancel()".
2863  */
2864 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2865                                  struct page *page,
2866                                  gfp_t mask, struct mem_cgroup **ptr)
2867 {
2868         struct mem_cgroup *memcg;
2869         int ret;
2870
2871         *ptr = NULL;
2872
2873         if (mem_cgroup_disabled())
2874                 return 0;
2875
2876         if (!do_swap_account)
2877                 goto charge_cur_mm;
2878         /*
2879          * A racing thread's fault, or swapoff, may have already updated
2880          * the pte, and even removed page from swap cache: in those cases
2881          * do_swap_page()'s pte_same() test will fail; but there's also a
2882          * KSM case which does need to charge the page.
2883          */
2884         if (!PageSwapCache(page))
2885                 goto charge_cur_mm;
2886         memcg = try_get_mem_cgroup_from_page(page);
2887         if (!memcg)
2888                 goto charge_cur_mm;
2889         *ptr = memcg;
2890         ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2891         css_put(&memcg->css);
2892         return ret;
2893 charge_cur_mm:
2894         if (unlikely(!mm))
2895                 mm = &init_mm;
2896         return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
2897 }
2898
2899 static void
2900 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2901                                         enum charge_type ctype)
2902 {
2903         if (mem_cgroup_disabled())
2904                 return;
2905         if (!ptr)
2906                 return;
2907         cgroup_exclude_rmdir(&ptr->css);
2908
2909         __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2910         /*
2911          * Now swap is on-memory. This means this page may be
2912          * counted both as mem and swap, i.e. double counted.
2913          * Fix it by uncharging from memsw. Basically, this SwapCache is stable
2914          * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
2915          * may call delete_from_swap_cache() before we reach here.
2916          */
2917         if (do_swap_account && PageSwapCache(page)) {
2918                 swp_entry_t ent = {.val = page_private(page)};
2919                 unsigned short id;
2920                 struct mem_cgroup *memcg;
2921
2922                 id = swap_cgroup_record(ent, 0);
2923                 rcu_read_lock();
2924                 memcg = mem_cgroup_lookup(id);
2925                 if (memcg) {
2926                         /*
2927                          * This recorded memcg can be an obsolete one. So, avoid
2928                          * calling css_tryget().
2929                          */
2930                         if (!mem_cgroup_is_root(memcg))
2931                                 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2932                         mem_cgroup_swap_statistics(memcg, false);
2933                         mem_cgroup_put(memcg);
2934                 }
2935                 rcu_read_unlock();
2936         }
2937         /*
2938          * At swapin, we may charge against a cgroup which has no tasks.
2939          * So, rmdir()->pre_destroy() can be called while we do this charge.
2940          * In that case, we need to call pre_destroy() again; check it here.
2941          */
2942         cgroup_release_and_wakeup_rmdir(&ptr->css);
2943 }
2944
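/*
 * Commit a swapin charge obtained by mem_cgroup_try_charge_swapin(),
 * accounting the page as an anonymous (MAPPED) page.
 */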
2945 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
2946 {
2947         __mem_cgroup_commit_charge_swapin(page, ptr,
2948                                         MEM_CGROUP_CHARGE_TYPE_MAPPED);
2949 }
2950
2951 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2952 {
2953         if (mem_cgroup_disabled())
2954                 return;
2955         if (!memcg)
2956                 return;
2957         __mem_cgroup_cancel_charge(memcg, 1);
2958 }
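
/*
 * Illustrative sketch (not part of the original file): how a swap-in path
 * such as do_swap_page() is assumed to pair the three calls above. Locking,
 * pte handling and the "pte_ok" flag are stand-ins for the caller's logic.
 *
 *	struct mem_cgroup *ptr;
 *
 *	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr))
 *		goto out;				(charge failed)
 *	... install the pte, re-check with pte_same() ...
 *	if (pte_ok)
 *		mem_cgroup_commit_charge_swapin(page, ptr);
 *	else
 *		mem_cgroup_cancel_charge_swapin(ptr);	(drop the reserved charge)
 */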
2959
2960 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
2961                                    unsigned int nr_pages,
2962                                    const enum charge_type ctype)
2963 {
2964         struct memcg_batch_info *batch = NULL;
2965         bool uncharge_memsw = true;
2966
2967         /* If swapout, usage of swap doesn't decrease */
2968         if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2969                 uncharge_memsw = false;
2970
2971         batch = &current->memcg_batch;
2972         /*
2973          * Usually, we do css_get() when we remember a memcg pointer.
2974          * But in this case, we keep res->usage pinned until the end of a
2975          * series of uncharges, so it's OK to ignore the memcg's refcnt.
2976          */
2977         if (!batch->memcg)
2978                 batch->memcg = memcg;
2979         /*
2980          * do_batch > 0 when unmapping pages or during inode invalidate/truncate.
2981          * In those cases, all pages freed continuously can be expected to be in
2982          * the same cgroup and we have a chance to coalesce uncharges.
2983          * But we uncharge one by one if this task was killed by the OOM killer
2984          * (TIF_MEMDIE), because we want to uncharge as soon as possible.
2985          */
2986
2987         if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2988                 goto direct_uncharge;
2989
2990         if (nr_pages > 1)
2991                 goto direct_uncharge;
2992
2993         /*
2994          * In the typical case, batch->memcg == memcg. This means we can
2995          * merge a series of uncharges into a single res_counter uncharge.
2996          * If not, we uncharge the res_counter one by one.
2997          */
2998         if (batch->memcg != memcg)
2999                 goto direct_uncharge;
3000         /* remember freed charge and uncharge it later */
3001         batch->nr_pages++;
3002         if (uncharge_memsw)
3003                 batch->memsw_nr_pages++;
3004         return;
3005 direct_uncharge:
3006         res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
3007         if (uncharge_memsw)
3008                 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
3009         if (unlikely(batch->memcg != memcg))
3010                 memcg_oom_recover(memcg);
3011         return;
3012 }
3013
3014 /*
3015  * uncharge if !page_mapped(page)
3016  */
3017 static struct mem_cgroup *
3018 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3019 {
3020         struct mem_cgroup *memcg = NULL;
3021         unsigned int nr_pages = 1;
3022         struct page_cgroup *pc;
3023
3024         if (mem_cgroup_disabled())
3025                 return NULL;
3026
3027         if (PageSwapCache(page))
3028                 return NULL;
3029
3030         if (PageTransHuge(page)) {
3031                 nr_pages <<= compound_order(page);
3032                 VM_BUG_ON(!PageTransHuge(page));
3033         }
3034         /*
3035          * Check if our page_cgroup is valid
3036          */
3037         pc = lookup_page_cgroup(page);
3038         if (unlikely(!pc || !PageCgroupUsed(pc)))
3039                 return NULL;
3040
3041         lock_page_cgroup(pc);
3042
3043         memcg = pc->mem_cgroup;
3044
3045         if (!PageCgroupUsed(pc))
3046                 goto unlock_out;
3047
3048         switch (ctype) {
3049         case MEM_CGROUP_CHARGE_TYPE_MAPPED:
3050         case MEM_CGROUP_CHARGE_TYPE_DROP:
3051                 /* See mem_cgroup_prepare_migration() */
3052                 if (page_mapped(page) || PageCgroupMigration(pc))
3053                         goto unlock_out;
3054                 break;
3055         case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
3056                 if (!PageAnon(page)) {  /* Shared memory */
3057                         if (page->mapping && !page_is_file_cache(page))
3058                                 goto unlock_out;
3059                 } else if (page_mapped(page)) /* Anon */
3060                                 goto unlock_out;
3061                 break;
3062         default:
3063                 break;
3064         }
3065
3066         mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
3067
3068         ClearPageCgroupUsed(pc);
3069         /*
3070          * pc->mem_cgroup is not cleared here. It will be accessed when the
3071          * page is freed from the LRU. This is safe because an uncharged page
3072          * is expected not to be reused (it is freed soon). The exception is
3073          * SwapCache, which is handled by special functions.
3074          */
3075
3076         unlock_page_cgroup(pc);
3077         /*
3078          * Even after unlock, we still hold memcg->res.usage here, so this
3079          * memcg will not be freed.
3080          */
3081         memcg_check_events(memcg, page);
3082         if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3083                 mem_cgroup_swap_statistics(memcg, true);
3084                 mem_cgroup_get(memcg);
3085         }
3086         if (!mem_cgroup_is_root(memcg))
3087                 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3088
3089         return memcg;
3090
3091 unlock_out:
3092         unlock_page_cgroup(pc);
3093         return NULL;
3094 }
3095
3096 void mem_cgroup_uncharge_page(struct page *page)
3097 {
3098         /* early check. */
3099         if (page_mapped(page))
3100                 return;
3101         if (page->mapping && !PageAnon(page))
3102                 return;
3103         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3104 }
3105
3106 void mem_cgroup_uncharge_cache_page(struct page *page)
3107 {
3108         VM_BUG_ON(page_mapped(page));
3109         VM_BUG_ON(page->mapping);
3110         __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
3111 }
3112
3113 /*
3114  * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
3115  * In those cases, pages are freed continuously and we can expect the pages
3116  * to be in the same memcg. Each of these callers itself limits the number of
3117  * pages freed at once, so uncharge_start/end() is called properly.
3118  * This may be called several (typically two) times in one context.
3119  */
3120
3121 void mem_cgroup_uncharge_start(void)
3122 {
3123         current->memcg_batch.do_batch++;
3124         /* We can do nest. */
3125         if (current->memcg_batch.do_batch == 1) {
3126                 current->memcg_batch.memcg = NULL;
3127                 current->memcg_batch.nr_pages = 0;
3128                 current->memcg_batch.memsw_nr_pages = 0;
3129         }
3130 }
3131
3132 void mem_cgroup_uncharge_end(void)
3133 {
3134         struct memcg_batch_info *batch = &current->memcg_batch;
3135
3136         if (!batch->do_batch)
3137                 return;
3138
3139         batch->do_batch--;
3140         if (batch->do_batch) /* If stacked, do nothing. */
3141                 return;
3142
3143         if (!batch->memcg)
3144                 return;
3145         /*
3146          * This "batch->memcg" is valid without any css_get/put etc...
3147          * because we hide charges behind us.
3148          */
3149         if (batch->nr_pages)
3150                 res_counter_uncharge(&batch->memcg->res,
3151                                      batch->nr_pages * PAGE_SIZE);
3152         if (batch->memsw_nr_pages)
3153                 res_counter_uncharge(&batch->memcg->memsw,
3154                                      batch->memsw_nr_pages * PAGE_SIZE);
3155         memcg_oom_recover(batch->memcg);
3156         /* forget this pointer (for sanity check) */
3157         batch->memcg = NULL;
3158 }
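
/*
 * Illustrative sketch (not part of the original file): the batching pair
 * above is assumed to wrap a run of per-page uncharges, e.g. while unmap
 * or truncate frees pages that mostly belong to one memcg:
 *
 *	mem_cgroup_uncharge_start();
 *	for each page being freed:
 *		mem_cgroup_uncharge_page(page);	 (or mem_cgroup_uncharge_cache_page())
 *	mem_cgroup_uncharge_end();		 (flushes one res_counter_uncharge())
 *
 * Calls may nest; only the outermost mem_cgroup_uncharge_end() flushes the
 * accumulated charge.
 */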
3159
3160 #ifdef CONFIG_SWAP
3161 /*
3162  * Called after __delete_from_swap_cache(); drops the "page" account.
3163  * The memcg information is recorded in the swap_cgroup of "ent".
3164  */
3165 void
3166 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3167 {
3168         struct mem_cgroup *memcg;
3169         int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3170
3171         if (!swapout) /* this was a swap cache but the swap is unused ! */
3172                 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3173
3174         memcg = __mem_cgroup_uncharge_common(page, ctype);
3175
3176         /*
3177          * Record memcg information. If swapout && memcg != NULL,
3178          * mem_cgroup_get() was called in uncharge().
3179          */
3180         if (do_swap_account && swapout && memcg)
3181                 swap_cgroup_record(ent, css_id(&memcg->css));
3182 }
3183 #endif
3184
3185 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3186 /*
3187  * Called from swap_entry_free(). Removes the record in swap_cgroup and
3188  * uncharges the "memsw" account.
3189  */
3190 void mem_cgroup_uncharge_swap(swp_entry_t ent)
3191 {
3192         struct mem_cgroup *memcg;
3193         unsigned short id;
3194
3195         if (!do_swap_account)
3196                 return;
3197
3198         id = swap_cgroup_record(ent, 0);
3199         rcu_read_lock();
3200         memcg = mem_cgroup_lookup(id);
3201         if (memcg) {
3202                 /*
3203                  * We uncharge this because the swap entry is freed.
3204                  * This memcg may be an obsolete one, so we avoid calling css_tryget().
3205                  */
3206                 if (!mem_cgroup_is_root(memcg))
3207                         res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3208                 mem_cgroup_swap_statistics(memcg, false);
3209                 mem_cgroup_put(memcg);
3210         }
3211         rcu_read_unlock();
3212 }
3213
3214 /**
3215  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3216  * @entry: swap entry to be moved
3217  * @from:  mem_cgroup which the entry is moved from
3218  * @to:  mem_cgroup which the entry is moved to
3219  * @need_fixup: whether we should fixup res_counters and refcounts.
3220  *
3221  * It succeeds only when the swap_cgroup's record for this entry is the same
3222  * as the mem_cgroup's id of @from.
3223  *
3224  * Returns 0 on success, -EINVAL on failure.
3225  *
3226  * The caller must have charged to @to, IOW, called res_counter_charge() about
3227  * both res and memsw, and called css_get().
3228  */
3229 static int mem_cgroup_move_swap_account(swp_entry_t entry,
3230                 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3231 {
3232         unsigned short old_id, new_id;
3233
3234         old_id = css_id(&from->css);
3235         new_id = css_id(&to->css);
3236
3237         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3238                 mem_cgroup_swap_statistics(from, false);
3239                 mem_cgroup_swap_statistics(to, true);
3240                 /*
3241                  * This function is only called from task migration context now.
3242                  * It postpones res_counter and refcount handling till the end
3243                  * of task migration (mem_cgroup_clear_mc()) for performance
3244                  * improvement. But we cannot postpone mem_cgroup_get(to)
3245                  * because if the process that has been moved to @to does
3246                  * swap-in, the refcount of @to might be decreased to 0.
3247                  */
3248                 mem_cgroup_get(to);
3249                 if (need_fixup) {
3250                         if (!mem_cgroup_is_root(from))
3251                                 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3252                         mem_cgroup_put(from);
3253                         /*
3254                          * we charged both to->res and to->memsw, so we should
3255                          * uncharge to->res.
3256                          */
3257                         if (!mem_cgroup_is_root(to))
3258                                 res_counter_uncharge(&to->res, PAGE_SIZE);
3259                 }
3260                 return 0;
3261         }
3262         return -EINVAL;
3263 }
3264 #else
3265 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3266                 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3267 {
3268         return -EINVAL;
3269 }
3270 #endif
3271
3272 /*
3273  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3274  * page belongs to.
3275  */
3276 int mem_cgroup_prepare_migration(struct page *page,
3277         struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
3278 {
3279         struct mem_cgroup *memcg = NULL;
3280         struct page_cgroup *pc;
3281         enum charge_type ctype;
3282         int ret = 0;
3283
3284         *ptr = NULL;
3285
3286         VM_BUG_ON(PageTransHuge(page));
3287         if (mem_cgroup_disabled())
3288                 return 0;
3289
3290         pc = lookup_page_cgroup(page);
3291         lock_page_cgroup(pc);
3292         if (PageCgroupUsed(pc)) {
3293                 memcg = pc->mem_cgroup;
3294                 css_get(&memcg->css);
3295                 /*
3296                  * When migrating an anonymous page, its mapcount goes down
3297                  * to 0 and uncharge() will be called. But even if it is fully
3298                  * unmapped, migration may fail and this page would have to be
3299                  * charged again. We set the MIGRATION flag here and delay the
3300                  * uncharge until end_migration() is called.
3301                  *
3302                  * Corner Case Thinking
3303                  * A)
3304                  * The old page was mapped as Anon and was unmapped and freed
3305                  * while migration was ongoing.
3306                  * If unmap finds the old page, its uncharge() will be delayed
3307                  * until end_migration(). If unmap finds the new page, it is
3308                  * uncharged when its mapcount goes from 1 to 0. If the unmap code
3309                  * finds a swap migration entry, the new page will not be mapped
3310                  * and end_migration() will find it (mapcount == 0).
3311                  *
3312                  * B)
3313                  * When the old page was mapped but migration fails, the kernel
3314                  * remaps it. A charge for it is kept by the MIGRATION flag even
3315                  * if its mapcount goes down to 0, so we can remap it successfully
3316                  * without charging it again.
3317                  *
3318                  * C)
3319                  * The "old" page is under lock_page() until the end of
3320                  * migration, so the old page itself will not be swapped out.
3321                  * If the new page is swapped out before end_migration(), our
3322                  * hook into the usual swap-out path will catch the event.
3323                  */
3324                 if (PageAnon(page))
3325                         SetPageCgroupMigration(pc);
3326         }
3327         unlock_page_cgroup(pc);
3328         /*
3329          * If the page is not charged at this point,
3330          * we return here.
3331          */
3332         if (!memcg)
3333                 return 0;
3334
3335         *ptr = memcg;
3336         ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3337         css_put(&memcg->css);/* drop extra refcnt */
3338         if (ret || *ptr == NULL) {
3339                 if (PageAnon(page)) {
3340                         lock_page_cgroup(pc);
3341                         ClearPageCgroupMigration(pc);
3342                         unlock_page_cgroup(pc);
3343                         /*
3344                          * The old page may be fully unmapped while we kept it.
3345                          */
3346                         mem_cgroup_uncharge_page(page);
3347                 }
3348                 return -ENOMEM;
3349         }
3350         /*
3351          * We charge new page before it's used/mapped. So, even if unlock_page()
3352          * is called before end_migration, we can catch all events on this new
3353          * page. In the case new page is migrated but not remapped, new page's
3354          * mapcount will be finally 0 and we call uncharge in end_migration().
3355          */
3356         pc = lookup_page_cgroup(newpage);
3357         if (PageAnon(page))
3358                 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
3359         else if (page_is_file_cache(page))
3360                 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3361         else
3362                 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3363         __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
3364         return ret;
3365 }
3366
3367 /* remove redundant charge if migration failed */
3368 void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3369         struct page *oldpage, struct page *newpage, bool migration_ok)
3370 {
3371         struct page *used, *unused;
3372         struct page_cgroup *pc;
3373
3374         if (!memcg)
3375                 return;
3376         /* blocks rmdir() */
3377         cgroup_exclude_rmdir(&memcg->css);
3378         if (!migration_ok) {
3379                 used = oldpage;
3380                 unused = newpage;
3381         } else {
3382                 used = newpage;
3383                 unused = oldpage;
3384         }
3385         /*
3386          * We disallowed uncharging pages under migration because the mapcount
3387          * of the page goes down to zero, temporarily.
3388          * Clear the flag and check whether the page should still be charged.
3389          */
3390         pc = lookup_page_cgroup(oldpage);
3391         lock_page_cgroup(pc);
3392         ClearPageCgroupMigration(pc);
3393         unlock_page_cgroup(pc);
3394
3395         __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
3396
3397         /*
3398          * If the page is file cache, the radix-tree replacement is atomic
3399          * and we can skip this check. When it was an Anon page, its mapcount
3400          * went down to 0, but because we added the MIGRATION flag it has not
3401          * been uncharged yet. There are several cases, but the page->mapcount
3402          * check and the USED bit check in mem_cgroup_uncharge_page() do
3403          * enough checking. (see prepare_charge() also)
3404          */
3405         if (PageAnon(used))
3406                 mem_cgroup_uncharge_page(used);
3407         /*
3408          * At migration, we may charge against a cgroup which has no
3409          * tasks.
3410          * So, rmdir()->pre_destroy() can be called while we do this charge.
3411          * In that case, we need to call pre_destroy() again. Check it here.
3412          */
3413         cgroup_release_and_wakeup_rmdir(&memcg->css);
3414 }
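
/*
 * Illustrative sketch (not part of the original file): the two migration
 * hooks above are assumed to bracket the actual page copy roughly like
 * this; move_page_contents() is a stand-in for the migration core, and
 * error handling and locking are omitted:
 *
 *	struct mem_cgroup *memcg;
 *	int rc;
 *
 *	if (mem_cgroup_prepare_migration(page, newpage, &memcg, GFP_KERNEL))
 *		goto fail;			(could not charge newpage)
 *	rc = move_page_contents(page, newpage);
 *	mem_cgroup_end_migration(memcg, page, newpage, rc == 0);
 */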
3415
3416 /*
3417  * At page cache replacement, the newpage is not under any memcg but it is on
3418  * the LRU. So, this function doesn't touch res_counter but handles the LRU
3419  * in the correct way. Both pages are locked, so we cannot race with uncharge.
3420  */
3421 void mem_cgroup_replace_page_cache(struct page *oldpage,
3422                                   struct page *newpage)
3423 {
3424         struct mem_cgroup *memcg;
3425         struct page_cgroup *pc;
3426         struct zone *zone;
3427         enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3428         unsigned long flags;
3429
3430         if (mem_cgroup_disabled())
3431                 return;
3432
3433         pc = lookup_page_cgroup(oldpage);
3434         /* fix accounting on old pages */
3435         lock_page_cgroup(pc);
3436         memcg = pc->mem_cgroup;
3437         mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
3438         ClearPageCgroupUsed(pc);
3439         unlock_page_cgroup(pc);
3440
3441         if (PageSwapBacked(oldpage))
3442                 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3443
3444         zone = page_zone(newpage);
3445         pc = lookup_page_cgroup(newpage);
3446         /*
3447          * Even if newpage->mapping was NULL before starting the replacement,
3448          * the newpage may already be on the LRU (or a pagevec for the LRU).
3449          * We lock the LRU while we overwrite pc->mem_cgroup.
3450          */
3451         spin_lock_irqsave(&zone->lru_lock, flags);
3452         if (PageLRU(newpage))
3453                 del_page_from_lru_list(zone, newpage, page_lru(newpage));
3454         __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type);
3455         if (PageLRU(newpage))
3456                 add_page_to_lru_list(zone, newpage, page_lru(newpage));
3457         spin_unlock_irqrestore(&zone->lru_lock, flags);
3458 }
3459
3460 #ifdef CONFIG_DEBUG_VM
3461 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3462 {
3463         struct page_cgroup *pc;
3464
3465         pc = lookup_page_cgroup(page);
3466         if (likely(pc) && PageCgroupUsed(pc))
3467                 return pc;
3468         return NULL;
3469 }
3470
3471 bool mem_cgroup_bad_page_check(struct page *page)
3472 {
3473         if (mem_cgroup_disabled())
3474                 return false;
3475
3476         return lookup_page_cgroup_used(page) != NULL;
3477 }
3478
3479 void mem_cgroup_print_bad_page(struct page *page)
3480 {
3481         struct page_cgroup *pc;
3482
3483         pc = lookup_page_cgroup_used(page);
3484         if (pc) {
3485                 int ret = -1;
3486                 char *path;
3487
3488                 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3489                        pc, pc->flags, pc->mem_cgroup);
3490
3491                 path = kmalloc(PATH_MAX, GFP_KERNEL);
3492                 if (path) {
3493                         rcu_read_lock();
3494                         ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3495                                                         path, PATH_MAX);
3496                         rcu_read_unlock();
3497                 }
3498
3499                 printk(KERN_CONT "(%s)\n",
3500                                 (ret < 0) ? "cannot get the path" : path);
3501                 kfree(path);
3502         }
3503 }
3504 #endif
3505
3506 static DEFINE_MUTEX(set_limit_mutex);
3507
3508 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3509                                 unsigned long long val)
3510 {
3511         int retry_count;
3512         u64 memswlimit, memlimit;
3513         int ret = 0;
3514         int children = mem_cgroup_count_children(memcg);
3515         u64 curusage, oldusage;
3516         int enlarge;
3517
3518         /*
3519          * For keeping hierarchical_reclaim simple, how long we should retry
3520          * depends on the caller. We set our retry count to be a function
3521          * of the number of children we should visit in this loop.
3522          */
3523         retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3524
3525         oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3526
3527         enlarge = 0;
3528         while (retry_count) {
3529                 if (signal_pending(current)) {
3530                         ret = -EINTR;
3531                         break;
3532                 }
3533                 /*
3534                  * Rather than hiding all of this in some function, do it in an
3535                  * open-coded manner so you can see what it really does.
3536                  * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3537                  */
3538                 mutex_lock(&set_limit_mutex);
3539                 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3540                 if (memswlimit < val) {
3541                         ret = -EINVAL;
3542                         mutex_unlock(&set_limit_mutex);
3543                         break;
3544                 }
3545
3546                 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3547                 if (memlimit < val)
3548                         enlarge = 1;
3549
3550                 ret = res_counter_set_limit(&memcg->res, val);
3551                 if (!ret) {
3552                         if (memswlimit == val)
3553                                 memcg->memsw_is_minimum = true;
3554                         else
3555                                 memcg->memsw_is_minimum = false;
3556                 }
3557                 mutex_unlock(&set_limit_mutex);
3558
3559                 if (!ret)
3560                         break;
3561
3562                 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3563                                                 MEM_CGROUP_RECLAIM_SHRINK,
3564                                                 NULL);
3565                 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3566                 /* Usage is reduced ? */
3567                 if (curusage >= oldusage)
3568                         retry_count--;
3569                 else
3570                         oldusage = curusage;
3571         }
3572         if (!ret && enlarge)
3573                 memcg_oom_recover(memcg);
3574
3575         return ret;
3576 }
3577
3578 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3579                                         unsigned long long val)
3580 {
3581         int retry_count;
3582         u64 memlimit, memswlimit, oldusage, curusage;
3583         int children = mem_cgroup_count_children(memcg);
3584         int ret = -EBUSY;
3585         int enlarge = 0;
3586
3587         /* see mem_cgroup_resize_res_limit */
3588         retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3589         oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3590         while (retry_count) {
3591                 if (signal_pending(current)) {
3592                         ret = -EINTR;
3593                         break;
3594                 }
3595                 /*
3596                  * Rather than hiding all of this in some function, do it in an
3597                  * open-coded manner so you can see what it really does.
3598                  * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3599                  */
3600                 mutex_lock(&set_limit_mutex);
3601                 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3602                 if (memlimit > val) {
3603                         ret = -EINVAL;
3604                         mutex_unlock(&set_limit_mutex);
3605                         break;
3606                 }
3607                 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3608                 if (memswlimit < val)
3609                         enlarge = 1;
3610                 ret = res_counter_set_limit(&memcg->memsw, val);
3611                 if (!ret) {
3612                         if (memlimit == val)
3613                                 memcg->memsw_is_minimum = true;
3614                         else
3615                                 memcg->memsw_is_minimum = false;
3616                 }
3617                 mutex_unlock(&set_limit_mutex);
3618
3619                 if (!ret)
3620                         break;
3621
3622                 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3623                                                 MEM_CGROUP_RECLAIM_NOSWAP |
3624                                                 MEM_CGROUP_RECLAIM_SHRINK,
3625                                                 NULL);
3626                 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3627                 /* Usage is reduced ? */
3628                 if (curusage >= oldusage)
3629                         retry_count--;
3630                 else
3631                         oldusage = curusage;
3632         }
3633         if (!ret && enlarge)
3634                 memcg_oom_recover(memcg);
3635         return ret;
3636 }
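
/*
 * Illustrative note (not part of the original file): because both resize
 * helpers above enforce memcg->res.limit <= memcg->memsw.limit, a caller
 * growing both limits is assumed to raise memsw first, e.g.:
 *
 *	mem_cgroup_resize_memsw_limit(memcg, 2ULL << 30);	(memory+swap: 2G)
 *	mem_cgroup_resize_limit(memcg, 1ULL << 30);		(memory:      1G)
 *
 * and to shrink them in the opposite order; doing it the other way round
 * can make the first call fail with -EINVAL.
 */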
3637
3638 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3639                                             gfp_t gfp_mask,
3640                                             unsigned long *total_scanned)
3641 {
3642         unsigned long nr_reclaimed = 0;
3643         struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3644         unsigned long reclaimed;
3645         int loop = 0;
3646         struct mem_cgroup_tree_per_zone *mctz;
3647         unsigned long long excess;
3648         unsigned long nr_scanned;
3649
3650         if (order > 0)
3651                 return 0;
3652
3653         mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3654         /*
3655          * This loop can run for a while, especially if mem_cgroups continuously
3656          * keep exceeding their soft limit and putting the system under
3657          * pressure.
3658          */
3659         do {
3660                 if (next_mz)
3661                         mz = next_mz;
3662                 else
3663                         mz = mem_cgroup_largest_soft_limit_node(mctz);
3664                 if (!mz)
3665                         break;
3666
3667                 nr_scanned = 0;
3668                 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
3669                                                 gfp_mask,
3670                                                 MEM_CGROUP_RECLAIM_SOFT,
3671                                                 &nr_scanned);
3672                 nr_reclaimed += reclaimed;
3673                 *total_scanned += nr_scanned;
3674                 spin_lock(&mctz->lock);
3675
3676                 /*
3677                  * If we failed to reclaim anything from this memory cgroup
3678                  * it is time to move on to the next cgroup
3679                  */
3680                 next_mz = NULL;
3681                 if (!reclaimed) {
3682                         do {
3683                                 /*
3684                                  * Loop until we find yet another one.
3685                                  *
3686                                  * By the time we get the soft_limit lock
3687                                  * again, someone might have added the
3688                                  * group back on the RB tree. Iterate to
3689                                  * make sure we get a different mem.
3690                                  * mem_cgroup_largest_soft_limit_node returns
3691                                  * NULL if no other cgroup is present on
3692                                  * the tree
3693                                  */
3694                                 next_mz =
3695                                 __mem_cgroup_largest_soft_limit_node(mctz);
3696                                 if (next_mz == mz)
3697                                         css_put(&next_mz->mem->css);
3698                                 else /* next_mz == NULL or other memcg */
3699                                         break;
3700                         } while (1);
3701                 }
3702                 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
3703                 excess = res_counter_soft_limit_excess(&mz->mem->res);
3704                 /*
3705                  * One school of thought says that we should not add
3706                  * back the node to the tree if reclaim returns 0.
3707                  * But our reclaim could return 0 simply because, due
3708                  * to priority, we are exposing a smaller subset of
3709                  * memory to reclaim from. Consider this as a longer
3710                  * term TODO.
3711                  */
3712                 /* If excess == 0, no tree ops */
3713                 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3714                 spin_unlock(&mctz->lock);
3715                 css_put(&mz->mem->css);
3716                 loop++;
3717                 /*
3718                  * Could not reclaim anything and there are no more
3719                  * mem cgroups to try or we seem to be looping without
3720                  * reclaiming anything.
3721                  */
3722                 if (!nr_reclaimed &&
3723                         (next_mz == NULL ||
3724                         loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3725                         break;
3726         } while (!nr_reclaimed);
3727         if (next_mz)
3728                 css_put(&next_mz->mem->css);
3729         return nr_reclaimed;
3730 }
3731
3732 /*
3733  * This routine traverses the page_cgroups on the given list and drops them all.
3734  * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
3735  */
3736 static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3737                                 int node, int zid, enum lru_list lru)
3738 {
3739         struct zone *zone;
3740         struct mem_cgroup_per_zone *mz;
3741         struct page_cgroup *pc, *busy;
3742         unsigned long flags, loop;
3743         struct list_head *list;
3744         int ret = 0;
3745
3746         zone = &NODE_DATA(node)->node_zones[zid];
3747         mz = mem_cgroup_zoneinfo(memcg, node, zid);
3748         list = &mz->lists[lru];
3749
3750         loop = MEM_CGROUP_ZSTAT(mz, lru);
3751         /* give some margin against EBUSY etc...*/
3752         loop += 256;
3753         busy = NULL;
3754         while (loop--) {
3755                 struct page *page;
3756
3757                 ret = 0;
3758                 spin_lock_irqsave(&zone->lru_lock, flags);
3759                 if (list_empty(list)) {
3760                         spin_unlock_irqrestore(&zone->lru_lock, flags);
3761                         break;
3762                 }
3763                 pc = list_entry(list->prev, struct page_cgroup, lru);
3764                 if (busy == pc) {
3765                         list_move(&pc->lru, list);
3766                         busy = NULL;
3767                         spin_unlock_irqrestore(&zone->lru_lock, flags);
3768                         continue;
3769                 }
3770                 spin_unlock_irqrestore(&zone->lru_lock, flags);
3771
3772                 page = lookup_cgroup_page(pc);
3773
3774                 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
3775                 if (ret == -ENOMEM)
3776                         break;
3777
3778                 if (ret == -EBUSY || ret == -EINVAL) {
3779                         /* found lock contention or "pc" is obsolete. */
3780                         busy = pc;
3781                         cond_resched();
3782                 } else
3783                         busy = NULL;
3784         }
3785
3786         if (!ret && !list_empty(list))
3787                 return -EBUSY;
3788         return ret;
3789 }
3790
3791 /*
3792  * Reduce the mem_cgroup's charge to 0 if there are no tasks.
3793  * This enables deleting this mem_cgroup.
3794  */
3795 static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
3796 {
3797         int ret;
3798         int node, zid, shrink;
3799         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3800         struct cgroup *cgrp = memcg->css.cgroup;
3801
3802         css_get(&memcg->css);
3803
3804         shrink = 0;
3805         /* should free all ? */
3806         if (free_all)
3807                 goto try_to_free;
3808 move_account:
3809         do {
3810                 ret = -EBUSY;
3811                 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3812                         goto out;
3813                 ret = -EINTR;
3814                 if (signal_pending(current))
3815                         goto out;
3816                 /* This is for making all *used* pages to be on LRU. */
3817                 lru_add_drain_all();
3818                 drain_all_stock_sync(memcg);
3819                 ret = 0;
3820                 mem_cgroup_start_move(memcg);
3821                 for_each_node_state(node, N_HIGH_MEMORY) {
3822                         for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3823                                 enum lru_list l;
3824                                 for_each_lru(l) {
3825                                         ret = mem_cgroup_force_empty_list(memcg,
3826                                                         node, zid, l);
3827                                         if (ret)
3828                                                 break;
3829                                 }
3830                         }
3831                         if (ret)
3832                                 break;
3833                 }
3834                 mem_cgroup_end_move(memcg);
3835                 memcg_oom_recover(memcg);
3836                 /* it seems parent cgroup doesn't have enough mem */
3837                 if (ret == -ENOMEM)
3838                         goto try_to_free;
3839                 cond_resched();
3840         /* "ret" should also be checked to ensure all lists are empty. */
3841         } while (memcg->res.usage > 0 || ret);
3842 out:
3843         css_put(&memcg->css);
3844         return ret;
3845
3846 try_to_free:
3847         /* returns EBUSY if there is a task or if we come here twice. */
3848         if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3849                 ret = -EBUSY;
3850                 goto out;
3851         }
3852         /* we call try-to-free pages to make this cgroup empty */
3853         lru_add_drain_all();
3854         /* try to free all pages in this cgroup */
3855         shrink = 1;
3856         while (nr_retries && memcg->res.usage > 0) {
3857                 int progress;
3858
3859                 if (signal_pending(current)) {
3860                         ret = -EINTR;
3861                         goto out;
3862                 }
3863                 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3864                                                 false);
3865                 if (!progress) {
3866                         nr_retries--;
3867                         /* maybe some writeback is necessary */
3868                         congestion_wait(BLK_RW_ASYNC, HZ/10);
3869                 }
3870
3871         }
3872         lru_add_drain();
3873         /* try move_account...there may be some *locked* pages. */
3874         goto move_account;
3875 }
3876
3877 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3878 {
3879         return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3880 }
3881
3882
3883 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3884 {
3885         return mem_cgroup_from_cont(cont)->use_hierarchy;
3886 }
3887
3888 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3889                                         u64 val)
3890 {
3891         int retval = 0;
3892         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3893         struct cgroup *parent = cont->parent;
3894         struct mem_cgroup *parent_memcg = NULL;
3895
3896         if (parent)
3897                 parent_memcg = mem_cgroup_from_cont(parent);
3898
3899         cgroup_lock();
3900         /*
3901          * If parent's use_hierarchy is set, we can't make any modifications
3902          * in the child subtrees. If it is unset, then the change can
3903          * occur, provided the current cgroup has no children.
3904          *
3905          * For the root cgroup, parent_memcg is NULL; we allow the value to be
3906          * set if there are no children.
3907          */
3908         if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3909                                 (val == 1 || val == 0)) {
3910                 if (list_empty(&cont->children))
3911                         memcg->use_hierarchy = val;
3912                 else
3913                         retval = -EBUSY;
3914         } else
3915                 retval = -EINVAL;
3916         cgroup_unlock();
3917
3918         return retval;
3919 }
3920
3921
3922 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
3923                                                enum mem_cgroup_stat_index idx)
3924 {
3925         struct mem_cgroup *iter;
3926         long val = 0;
3927
3928         /* Per-cpu values can be negative, use a signed accumulator */
3929         for_each_mem_cgroup_tree(iter, memcg)
3930                 val += mem_cgroup_read_stat(iter, idx);
3931
3932         if (val < 0) /* race ? */
3933                 val = 0;
3934         return val;
3935 }
3936
3937 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3938 {
3939         u64 val;
3940
3941         if (!mem_cgroup_is_root(memcg)) {
3942                 if (!swap)
3943                         return res_counter_read_u64(&memcg->res, RES_USAGE);
3944                 else
3945                         return res_counter_read_u64(&memcg->memsw, RES_USAGE);
3946         }
3947
3948         val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
3949         val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3950
3951         if (swap)
3952                 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
3953
3954         return val << PAGE_SHIFT;
3955 }
3956
3957 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3958 {
3959         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3960         u64 val;
3961         int type, name;
3962
3963         type = MEMFILE_TYPE(cft->private);
3964         name = MEMFILE_ATTR(cft->private);
3965         switch (type) {
3966         case _MEM:
3967                 if (name == RES_USAGE)
3968                         val = mem_cgroup_usage(memcg, false);
3969                 else
3970                         val = res_counter_read_u64(&memcg->res, name);
3971                 break;
3972         case _MEMSWAP:
3973                 if (name == RES_USAGE)
3974                         val = mem_cgroup_usage(memcg, true);
3975                 else
3976                         val = res_counter_read_u64(&memcg->memsw, name);
3977                 break;
3978         default:
3979                 BUG();
3980                 break;
3981         }
3982         return val;
3983 }
3984 /*
3985  * The user of this function is...
3986  * RES_LIMIT.
3987  */
3988 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3989                             const char *buffer)
3990 {
3991         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3992         int type, name;
3993         unsigned long long val;
3994         int ret;
3995
3996         type = MEMFILE_TYPE(cft->private);
3997         name = MEMFILE_ATTR(cft->private);
3998         switch (name) {
3999         case RES_LIMIT:
4000                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
4001                         ret = -EINVAL;
4002                         break;
4003                 }
4004                 /* This function does all necessary parse...reuse it */
4005                 ret = res_counter_memparse_write_strategy(buffer, &val);
4006                 if (ret)
4007                         break;
4008                 if (type == _MEM)
4009                         ret = mem_cgroup_resize_limit(memcg, val);
4010                 else
4011                         ret = mem_cgroup_resize_memsw_limit(memcg, val);
4012                 break;
4013         case RES_SOFT_LIMIT:
4014                 ret = res_counter_memparse_write_strategy(buffer, &val);
4015                 if (ret)
4016                         break;
4017                 /*
4018                  * For memsw, soft limits are hard to implement in terms
4019                  * of semantics. For now, we only support soft limits for
4020                  * memory control, not memory+swap.
4021                  */
4022                 if (type == _MEM)
4023                         ret = res_counter_set_soft_limit(&memcg->res, val);
4024                 else
4025                         ret = -EINVAL;
4026                 break;
4027         default:
4028                 ret = -EINVAL; /* should be BUG() ? */
4029                 break;
4030         }
4031         return ret;
4032 }
4033
4034 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
4035                 unsigned long long *mem_limit, unsigned long long *memsw_limit)
4036 {
4037         struct cgroup *cgroup;
4038         unsigned long long min_limit, min_memsw_limit, tmp;
4039
4040         min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4041         min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4042         cgroup = memcg->css.cgroup;
4043         if (!memcg->use_hierarchy)
4044                 goto out;
4045
4046         while (cgroup->parent) {
4047                 cgroup = cgroup->parent;
4048                 memcg = mem_cgroup_from_cont(cgroup);
4049                 if (!memcg->use_hierarchy)
4050                         break;
4051                 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
4052                 min_limit = min(min_limit, tmp);
4053                 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4054                 min_memsw_limit = min(min_memsw_limit, tmp);
4055         }
4056 out:
4057         *mem_limit = min_limit;
4058         *memsw_limit = min_memsw_limit;
4059         return;
4060 }
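
/*
 * Illustrative example (not part of the original file), with assumed values:
 * if use_hierarchy is set and the walk above sees a 512M limit on this memcg
 * and a 1G limit on its parent, the reported hierarchical_memory_limit is
 * min(512M, 1G) = 512M; min_memsw_limit is reduced in the same way.
 */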
4061
4062 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4063 {
4064         struct mem_cgroup *memcg;
4065         int type, name;
4066
4067         memcg = mem_cgroup_from_cont(cont);
4068         type = MEMFILE_TYPE(event);
4069         name = MEMFILE_ATTR(event);
4070         switch (name) {
4071         case RES_MAX_USAGE:
4072                 if (type == _MEM)
4073                         res_counter_reset_max(&memcg->res);
4074                 else
4075                         res_counter_reset_max(&memcg->memsw);
4076                 break;
4077         case RES_FAILCNT:
4078                 if (type == _MEM)
4079                         res_counter_reset_failcnt(&memcg->res);
4080                 else
4081                         res_counter_reset_failcnt(&memcg->memsw);
4082                 break;
4083         }
4084
4085         return 0;
4086 }
4087
4088 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
4089                                         struct cftype *cft)
4090 {
4091         return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
4092 }
4093
4094 #ifdef CONFIG_MMU
4095 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4096                                         struct cftype *cft, u64 val)
4097 {
4098         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4099
4100         if (val >= (1 << NR_MOVE_TYPE))
4101                 return -EINVAL;
4102         /*
4103          * We check this value several times in both can_attach() and
4104          * attach(), so we need the cgroup lock to prevent this value from
4105          * becoming inconsistent.
4106          */
4107         cgroup_lock();
4108         memcg->move_charge_at_immigrate = val;
4109         cgroup_unlock();
4110
4111         return 0;
4112 }
4113 #else
4114 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4115                                         struct cftype *cft, u64 val)
4116 {
4117         return -ENOSYS;
4118 }
4119 #endif
4120
4121
4122 /* For read statistics */
4123 enum {
4124         MCS_CACHE,
4125         MCS_RSS,
4126         MCS_FILE_MAPPED,
4127         MCS_PGPGIN,
4128         MCS_PGPGOUT,
4129         MCS_SWAP,
4130         MCS_PGFAULT,
4131         MCS_PGMAJFAULT,
4132         MCS_INACTIVE_ANON,
4133         MCS_ACTIVE_ANON,
4134         MCS_INACTIVE_FILE,
4135         MCS_ACTIVE_FILE,
4136         MCS_UNEVICTABLE,
4137         NR_MCS_STAT,
4138 };
4139
4140 struct mcs_total_stat {
4141         s64 stat[NR_MCS_STAT];
4142 };
4143
4144 struct {
4145         char *local_name;
4146         char *total_name;
4147 } memcg_stat_strings[NR_MCS_STAT] = {
4148         {"cache", "total_cache"},
4149         {"rss", "total_rss"},
4150         {"mapped_file", "total_mapped_file"},
4151         {"pgpgin", "total_pgpgin"},
4152         {"pgpgout", "total_pgpgout"},
4153         {"swap", "total_swap"},
4154         {"pgfault", "total_pgfault"},
4155         {"pgmajfault", "total_pgmajfault"},
4156         {"inactive_anon", "total_inactive_anon"},
4157         {"active_anon", "total_active_anon"},
4158         {"inactive_file", "total_inactive_file"},
4159         {"active_file", "total_active_file"},
4160         {"unevictable", "total_unevictable"}
4161 };
4162
4163
4164 static void
4165 mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4166 {
4167         s64 val;
4168
4169         /* per cpu stat */
4170         val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
4171         s->stat[MCS_CACHE] += val * PAGE_SIZE;
4172         val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
4173         s->stat[MCS_RSS] += val * PAGE_SIZE;
4174         val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
4175         s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4176         val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
4177         s->stat[MCS_PGPGIN] += val;
4178         val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
4179         s->stat[MCS_PGPGOUT] += val;
4180         if (do_swap_account) {
4181                 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4182                 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4183         }
4184         val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
4185         s->stat[MCS_PGFAULT] += val;
4186         val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
4187         s->stat[MCS_PGMAJFAULT] += val;
4188
4189         /* per zone stat */
4190         val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
4191         s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4192         val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
4193         s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4194         val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
4195         s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4196         val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
4197         s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4198         val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4199         s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4200 }
4201
4202 static void
4203 mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4204 {
4205         struct mem_cgroup *iter;
4206
4207         for_each_mem_cgroup_tree(iter, memcg)
4208                 mem_cgroup_get_local_stat(iter, s);
4209 }
4210
4211 #ifdef CONFIG_NUMA
4212 static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4213 {
4214         int nid;
4215         unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4216         unsigned long node_nr;
4217         struct cgroup *cont = m->private;
4218         struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4219
4220         total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
4221         seq_printf(m, "total=%lu", total_nr);
4222         for_each_node_state(nid, N_HIGH_MEMORY) {
4223                 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
4224                 seq_printf(m, " N%d=%lu", nid, node_nr);
4225         }
4226         seq_putc(m, '\n');
4227
4228         file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
4229         seq_printf(m, "file=%lu", file_nr);
4230         for_each_node_state(nid, N_HIGH_MEMORY) {
4231                 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4232                                 LRU_ALL_FILE);
4233                 seq_printf(m, " N%d=%lu", nid, node_nr);
4234         }
4235         seq_putc(m, '\n');
4236
4237         anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
4238         seq_printf(m, "anon=%lu", anon_nr);
4239         for_each_node_state(nid, N_HIGH_MEMORY) {
4240                 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4241                                 LRU_ALL_ANON);
4242                 seq_printf(m, " N%d=%lu", nid, node_nr);
4243         }
4244         seq_putc(m, '\n');
4245
4246         unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
4247         seq_printf(m, "unevictable=%lu", unevictable_nr);
4248         for_each_node_state(nid, N_HIGH_MEMORY) {
4249                 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4250                                 BIT(LRU_UNEVICTABLE));
4251                 seq_printf(m, " N%d=%lu", nid, node_nr);
4252         }
4253         seq_putc(m, '\n');
4254         return 0;
4255 }
4256 #endif /* CONFIG_NUMA */
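
/*
 * Illustrative example (not part of the original file): on a machine with
 * two nodes (assumed values), the memory.numa_stat output built above looks
 * roughly like
 *
 *	total=3000 N0=2500 N1=500
 *	file=2000 N0=1800 N1=200
 *	anon=980 N0=680 N1=300
 *	unevictable=20 N0=20 N1=0
 *
 * where all values are numbers of pages.
 */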
4257
4258 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4259                                  struct cgroup_map_cb *cb)
4260 {
4261         struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4262         struct mcs_total_stat mystat;
4263         int i;
4264
4265         memset(&mystat, 0, sizeof(mystat));
4266         mem_cgroup_get_local_stat(mem_cont, &mystat);
4267
4268
4269         for (i = 0; i < NR_MCS_STAT; i++) {
4270                 if (i == MCS_SWAP && !do_swap_account)
4271                         continue;
4272                 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
4273         }
4274
4275         /* Hierarchical information */
4276         {
4277                 unsigned long long limit, memsw_limit;
4278                 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
4279                 cb->fill(cb, "hierarchical_memory_limit", limit);
4280                 if (do_swap_account)
4281                         cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4282         }
4283
4284         memset(&mystat, 0, sizeof(mystat));
4285         mem_cgroup_get_total_stat(mem_cont, &mystat);
4286         for (i = 0; i < NR_MCS_STAT; i++) {
4287                 if (i == MCS_SWAP && !do_swap_account)
4288                         continue;
4289                 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
4290         }
4291
4292 #ifdef CONFIG_DEBUG_VM
4293         {
4294                 int nid, zid;
4295                 struct mem_cgroup_per_zone *mz;
4296                 unsigned long recent_rotated[2] = {0, 0};
4297                 unsigned long recent_scanned[2] = {0, 0};
4298
4299                 for_each_online_node(nid)
4300                         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4301                                 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
4302
4303                                 recent_rotated[0] +=
4304                                         mz->reclaim_stat.recent_rotated[0];
4305                                 recent_rotated[1] +=
4306                                         mz->reclaim_stat.recent_rotated[1];
4307                                 recent_scanned[0] +=
4308                                         mz->reclaim_stat.recent_scanned[0];
4309                                 recent_scanned[1] +=
4310                                         mz->reclaim_stat.recent_scanned[1];
4311                         }
4312                 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
4313                 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
4314                 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
4315                 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
4316         }
4317 #endif
4318
4319         return 0;
4320 }
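
/*
 * Illustrative example (not part of the original file): a read of
 * memory.stat filled in above produces "name value" pairs, with sizes in
 * bytes and event counters as plain counts (assumed values):
 *
 *	cache 1048576
 *	rss 2097152
 *	mapped_file 524288
 *	pgpgin 1000
 *	pgpgout 500
 *	...
 *	hierarchical_memory_limit 536870912
 *	total_cache 1048576
 *	...
 */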
4321
4322 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4323 {
4324         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4325
4326         return mem_cgroup_swappiness(memcg);
4327 }
4328
4329 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
4330                                        u64 val)
4331 {
4332         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4333         struct mem_cgroup *parent;
4334
4335         if (val > 100)
4336                 return -EINVAL;
4337
4338         if (cgrp->parent == NULL)
4339                 return -EINVAL;
4340
4341         parent = mem_cgroup_from_cont(cgrp->parent);
4342
4343         cgroup_lock();
4344
4345         /* If under hierarchy, only empty-root can set this value */
4346         if ((parent->use_hierarchy) ||
4347             (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4348                 cgroup_unlock();
4349                 return -EINVAL;
4350         }
4351
4352         memcg->swappiness = val;
4353
4354         cgroup_unlock();
4355
4356         return 0;
4357 }
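/*
 * Usage sketch (the mount point is illustrative; the memory controller may
 * be mounted elsewhere):
 *
 *   echo 10 > /sys/fs/cgroup/memory/<group>/memory.swappiness
 *
 * As the checks above show, the value must be 0..100 and cannot be changed
 * for the root cgroup, for a group sitting under a hierarchical parent, or
 * for a hierarchical group that already has children.
 */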
4358
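/*
 * Memory threshold notifications: userspace creates an eventfd and
 * registers it against memory.usage_in_bytes (or memory.memsw.usage_in_bytes)
 * via cgroup.event_control; __mem_cgroup_threshold() below then signals that
 * eventfd whenever usage crosses the registered value in either direction.
 *
 * Rough registration sketch (paths and variable names are illustrative):
 *
 *   efd = eventfd(0, 0);
 *   ufd = open("<memcg dir>/memory.usage_in_bytes", O_RDONLY);
 *   cfd = open("<memcg dir>/cgroup.event_control", O_WRONLY);
 *   dprintf(cfd, "%d %d %llu", efd, ufd, threshold_in_bytes);
 *   read(efd, &count, sizeof(count));    read() blocks until a crossing
 */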
4359 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4360 {
4361         struct mem_cgroup_threshold_ary *t;
4362         u64 usage;
4363         int i;
4364
4365         rcu_read_lock();
4366         if (!swap)
4367                 t = rcu_dereference(memcg->thresholds.primary);
4368         else
4369                 t = rcu_dereference(memcg->memsw_thresholds.primary);
4370
4371         if (!t)
4372                 goto unlock;
4373
4374         usage = mem_cgroup_usage(memcg, swap);
4375
4376         /*
4377          * current_threshold points to threshold just below usage.
4378          * If that is not true, a threshold was crossed after the last
4379          * call of __mem_cgroup_threshold().
4380          */
4381         i = t->current_threshold;
4382
4383         /*
4384          * Iterate backward over array of thresholds starting from
4385          * current_threshold and check if a threshold is crossed.
4386          * If none of the thresholds below usage is crossed, we read
4387          * only one element of the array here.
4388          */
4389         for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4390                 eventfd_signal(t->entries[i].eventfd, 1);
4391
4392         /* i = current_threshold + 1 */
4393         i++;
4394
4395         /*
4396          * Iterate forward over array of thresholds starting from
4397          * current_threshold+1 and check if a threshold is crossed.
4398          * If none of the thresholds above usage is crossed, we read
4399          * only one element of the array here.
4400          */
4401         for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4402                 eventfd_signal(t->entries[i].eventfd, 1);
4403
4404         /* Update current_threshold */
4405         t->current_threshold = i - 1;
4406 unlock:
4407         rcu_read_unlock();
4408 }
4409
4410 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4411 {
4412         while (memcg) {
4413                 __mem_cgroup_threshold(memcg, false);
4414                 if (do_swap_account)
4415                         __mem_cgroup_threshold(memcg, true);
4416
4417                 memcg = parent_mem_cgroup(memcg);
4418         }
4419 }
4420
4421 static int compare_thresholds(const void *a, const void *b)
4422 {
4423         const struct mem_cgroup_threshold *_a = a;
4424         const struct mem_cgroup_threshold *_b = b;
4425
4426         return (_a->threshold > _b->threshold) - (_a->threshold < _b->threshold);
4427 }
4428
4429 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4430 {
4431         struct mem_cgroup_eventfd_list *ev;
4432
4433         list_for_each_entry(ev, &memcg->oom_notify, list)
4434                 eventfd_signal(ev->eventfd, 1);
4435         return 0;
4436 }
4437
4438 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4439 {
4440         struct mem_cgroup *iter;
4441
4442         for_each_mem_cgroup_tree(iter, memcg)
4443                 mem_cgroup_oom_notify_cb(iter);
4444 }
4445
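/*
 * Threshold (un)registration below never modifies the live array that
 * __mem_cgroup_threshold() walks under rcu_read_lock().  A new array is
 * built (old entries copied, the new one added, then sorted), published
 * with rcu_assign_pointer(), and the previous array is kept as ->spare so
 * the unregister path can reuse it without allocating.
 */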
4446 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4447         struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4448 {
4449         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4450         struct mem_cgroup_thresholds *thresholds;
4451         struct mem_cgroup_threshold_ary *new;
4452         int type = MEMFILE_TYPE(cft->private);
4453         u64 threshold, usage;
4454         int i, size, ret;
4455
4456         ret = res_counter_memparse_write_strategy(args, &threshold);
4457         if (ret)
4458                 return ret;
4459
4460         mutex_lock(&memcg->thresholds_lock);
4461
4462         if (type == _MEM)
4463                 thresholds = &memcg->thresholds;
4464         else if (type == _MEMSWAP)
4465                 thresholds = &memcg->memsw_thresholds;
4466         else
4467                 BUG();
4468
4469         usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4470
4471         /* Check if a threshold was crossed before adding a new one */
4472         if (thresholds->primary)
4473                 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4474
4475         size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4476
4477         /* Allocate memory for new array of thresholds */
4478         new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4479                         GFP_KERNEL);
4480         if (!new) {
4481                 ret = -ENOMEM;
4482                 goto unlock;
4483         }
4484         new->size = size;
4485
4486         /* Copy thresholds (if any) to new array */
4487         if (thresholds->primary) {
4488                 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4489                                 sizeof(struct mem_cgroup_threshold));
4490         }
4491
4492         /* Add new threshold */
4493         new->entries[size - 1].eventfd = eventfd;
4494         new->entries[size - 1].threshold = threshold;
4495
4496         /* Sort thresholds. Registering a new threshold isn't time-critical */
4497         sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4498                         compare_thresholds, NULL);
4499
4500         /* Find current threshold */
4501         new->current_threshold = -1;
4502         for (i = 0; i < size; i++) {
4503                 if (new->entries[i].threshold < usage) {
4504                         /*
4505                          * new->current_threshold will not be used until
4506                          * rcu_assign_pointer(), so it's safe to increment
4507                          * it here.
4508                          */
4509                         ++new->current_threshold;
4510                 }
4511         }
4512
4513         /* Free old spare buffer and save old primary buffer as spare */
4514         kfree(thresholds->spare);
4515         thresholds->spare = thresholds->primary;
4516
4517         rcu_assign_pointer(thresholds->primary, new);
4518
4519         /* To be sure that nobody uses thresholds */
4520         synchronize_rcu();
4521
4522 unlock:
4523         mutex_unlock(&memcg->thresholds_lock);
4524
4525         return ret;
4526 }
4527
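/*
 * Unregistration reuses the ->spare buffer saved above, so it never needs
 * to allocate memory and therefore cannot fail (hence the void return).
 */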
4528 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4529         struct cftype *cft, struct eventfd_ctx *eventfd)
4530 {
4531         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4532         struct mem_cgroup_thresholds *thresholds;
4533         struct mem_cgroup_threshold_ary *new;
4534         int type = MEMFILE_TYPE(cft->private);
4535         u64 usage;
4536         int i, j, size;
4537
4538         mutex_lock(&memcg->thresholds_lock);
4539         if (type == _MEM)
4540                 thresholds = &memcg->thresholds;
4541         else if (type == _MEMSWAP)
4542                 thresholds = &memcg->memsw_thresholds;
4543         else
4544                 BUG();
4545
4546         /*
4547          * Something went wrong if we are trying to unregister a threshold
4548          * when we don't have any thresholds
4549          */
4550         BUG_ON(!thresholds->primary);
4551
4552         usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4553
4554         /* Check if a threshold was crossed before removing */
4555         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4556
4557         /* Calculate the new number of thresholds */
4558         size = 0;
4559         for (i = 0; i < thresholds->primary->size; i++) {
4560                 if (thresholds->primary->entries[i].eventfd != eventfd)
4561                         size++;
4562         }
4563
4564         new = thresholds->spare;
4565
4566         /* Set thresholds array to NULL if we don't have thresholds */
4567         if (!size) {
4568                 kfree(new);
4569                 new = NULL;
4570                 goto swap_buffers;
4571         }
4572
4573         new->size = size;
4574
4575         /* Copy thresholds and find current threshold */
4576         new->current_threshold = -1;
4577         for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4578                 if (thresholds->primary->entries[i].eventfd == eventfd)
4579                         continue;
4580
4581                 new->entries[j] = thresholds->primary->entries[i];
4582                 if (new->entries[j].threshold < usage) {
4583                         /*
4584                          * new->current_threshold will not be used
4585                          * until rcu_assign_pointer(), so it's safe to increment
4586                          * it here.
4587                          */
4588                         ++new->current_threshold;
4589                 }
4590                 j++;
4591         }
4592
4593 swap_buffers:
4594         /* Swap primary and spare array */
4595         thresholds->spare = thresholds->primary;
4596         rcu_assign_pointer(thresholds->primary, new);
4597
4598         /* To be sure that nobody uses thresholds */
4599         synchronize_rcu();
4600
4601         mutex_unlock(&memcg->thresholds_lock);
4602 }
4603
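/*
 * OOM notifications use the same cgroup.event_control protocol as the usage
 * thresholds above, except that the companion file descriptor is
 * memory.oom_control and no threshold argument is given; the eventfd is
 * signalled whenever the group enters OOM (and immediately if it already is).
 */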
4604 static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4605         struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4606 {
4607         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4608         struct mem_cgroup_eventfd_list *event;
4609         int type = MEMFILE_TYPE(cft->private);
4610
4611         BUG_ON(type != _OOM_TYPE);
4612         event = kmalloc(sizeof(*event), GFP_KERNEL);
4613         if (!event)
4614                 return -ENOMEM;
4615
4616         spin_lock(&memcg_oom_lock);
4617
4618         event->eventfd = eventfd;
4619         list_add(&event->list, &memcg->oom_notify);
4620
4621         /* already in OOM ? */
4622         if (atomic_read(&memcg->under_oom))
4623                 eventfd_signal(eventfd, 1);
4624         spin_unlock(&memcg_oom_lock);
4625
4626         return 0;
4627 }
4628
4629 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4630         struct cftype *cft, struct eventfd_ctx *eventfd)
4631 {
4632         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4633         struct mem_cgroup_eventfd_list *ev, *tmp;
4634         int type = MEMFILE_TYPE(cft->private);
4635
4636         BUG_ON(type != _OOM_TYPE);
4637
4638         spin_lock(&memcg_oom_lock);
4639
4640         list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4641                 if (ev->eventfd == eventfd) {
4642                         list_del(&ev->list);
4643                         kfree(ev);
4644                 }
4645         }
4646
4647         spin_unlock(&memcg_oom_lock);
4648 }
4649
4650 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4651         struct cftype *cft,  struct cgroup_map_cb *cb)
4652 {
4653         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4654
4655         cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
4656
4657         if (atomic_read(&memcg->under_oom))
4658                 cb->fill(cb, "under_oom", 1);
4659         else
4660                 cb->fill(cb, "under_oom", 0);
4661         return 0;
4662 }
4663
4664 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4665         struct cftype *cft, u64 val)
4666 {
4667         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4668         struct mem_cgroup *parent;
4669
4670         /* cannot be set on the root cgroup, and only 0 and 1 are allowed */
4671         if (!cgrp->parent || !((val == 0) || (val == 1)))
4672                 return -EINVAL;
4673
4674         parent = mem_cgroup_from_cont(cgrp->parent);
4675
4676         cgroup_lock();
4677         /* oom-kill-disable is a flag for subhierarchy. */
4678         if ((parent->use_hierarchy) ||
4679             (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4680                 cgroup_unlock();
4681                 return -EINVAL;
4682         }
4683         memcg->oom_kill_disable = val;
4684         if (!val)
4685                 memcg_oom_recover(memcg);
4686         cgroup_unlock();
4687         return 0;
4688 }
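/*
 * Illustrative example (the path depends on where the controller is mounted):
 *
 *   echo 1 > /sys/fs/cgroup/memory/<group>/memory.oom_control
 *
 * disables the OOM killer for the group; tasks that hit the limit then sleep
 * in the OOM waitqueue until memory is freed or the limit is raised, instead
 * of being killed.
 */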
4689
4690 #ifdef CONFIG_NUMA
4691 static const struct file_operations mem_control_numa_stat_file_operations = {
4692         .read = seq_read,
4693         .llseek = seq_lseek,
4694         .release = single_release,
4695 };
4696
4697 static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4698 {
4699         struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4700
4701         file->f_op = &mem_control_numa_stat_file_operations;
4702         return single_open(file, mem_control_numa_stat_show, cont);
4703 }
4704 #endif /* CONFIG_NUMA */
4705
4706 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
4707 static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4708 {
4709         /*
4710          * Part of this would be better living in a separate allocation
4711          * function, leaving us with just the cgroup tree population work.
4712          * We, however, depend on state such as network's proto_list that
4713          * is only initialized after cgroup creation. I found the least
4714          * cumbersome way to deal with it is to defer it all to populate time.
4715          */
4716         return mem_cgroup_sockets_init(cont, ss);
4717 }
4718
4719 static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
4720                                 struct cgroup *cont)
4721 {
4722         mem_cgroup_sockets_destroy(cont, ss);
4723 }
4724 #else
4725 static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4726 {
4727         return 0;
4728 }
4729
4730 static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
4731                                 struct cgroup *cont)
4732 {
4733 }
4734 #endif
4735
4736 static struct cftype mem_cgroup_files[] = {
4737         {
4738                 .name = "usage_in_bytes",
4739                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4740                 .read_u64 = mem_cgroup_read,
4741                 .register_event = mem_cgroup_usage_register_event,
4742                 .unregister_event = mem_cgroup_usage_unregister_event,
4743         },
4744         {
4745                 .name = "max_usage_in_bytes",
4746                 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4747                 .trigger = mem_cgroup_reset,
4748                 .read_u64 = mem_cgroup_read,
4749         },
4750         {
4751                 .name = "limit_in_bytes",
4752                 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4753                 .write_string = mem_cgroup_write,
4754                 .read_u64 = mem_cgroup_read,
4755         },
4756         {
4757                 .name = "soft_limit_in_bytes",
4758                 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4759                 .write_string = mem_cgroup_write,
4760                 .read_u64 = mem_cgroup_read,
4761         },
4762         {
4763                 .name = "failcnt",
4764                 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4765                 .trigger = mem_cgroup_reset,
4766                 .read_u64 = mem_cgroup_read,
4767         },
4768         {
4769                 .name = "stat",
4770                 .read_map = mem_control_stat_show,
4771         },
4772         {
4773                 .name = "force_empty",
4774                 .trigger = mem_cgroup_force_empty_write,
4775         },
4776         {
4777                 .name = "use_hierarchy",
4778                 .write_u64 = mem_cgroup_hierarchy_write,
4779                 .read_u64 = mem_cgroup_hierarchy_read,
4780         },
4781         {
4782                 .name = "swappiness",
4783                 .read_u64 = mem_cgroup_swappiness_read,
4784                 .write_u64 = mem_cgroup_swappiness_write,
4785         },
4786         {
4787                 .name = "move_charge_at_immigrate",
4788                 .read_u64 = mem_cgroup_move_charge_read,
4789                 .write_u64 = mem_cgroup_move_charge_write,
4790         },
4791         {
4792                 .name = "oom_control",
4793                 .read_map = mem_cgroup_oom_control_read,
4794                 .write_u64 = mem_cgroup_oom_control_write,
4795                 .register_event = mem_cgroup_oom_register_event,
4796                 .unregister_event = mem_cgroup_oom_unregister_event,
4797                 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4798         },
4799 #ifdef CONFIG_NUMA
4800         {
4801                 .name = "numa_stat",
4802                 .open = mem_control_numa_stat_open,
4803                 .mode = S_IRUGO,
4804         },
4805 #endif
4806 };
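/*
 * Each cftype above shows up as a "memory.<name>" file in every cgroup
 * directory once mem_cgroup_populate() registers the array; the memsw.*
 * files below are added only when swap accounting is enabled, and numa_stat
 * only with CONFIG_NUMA.
 */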
4807
4808 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4809 static struct cftype memsw_cgroup_files[] = {
4810         {
4811                 .name = "memsw.usage_in_bytes",
4812                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4813                 .read_u64 = mem_cgroup_read,
4814                 .register_event = mem_cgroup_usage_register_event,
4815                 .unregister_event = mem_cgroup_usage_unregister_event,
4816         },
4817         {
4818                 .name = "memsw.max_usage_in_bytes",
4819                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4820                 .trigger = mem_cgroup_reset,
4821                 .read_u64 = mem_cgroup_read,
4822         },
4823         {
4824                 .name = "memsw.limit_in_bytes",
4825                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4826                 .write_string = mem_cgroup_write,
4827                 .read_u64 = mem_cgroup_read,
4828         },
4829         {
4830                 .name = "memsw.failcnt",
4831                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4832                 .trigger = mem_cgroup_reset,
4833                 .read_u64 = mem_cgroup_read,
4834         },
4835 };
4836
4837 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4838 {
4839         if (!do_swap_account)
4840                 return 0;
4841         return cgroup_add_files(cont, ss, memsw_cgroup_files,
4842                                 ARRAY_SIZE(memsw_cgroup_files));
4843 }
4844 #else
4845 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4846 {
4847         return 0;
4848 }
4849 #endif
4850
4851 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4852 {
4853         struct mem_cgroup_per_node *pn;
4854         struct mem_cgroup_per_zone *mz;
4855         enum lru_list l;
4856         int zone, tmp = node;
4857         /*
4858          * This routine is called against possible nodes.
4859          * But it is a BUG to call kmalloc() against an offline node.
4860          *
4861          * TODO: this routine can waste a lot of memory for nodes which will
4862          *       never be onlined. It would be better to use a memory hotplug
4863          *       callback function.
4864          */
4865         if (!node_state(node, N_NORMAL_MEMORY))
4866                 tmp = -1;
4867         pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4868         if (!pn)
4869                 return 1;
4870
4871         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4872                 mz = &pn->zoneinfo[zone];
4873                 for_each_lru(l)
4874                         INIT_LIST_HEAD(&mz->lists[l]);
4875                 mz->usage_in_excess = 0;
4876                 mz->on_tree = false;
4877                 mz->mem = memcg;
4878         }
4879         memcg->info.nodeinfo[node] = pn;
4880         return 0;
4881 }
4882
4883 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4884 {
4885         kfree(memcg->info.nodeinfo[node]);
4886 }
4887
4888 static struct mem_cgroup *mem_cgroup_alloc(void)
4889 {
4890         struct mem_cgroup *mem;
4891         int size = sizeof(struct mem_cgroup);
4892
4893         /* Can be very big if MAX_NUMNODES is very big */
4894         if (size < PAGE_SIZE)
4895                 mem = kzalloc(size, GFP_KERNEL);
4896         else
4897                 mem = vzalloc(size);
4898
4899         if (!mem)
4900                 return NULL;
4901
4902         mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4903         if (!mem->stat)
4904                 goto out_free;
4905         spin_lock_init(&mem->pcp_counter_lock);
4906         return mem;
4907
4908 out_free:
4909         if (size < PAGE_SIZE)
4910                 kfree(mem);
4911         else
4912                 vfree(mem);
4913         return NULL;
4914 }
4915
4916 /*
4917  * When destroying a mem_cgroup, references from swap_cgroup can remain.
4918  * (scanning them all at force_empty would be too costly...)
4919  *
4920  * Instead of clearing all references at force_empty, we remember
4921  * the number of references from swap_cgroup and free the mem_cgroup
4922  * when it goes down to 0.
4923  *
4924  * Removal of the cgroup itself succeeds regardless of refs from swap.
4925  */
4926
4927 static void __mem_cgroup_free(struct mem_cgroup *memcg)
4928 {
4929         int node;
4930
4931         mem_cgroup_remove_from_trees(memcg);
4932         free_css_id(&mem_cgroup_subsys, &memcg->css);
4933
4934         for_each_node_state(node, N_POSSIBLE)
4935                 free_mem_cgroup_per_zone_info(memcg, node);
4936
4937         free_percpu(memcg->stat);
4938         if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4939                 kfree(memcg);
4940         else
4941                 vfree(memcg);
4942 }
4943
4944 static void mem_cgroup_get(struct mem_cgroup *memcg)
4945 {
4946         atomic_inc(&memcg->refcnt);
4947 }
4948
4949 static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
4950 {
4951         if (atomic_sub_and_test(count, &memcg->refcnt)) {
4952                 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4953                 __mem_cgroup_free(memcg);
4954                 if (parent)
4955                         mem_cgroup_put(parent);
4956         }
4957 }
4958
4959 static void mem_cgroup_put(struct mem_cgroup *memcg)
4960 {
4961         __mem_cgroup_put(memcg, 1);
4962 }
4963
4964 /*
4965  * Returns the parent mem_cgroup in the memcg hierarchy with use_hierarchy enabled.
4966  */
4967 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4968 {
4969         if (!memcg->res.parent)
4970                 return NULL;
4971         return mem_cgroup_from_res_counter(memcg->res.parent, res);
4972 }
4973 EXPORT_SYMBOL(parent_mem_cgroup);
4974
4975 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4976 static void __init enable_swap_cgroup(void)
4977 {
4978         if (!mem_cgroup_disabled() && really_do_swap_account)
4979                 do_swap_account = 1;
4980 }
4981 #else
4982 static void __init enable_swap_cgroup(void)
4983 {
4984 }
4985 #endif
4986
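/*
 * Soft limit reclaim keeps one red-black tree per (node, zone) pair, keyed
 * by how far each memcg's usage exceeds its soft limit, so the worst
 * offender can be picked cheaply.  This init routine only allocates the
 * empty trees and their locks; memcgs are inserted later as they exceed
 * their soft limits.
 */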
4987 static int mem_cgroup_soft_limit_tree_init(void)
4988 {
4989         struct mem_cgroup_tree_per_node *rtpn;
4990         struct mem_cgroup_tree_per_zone *rtpz;
4991         int tmp, node, zone;
4992
4993         for_each_node_state(node, N_POSSIBLE) {
4994                 tmp = node;
4995                 if (!node_state(node, N_NORMAL_MEMORY))
4996                         tmp = -1;
4997                 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4998                 if (!rtpn)
4999                         return 1;
5000
5001                 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5002
5003                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5004                         rtpz = &rtpn->rb_tree_per_zone[zone];
5005                         rtpz->rb_root = RB_ROOT;
5006                         spin_lock_init(&rtpz->lock);
5007                 }
5008         }
5009         return 0;
5010 }
5011
5012 static struct cgroup_subsys_state * __ref
5013 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5014 {
5015         struct mem_cgroup *memcg, *parent;
5016         long error = -ENOMEM;
5017         int node;
5018
5019         memcg = mem_cgroup_alloc();
5020         if (!memcg)
5021                 return ERR_PTR(error);
5022
5023         for_each_node_state(node, N_POSSIBLE)
5024                 if (alloc_mem_cgroup_per_zone_info(memcg, node))
5025                         goto free_out;
5026
5027         /* root ? */
5028         if (cont->parent == NULL) {
5029                 int cpu;
5030                 enable_swap_cgroup();
5031                 parent = NULL;
5032                 if (mem_cgroup_soft_limit_tree_init())
5033                         goto free_out;
5034                 root_mem_cgroup = memcg;
5035                 for_each_possible_cpu(cpu) {
5036                         struct memcg_stock_pcp *stock =
5037                                                 &per_cpu(memcg_stock, cpu);
5038                         INIT_WORK(&stock->work, drain_local_stock);
5039                 }
5040                 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5041         } else {
5042                 parent = mem_cgroup_from_cont(cont->parent);
5043                 memcg->use_hierarchy = parent->use_hierarchy;
5044                 memcg->oom_kill_disable = parent->oom_kill_disable;
5045         }
5046
5047         if (parent && parent->use_hierarchy) {
5048                 res_counter_init(&memcg->res, &parent->res);
5049                 res_counter_init(&memcg->memsw, &parent->memsw);
5050                 /*
5051                  * We increment refcnt of the parent to ensure that we can
5052                  * safely access it on res_counter_charge/uncharge.
5053                  * This refcnt will be decremented when freeing this
5054                  * mem_cgroup(see mem_cgroup_put).
5055                  */
5056                 mem_cgroup_get(parent);
5057         } else {
5058                 res_counter_init(&memcg->res, NULL);
5059                 res_counter_init(&memcg->memsw, NULL);
5060         }
5061         memcg->last_scanned_node = MAX_NUMNODES;
5062         INIT_LIST_HEAD(&memcg->oom_notify);
5063
5064         if (parent)
5065                 memcg->swappiness = mem_cgroup_swappiness(parent);
5066         atomic_set(&memcg->refcnt, 1);
5067         memcg->move_charge_at_immigrate = 0;
5068         mutex_init(&memcg->thresholds_lock);
5069         return &memcg->css;
5070 free_out:
5071         __mem_cgroup_free(memcg);
5072         return ERR_PTR(error);
5073 }
5074
5075 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
5076                                         struct cgroup *cont)
5077 {
5078         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5079
5080         return mem_cgroup_force_empty(memcg, false);
5081 }
5082
5083 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
5084                                 struct cgroup *cont)
5085 {
5086         struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5087
5088         kmem_cgroup_destroy(ss, cont);
5089
5090         mem_cgroup_put(memcg);
5091 }
5092
5093 static int mem_cgroup_populate(struct cgroup_subsys *ss,
5094                                 struct cgroup *cont)
5095 {
5096         int ret;
5097
5098         ret = cgroup_add_files(cont, ss, mem_cgroup_files,
5099                                 ARRAY_SIZE(mem_cgroup_files));
5100
5101         if (!ret)
5102                 ret = register_memsw_files(cont, ss);
5103
5104         if (!ret)
5105                 ret = register_kmem_files(cont, ss);
5106
5107         return ret;
5108 }
5109
5110 #ifdef CONFIG_MMU
5111 /* Handlers for move charge at task migration. */
5112 #define PRECHARGE_COUNT_AT_ONCE 256
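/*
 * Charge moving is opt-in per destination group: writing to
 * memory.move_charge_at_immigrate selects what is moved when a task
 * migrates in (bit 0: anonymous pages and their swap, bit 1: file pages),
 * e.g. "echo 3 > memory.move_charge_at_immigrate" moves both (the path is
 * left out on purpose; it depends on the mount point).  Only charges of
 * the mm owner are moved, as mem_cgroup_can_attach() below checks.
 */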
5113 static int mem_cgroup_do_precharge(unsigned long count)
5114 {
5115         int ret = 0;
5116         int batch_count = PRECHARGE_COUNT_AT_ONCE;
5117         struct mem_cgroup *memcg = mc.to;
5118
5119         if (mem_cgroup_is_root(memcg)) {
5120                 mc.precharge += count;
5121                 /* we don't need css_get for root */
5122                 return ret;
5123         }
5124         /* try to charge at once */
5125         if (count > 1) {
5126                 struct res_counter *dummy;
5127                 /*
5128                  * "memcg" cannot be under rmdir() because we've already checked
5129                  * by cgroup_lock_live_cgroup() that it is not removed and we
5130                  * are still under the same cgroup_mutex. So we can postpone
5131                  * css_get().
5132                  */
5133                 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
5134                         goto one_by_one;
5135                 if (do_swap_account && res_counter_charge(&memcg->memsw,
5136                                                 PAGE_SIZE * count, &dummy)) {
5137                         res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
5138                         goto one_by_one;
5139                 }
5140                 mc.precharge += count;
5141                 return ret;
5142         }
5143 one_by_one:
5144         /* fall back to one by one charge */
5145         while (count--) {
5146                 if (signal_pending(current)) {
5147                         ret = -EINTR;
5148                         break;
5149                 }
5150                 if (!batch_count--) {
5151                         batch_count = PRECHARGE_COUNT_AT_ONCE;
5152                         cond_resched();
5153                 }
5154                 ret = __mem_cgroup_try_charge(NULL,
5155                                         GFP_KERNEL, 1, &memcg, false);
5156                 if (ret || !memcg)
5157                         /* mem_cgroup_clear_mc() will do uncharge later */
5158                         return -ENOMEM;
5159                 mc.precharge++;
5160         }
5161         return ret;
5162 }
5163
5164 /**
5165  * is_target_pte_for_mc - check whether a pte is a valid target for move charge
5166  * @vma: the vma the pte to be checked belongs to
5167  * @addr: the address corresponding to the pte to be checked
5168  * @ptent: the pte to be checked
5169  * @target: the pointer where the target page or swap entry will be stored (can be NULL)
5170  *
5171  * Returns
5172  *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
5173  *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
5174  *     move charge. If @target is not NULL, the page is stored in target->page
5175  *     with an extra refcount taken (callers should handle it).
5176  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
5177  *     target for charge migration. If @target is not NULL, the entry is stored
5178  *     in target->ent.
5179  *
5180  * Called with pte lock held.
5181  */
5182 union mc_target {
5183         struct page     *page;
5184         swp_entry_t     ent;
5185 };
5186
5187 enum mc_target_type {
5188         MC_TARGET_NONE, /* not used */
5189         MC_TARGET_PAGE,
5190         MC_TARGET_SWAP,
5191 };
5192
5193 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5194                                                 unsigned long addr, pte_t ptent)
5195 {
5196         struct page *page = vm_normal_page(vma, addr, ptent);
5197
5198         if (!page || !page_mapped(page))
5199                 return NULL;
5200         if (PageAnon(page)) {
5201                 /* we don't move shared anon */
5202                 if (!move_anon() || page_mapcount(page) > 2)
5203                         return NULL;
5204         } else if (!move_file())
5205                 /* we ignore mapcount for file pages */
5206                 return NULL;
5207         if (!get_page_unless_zero(page))
5208                 return NULL;
5209
5210         return page;
5211 }
5212
5213 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5214                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
5215 {
5216         int usage_count;
5217         struct page *page = NULL;
5218         swp_entry_t ent = pte_to_swp_entry(ptent);
5219
5220         if (!move_anon() || non_swap_entry(ent))
5221                 return NULL;
5222         usage_count = mem_cgroup_count_swap_user(ent, &page);
5223         if (usage_count > 1) { /* we don't move shared anon */
5224                 if (page)
5225                         put_page(page);
5226                 return NULL;
5227         }
5228         if (do_swap_account)
5229                 entry->val = ent.val;
5230
5231         return page;
5232 }
5233
5234 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5235                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
5236 {
5237         struct page *page = NULL;
5238         struct inode *inode;
5239         struct address_space *mapping;
5240         pgoff_t pgoff;
5241
5242         if (!vma->vm_file) /* anonymous vma */
5243                 return NULL;
5244         if (!move_file())
5245                 return NULL;
5246
5247         inode = vma->vm_file->f_path.dentry->d_inode;
5248         mapping = vma->vm_file->f_mapping;
5249         if (pte_none(ptent))
5250                 pgoff = linear_page_index(vma, addr);
5251         else /* pte_file(ptent) is true */
5252                 pgoff = pte_to_pgoff(ptent);
5253
5254         /* The page is moved even if it's not RSS of this task (i.e. not page-faulted). */
5255         page = find_get_page(mapping, pgoff);
5256
5257 #ifdef CONFIG_SWAP
5258         /* shmem/tmpfs may report page out on swap: account for that too. */
5259         if (radix_tree_exceptional_entry(page)) {
5260                 swp_entry_t swap = radix_to_swp_entry(page);
5261                 if (do_swap_account)
5262                         *entry = swap;
5263                 page = find_get_page(&swapper_space, swap.val);
5264         }
5265 #endif
5266         return page;
5267 }
5268
5269 static int is_target_pte_for_mc(struct vm_area_struct *vma,
5270                 unsigned long addr, pte_t ptent, union mc_target *target)
5271 {
5272         struct page *page = NULL;
5273         struct page_cgroup *pc;
5274         int ret = 0;
5275         swp_entry_t ent = { .val = 0 };
5276
5277         if (pte_present(ptent))
5278                 page = mc_handle_present_pte(vma, addr, ptent);
5279         else if (is_swap_pte(ptent))
5280                 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
5281         else if (pte_none(ptent) || pte_file(ptent))
5282                 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5283
5284         if (!page && !ent.val)
5285                 return 0;
5286         if (page) {
5287                 pc = lookup_page_cgroup(page);
5288                 /*
5289                  * Do only a loose check without the page_cgroup lock;
5290                  * mem_cgroup_move_account() checks whether the pc is valid
5291                  * under the lock.
5292                  */
5293                 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5294                         ret = MC_TARGET_PAGE;
5295                         if (target)
5296                                 target->page = page;
5297                 }
5298                 if (!ret || !target)
5299                         put_page(page);
5300         }
5301         /* There is a swap entry and a page doesn't exist or isn't charged */
5302         if (ent.val && !ret &&
5303                         css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
5304                 ret = MC_TARGET_SWAP;
5305                 if (target)
5306                         target->ent = ent;
5307         }
5308         return ret;
5309 }
5310
5311 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5312                                         unsigned long addr, unsigned long end,
5313                                         struct mm_walk *walk)
5314 {
5315         struct vm_area_struct *vma = walk->private;
5316         pte_t *pte;
5317         spinlock_t *ptl;
5318
5319         split_huge_page_pmd(walk->mm, pmd);
5320
5321         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5322         for (; addr != end; pte++, addr += PAGE_SIZE)
5323                 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
5324                         mc.precharge++; /* increment precharge temporarily */
5325         pte_unmap_unlock(pte - 1, ptl);
5326         cond_resched();
5327
5328         return 0;
5329 }
5330
5331 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5332 {
5333         unsigned long precharge;
5334         struct vm_area_struct *vma;
5335
5336         down_read(&mm->mmap_sem);
5337         for (vma = mm->mmap; vma; vma = vma->vm_next) {
5338                 struct mm_walk mem_cgroup_count_precharge_walk = {
5339                         .pmd_entry = mem_cgroup_count_precharge_pte_range,
5340                         .mm = mm,
5341                         .private = vma,
5342                 };
5343                 if (is_vm_hugetlb_page(vma))
5344                         continue;
5345                 walk_page_range(vma->vm_start, vma->vm_end,
5346                                         &mem_cgroup_count_precharge_walk);
5347         }
5348         up_read(&mm->mmap_sem);
5349
5350         precharge = mc.precharge;
5351         mc.precharge = 0;
5352
5353         return precharge;
5354 }
5355
5356 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5357 {
5358         unsigned long precharge = mem_cgroup_count_precharge(mm);
5359
5360         VM_BUG_ON(mc.moving_task);
5361         mc.moving_task = current;
5362         return mem_cgroup_do_precharge(precharge);
5363 }
5364
5365 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5366 static void __mem_cgroup_clear_mc(void)
5367 {
5368         struct mem_cgroup *from = mc.from;
5369         struct mem_cgroup *to = mc.to;
5370
5371         /* we must uncharge all the leftover precharges from mc.to */
5372         if (mc.precharge) {
5373                 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
5374                 mc.precharge = 0;
5375         }
5376         /*
5377          * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5378          * we must uncharge here.
5379          */
5380         if (mc.moved_charge) {
5381                 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
5382                 mc.moved_charge = 0;
5383         }
5384         /* we must fixup refcnts and charges */
5385         if (mc.moved_swap) {
5386                 /* uncharge swap account from the old cgroup */
5387                 if (!mem_cgroup_is_root(mc.from))
5388                         res_counter_uncharge(&mc.from->memsw,
5389                                                 PAGE_SIZE * mc.moved_swap);
5390                 __mem_cgroup_put(mc.from, mc.moved_swap);
5391
5392                 if (!mem_cgroup_is_root(mc.to)) {
5393                         /*
5394                          * we charged both to->res and to->memsw, so we should
5395                          * uncharge to->res.
5396                          */
5397                         res_counter_uncharge(&mc.to->res,
5398                                                 PAGE_SIZE * mc.moved_swap);
5399                 }
5400                 /* we've already done mem_cgroup_get(mc.to) */
5401                 mc.moved_swap = 0;
5402         }
5403         memcg_oom_recover(from);
5404         memcg_oom_recover(to);
5405         wake_up_all(&mc.waitq);
5406 }
5407
5408 static void mem_cgroup_clear_mc(void)
5409 {
5410         struct mem_cgroup *from = mc.from;
5411
5412         /*
5413          * we must clear moving_task before waking up waiters at the end of
5414          * task migration.
5415          */
5416         mc.moving_task = NULL;
5417         __mem_cgroup_clear_mc();
5418         spin_lock(&mc.lock);
5419         mc.from = NULL;
5420         mc.to = NULL;
5421         spin_unlock(&mc.lock);
5422         mem_cgroup_end_move(from);
5423 }
5424
5425 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5426                                 struct cgroup *cgroup,
5427                                 struct cgroup_taskset *tset)
5428 {
5429         struct task_struct *p = cgroup_taskset_first(tset);
5430         int ret = 0;
5431         struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
5432
5433         if (memcg->move_charge_at_immigrate) {
5434                 struct mm_struct *mm;
5435                 struct mem_cgroup *from = mem_cgroup_from_task(p);
5436
5437                 VM_BUG_ON(from == memcg);
5438
5439                 mm = get_task_mm(p);
5440                 if (!mm)
5441                         return 0;
5442                 /* We move charges only when we move the owner of the mm */
5443                 if (mm->owner == p) {
5444                         VM_BUG_ON(mc.from);
5445                         VM_BUG_ON(mc.to);
5446                         VM_BUG_ON(mc.precharge);
5447                         VM_BUG_ON(mc.moved_charge);
5448                         VM_BUG_ON(mc.moved_swap);
5449                         mem_cgroup_start_move(from);
5450                         spin_lock(&mc.lock);
5451                         mc.from = from;
5452                         mc.to = memcg;
5453                         spin_unlock(&mc.lock);
5454                         /* We set mc.moving_task later */
5455
5456                         ret = mem_cgroup_precharge_mc(mm);
5457                         if (ret)
5458                                 mem_cgroup_clear_mc();
5459                 }
5460                 mmput(mm);
5461         }
5462         return ret;
5463 }
5464
5465 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5466                                 struct cgroup *cgroup,
5467                                 struct cgroup_taskset *tset)
5468 {
5469         mem_cgroup_clear_mc();
5470 }
5471
5472 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5473                                 unsigned long addr, unsigned long end,
5474                                 struct mm_walk *walk)
5475 {
5476         int ret = 0;
5477         struct vm_area_struct *vma = walk->private;
5478         pte_t *pte;
5479         spinlock_t *ptl;
5480
5481         split_huge_page_pmd(walk->mm, pmd);
5482 retry:
5483         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5484         for (; addr != end; addr += PAGE_SIZE) {
5485                 pte_t ptent = *(pte++);
5486                 union mc_target target;
5487                 int type;
5488                 struct page *page;
5489                 struct page_cgroup *pc;
5490                 swp_entry_t ent;
5491
5492                 if (!mc.precharge)
5493                         break;
5494
5495                 type = is_target_pte_for_mc(vma, addr, ptent, &target);
5496                 switch (type) {
5497                 case MC_TARGET_PAGE:
5498                         page = target.page;
5499                         if (isolate_lru_page(page))
5500                                 goto put;
5501                         pc = lookup_page_cgroup(page);
5502                         if (!mem_cgroup_move_account(page, 1, pc,
5503                                                      mc.from, mc.to, false)) {
5504                                 mc.precharge--;
5505                                 /* we uncharge from mc.from later. */
5506                                 mc.moved_charge++;
5507                         }
5508                         putback_lru_page(page);
5509 put:                    /* is_target_pte_for_mc() gets the page */
5510                         put_page(page);
5511                         break;
5512                 case MC_TARGET_SWAP:
5513                         ent = target.ent;
5514                         if (!mem_cgroup_move_swap_account(ent,
5515                                                 mc.from, mc.to, false)) {
5516                                 mc.precharge--;
5517                                 /* we fixup refcnts and charges later. */
5518                                 mc.moved_swap++;
5519                         }
5520                         break;
5521                 default:
5522                         break;
5523                 }
5524         }
5525         pte_unmap_unlock(pte - 1, ptl);
5526         cond_resched();
5527
5528         if (addr != end) {
5529                 /*
5530                  * We have consumed all precharges we got in can_attach().
5531                  * We try to charge one by one, but don't do any additional
5532                  * charges to mc.to once we have failed to charge during the
5533                  * attach() phase.
5534                  */
5535                 ret = mem_cgroup_do_precharge(1);
5536                 if (!ret)
5537                         goto retry;
5538         }
5539
5540         return ret;
5541 }
5542
5543 static void mem_cgroup_move_charge(struct mm_struct *mm)
5544 {
5545         struct vm_area_struct *vma;
5546
5547         lru_add_drain_all();
5548 retry:
5549         if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5550                 /*
5551                  * Someone who is holding the mmap_sem might be waiting on the
5552                  * waitq. So we cancel all extra charges, wake up all waiters,
5553                  * and retry. Because we cancel precharges, we might not be able
5554                  * to move enough charges, but moving charge is a best-effort
5555                  * feature anyway, so it wouldn't be a big problem.
5556                  */
5557                 __mem_cgroup_clear_mc();
5558                 cond_resched();
5559                 goto retry;
5560         }
5561         for (vma = mm->mmap; vma; vma = vma->vm_next) {
5562                 int ret;
5563                 struct mm_walk mem_cgroup_move_charge_walk = {
5564                         .pmd_entry = mem_cgroup_move_charge_pte_range,
5565                         .mm = mm,
5566                         .private = vma,
5567                 };
5568                 if (is_vm_hugetlb_page(vma))
5569                         continue;
5570                 ret = walk_page_range(vma->vm_start, vma->vm_end,
5571                                                 &mem_cgroup_move_charge_walk);
5572                 if (ret)
5573                         /*
5574                          * A non-zero return means we have consumed all precharges
5575                          * and failed to do an additional charge. Just abandon here.
5576                          */
5577                         break;
5578         }
5579         up_read(&mm->mmap_sem);
5580 }
5581
5582 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5583                                 struct cgroup *cont,
5584                                 struct cgroup_taskset *tset)
5585 {
5586         struct task_struct *p = cgroup_taskset_first(tset);
5587         struct mm_struct *mm = get_task_mm(p);
5588
5589         if (mm) {
5590                 if (mc.to)
5591                         mem_cgroup_move_charge(mm);
5592                 put_swap_token(mm);
5593                 mmput(mm);
5594         }
5595         if (mc.to)
5596                 mem_cgroup_clear_mc();
5597 }
5598 #else   /* !CONFIG_MMU */
5599 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5600                                 struct cgroup *cgroup,
5601                                 struct cgroup_taskset *tset)
5602 {
5603         return 0;
5604 }
5605 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5606                                 struct cgroup *cgroup,
5607                                 struct cgroup_taskset *tset)
5608 {
5609 }
5610 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5611                                 struct cgroup *cont,
5612                                 struct cgroup_taskset *tset)
5613 {
5614 }
5615 #endif
5616
5617 struct cgroup_subsys mem_cgroup_subsys = {
5618         .name = "memory",
5619         .subsys_id = mem_cgroup_subsys_id,
5620         .create = mem_cgroup_create,
5621         .pre_destroy = mem_cgroup_pre_destroy,
5622         .destroy = mem_cgroup_destroy,
5623         .populate = mem_cgroup_populate,
5624         .can_attach = mem_cgroup_can_attach,
5625         .cancel_attach = mem_cgroup_cancel_attach,
5626         .attach = mem_cgroup_move_task,
5627         .early_init = 0,
5628         .use_id = 1,
5629 };
5630
5631 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5632 static int __init enable_swap_account(char *s)
5633 {
5634         /* "1" enables, "0" disables; anything else keeps the build-time default */
5635         if (!strcmp(s, "1"))
5636                 really_do_swap_account = 1;
5637         else if (!strcmp(s, "0"))
5638                 really_do_swap_account = 0;
5639         return 1;
5640 }
5641 __setup("swapaccount=", enable_swap_account);
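/*
 * Boot-time override: "swapaccount=1" on the kernel command line enables
 * swap accounting and "swapaccount=0" disables it, overriding the
 * CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED default stored in
 * really_do_swap_account.
 */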
5642
5643 #endif